mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-10-24 18:42:32 +02:00 
			
		
		
		
	Compare commits
	
		
			484 Commits
		
	
	
		
			v2.4.5
			...
			fix/exerci
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | dd2fedf1a2 | ||
|   | 77a23265a9 | ||
|   | 4c230ef6dd | ||
|   | b305e1ce23 | ||
|   | bdf17f5c87 | ||
|   | 77fce7daf8 | ||
|   | 653bf139f0 | ||
|   | 3f60638d33 | ||
|   | b97b6fae6b | ||
|   | 477234ad0d | ||
|   | 63f25277b0 | ||
|   | c8eff04ae0 | ||
|   | edc482cdf4 | ||
|   | 72cd0f77e2 | ||
|   | be175f9347 | ||
|   | ba2833dba5 | ||
|   | 2f0e792670 | ||
|   | 5f88539f7e | ||
|   | bd9d7efe64 | ||
|   | 16a2dd5b15 | ||
|   | 678283d341 | ||
|   | 287173b0b1 | ||
|   | 712217e959 | ||
|   | 6dda4c55a8 | ||
|   | 596b6a7688 | ||
|   | 5983200247 | ||
|   | 26e802d88b | ||
|   | f5c4e82816 | ||
|   | f5273f7ca0 | ||
|   | fa71a9f44f | ||
|   | 81d6ff53c4 | ||
|   | d7a2b6e019 | ||
|   | 71c65e89d1 | ||
|   | c1046498e7 | ||
|   | 8fbd1978af | ||
|   | 739dd95850 | ||
|   | c54c3bcfa1 | ||
|   | d7f2229978 | ||
|   | 52fdeae752 | ||
|   | f9bb2e41cf | ||
|   | 4f9e2ab48d | ||
|   | 19beb8f07b | ||
|   | c897d9e2f5 | ||
|   | 21a266e302 | ||
|   | b29b6f93f8 | ||
|   | 318226d7cb | ||
|   | 422cf05f15 | ||
|   | 819c6673c7 | ||
|   | 89b44c69a7 | ||
|   | 4b4f72b2ca | ||
|   | 778517d8c6 | ||
|   | 428b0179fc | ||
|   | ade6309dd9 | ||
|   | fd6cb7b966 | ||
|   | 5c87517ceb | ||
|   | b01f093474 | ||
|   | 3a05b90525 | ||
|   | 7a00f73e0e | ||
|   | 5d0621420e | ||
|   | df98153169 | ||
|   | fc1f68ccd9 | ||
|   | 3e831c7e23 | ||
|   | bbcfe9c8dd | ||
|   | eb01aa86cb | ||
|   | 3db186a978 | ||
|   | 4a5959fd58 | ||
|   | 1cbc2b717a | ||
|   | da627ff929 | ||
|   | c1b592ac29 | ||
|   | eb0c956d32 | ||
|   | ab0cb2d956 | ||
|   | a117126389 | ||
|   | e9f8901520 | ||
|   | 266812f90e | ||
|   | 533bc27439 | ||
|   | 0113a0ca10 | ||
|   | 40f8a05ad6 | ||
|   | 50b50513c6 | ||
|   | df3514cd03 | ||
|   | ad53185247 | ||
|   | 87b67e9271 | ||
|   | b54b3b979c | ||
|   | 2184ac8040 | ||
|   | b3d412360b | ||
|   | dbc2553b11 | ||
|   | 68c398f1fe | ||
|   | 123a57beec | ||
|   | d204dac8ce | ||
|   | 443f7fe839 | ||
|   | 0294ceb7d5 | ||
|   | 6f30c6583d | ||
|   | 467fc526e8 | ||
|   | 722d2eb393 | ||
|   | 6d44aac278 | ||
|   | 55a2de6b88 | ||
|   | c0d6d8b229 | ||
|   | 635caa765d | ||
|   | e69b55b349 | ||
|   | 07200bbde5 | ||
|   | c020cccc64 | ||
|   | 259cfc20cc | ||
|   | 37b51a66d8 | ||
|   | f47d2f11d8 | ||
|   | 1b6be6bd79 | ||
|   | e1430e6298 | ||
|   | 5fdd40204b | ||
|   | fb4631ba18 | ||
|   | d72fc2760b | ||
|   | 4a51aaa4f5 | ||
|   | 66a5b1ba02 | ||
|   | aa5a3a10bc | ||
|   | d9b111cec2 | ||
|   | 345f52a1f6 | ||
|   | ed24366aba | ||
|   | 46fb782798 | ||
|   | 846c29aee1 | ||
|   | a5015fe9b1 | ||
|   | 616b0480f7 | ||
|   | 2f0e04ce13 | ||
|   | bcc537468c | ||
|   | 694ffb4d77 | ||
|   | af2cc1169a | ||
|   | bc3fa36637 | ||
|   | afbd03f777 | ||
|   | b8fe25c580 | ||
|   | a241672726 | ||
|   | a8f76e9be7 | ||
|   | b56475450d | ||
|   | aa74604d29 | ||
|   | d2e6d91880 | ||
|   | 602044ff1b | ||
|   | 31631fb409 | ||
|   | 00db348218 | ||
|   | a709280cbf | ||
|   | a99ddaa0cc | ||
|   | ba3d299c05 | ||
|   | 07a21f80a6 | ||
|   | f17b9b68f4 | ||
|   | a2831fbea2 | ||
|   | da72863b47 | ||
|   | 86e2e226dc | ||
|   | 7872fe5221 | ||
|   | 86947e4874 | ||
|   | 4f022e2d19 | ||
|   | f47e7374d2 | ||
|   | 57ec51e95a | ||
|   | 0045124a4e | ||
|   | 9618aae83b | ||
|   | 33453ede2d | ||
|   | e467b38d73 | ||
|   | e9d2d05030 | ||
|   | 4bf0c972e6 | ||
|   | 4ee919625d | ||
|   | d30f25ee97 | ||
|   | 10d9d74528 | ||
|   | 43c5453e10 | ||
|   | eb4de8ae0c | ||
|   | e32c1f000f | ||
|   | 5f527bc697 | ||
|   | ced8b9a2d0 | ||
|   | 6f3cfd4396 | ||
|   | 462d993fbc | ||
|   | a99356f2a2 | ||
|   | eac2e34161 | ||
|   | a82a0b19c2 | ||
|   | 90cb6e989b | ||
|   | 6289938d7c | ||
|   | 13b8c3d9c6 | ||
|   | 88afe64a92 | ||
|   | 6b2a657573 | ||
|   | d6f38a61e1 | ||
|   | ad3f4955f7 | ||
|   | e42ab83d32 | ||
|   | f9a3f9b9f2 | ||
|   | ef7d5ea2d3 | ||
|   | 55ea304ff3 | ||
|   | fee12b3d9e | ||
|   | 6673077397 | ||
|   | 742632ed8d | ||
|   | 544d45cbc5 | ||
|   | 86f79ff1f1 | ||
|   | ee67f9f472 | ||
|   | 8ec3f41251 | ||
|   | 89be07d4d3 | ||
|   | 91200f3684 | ||
|   | 9ffd603357 | ||
|   | 80eeb8fe97 | ||
|   | 75fde870c2 | ||
|   | 6e4d423c81 | ||
|   | 57aef26217 | ||
|   | 70ec64a48b | ||
|   | 70b33ecfd9 | ||
|   | 601e4b936b | ||
|   | a292c4c437 | ||
|   | bc65ea7ab6 | ||
|   | f28bbe6b0c | ||
|   | 61d902d715 | ||
|   | 8ab462fb87 | ||
|   | df3ad3d890 | ||
|   | fc31100a0f | ||
|   | 31b6311e99 | ||
|   | 1fc8e9eb7a | ||
|   | 85b9f45085 | ||
|   | f656e3ff34 | ||
|   | e1bda94329 | ||
|   | f6b26f4ead | ||
|   | 722970a255 | ||
|   | f40820c41f | ||
|   | 49ad1b6e46 | ||
|   | 1ce32d2f18 | ||
|   | 9d5ec84b91 | ||
|   | 1fba96abcb | ||
|   | 921cec7ddc | ||
|   | 7b062883f6 | ||
|   | 64a2960751 | ||
|   | 17879a7f69 | ||
|   | 1dd24551a5 | ||
|   | 84f775013f | ||
|   | b78eb64f3d | ||
|   | d65efed561 | ||
|   | 1ca6740e05 | ||
|   | 474aa7e1cc | ||
|   | 5beb4d9a2d | ||
|   | 19eed5bdff | ||
|   | 6fa9cfd4c3 | ||
|   | 80acc4b50d | ||
|   | 2c72a9112c | ||
|   | 17207546e9 | ||
|   | 533f75ea71 | ||
|   | adb5d4ade3 | ||
|   | a879c6ab6e | ||
|   | 915e42fd07 | ||
|   | 2d8dcc87ff | ||
|   | 66f0e398a1 | ||
|   | 30be4e29fa | ||
|   | 263780e6a3 | ||
|   | 07a75a37c3 | ||
|   | f85b75df8c | ||
|   | 6644126b5d | ||
|   | c665c36d88 | ||
|   | 519a7ef435 | ||
|   | a848194601 | ||
|   | aabce764ac | ||
|   | 5a331663e4 | ||
|   | 40144f8bd8 | ||
|   | f68849c65f | ||
|   | edb52a989e | ||
|   | 980578d05a | ||
|   | 486699cef3 | ||
|   | 0096a0c077 | ||
|   | d905e95dbb | ||
|   | 61430c8739 | ||
|   | eb8b915813 | ||
|   | 22c2259adb | ||
|   | c15a1aecdf | ||
|   | 16d50b6626 | ||
|   | 651b087932 | ||
|   | bce3dc384d | ||
|   | c21ddf225b | ||
|   | 4fefb98d71 | ||
|   | ffda4e43df | ||
|   | 69cb2a7734 | ||
|   | c33de233dc | ||
|   | 85f89a7ff3 | ||
|   | 9ce20216b5 | ||
|   | 1739c54091 | ||
|   | d8bd1f518a | ||
|   | 86ba47541b | ||
|   | 492ec6a932 | ||
|   | 342076ee0e | ||
|   | d44f6966c2 | ||
|   | 5c76193045 | ||
|   | 1c1f781be4 | ||
|   | c687d4a51a | ||
|   | fca62541ca | ||
|   | 3ab3581f84 | ||
|   | 8dd0689420 | ||
|   | be4b1040f8 | ||
|   | 79be6e1dc5 | ||
|   | edbd92dbbf | ||
|   | 27b5a8e490 | ||
|   | 1f400d5964 | ||
|   | 0ca0680165 | ||
|   | ce1dbda5b4 | ||
|   | 9cce78669f | ||
|   | 6ca0ecdf05 | ||
|   | 6e9f8fd391 | ||
|   | 2fdf24495b | ||
|   | bbf9f8f130 | ||
|   | 37f8d84a9c | ||
|   | 5edd868d5b | ||
|   | e4e5e83be6 | ||
|   | 74c7b39dc8 | ||
|   | 445dffc987 | ||
|   | d97d6bf147 | ||
|   | 79efdb56f7 | ||
|   | a9af56a5e9 | ||
|   | 59f13bb8d6 | ||
|   | 463f8830d7 | ||
|   | 05ad06fbc1 | ||
|   | 29d5a40c57 | ||
|   | c0cecf8363 | ||
|   | b998339002 | ||
|   | 245c9c3dcc | ||
|   | d8f26a789e | ||
|   | e1d18708b3 | ||
|   | b44b49476d | ||
|   | 7e0bb06259 | ||
|   | ecdedfa1cf | ||
|   | 3d4b997d4a | ||
|   | e81005ae4b | ||
|   | 33a81a5f5c | ||
|   | 25e2abdb03 | ||
|   | 803e5628a2 | ||
|   | c88f20859a | ||
|   | ec3767c545 | ||
|   | 729ff0a4c7 | ||
|   | 6fe51e258f | ||
|   | 44ecb2fbe7 | ||
|   | 53e031d9f6 | ||
|   | 8ac85ea0bd | ||
|   | adfdc302d7 | ||
|   | 3053278721 | ||
|   | 4d07de0d71 | ||
|   | 953a1bba93 | ||
|   | e724ff7c93 | ||
|   | 62f0f7bfc5 | ||
|   | 9cb2b68f09 | ||
|   | 1bbc0b705f | ||
|   | 662191eca9 | ||
|   | 8fad8edc1e | ||
|   | ae3d80664c | ||
|   | e21795ee35 | ||
|   | ec95dda18f | ||
|   | 098ac45758 | ||
|   | 9889ce6b57 | ||
|   | b4d97cd545 | ||
|   | afac22c562 | ||
|   | 552cd82802 | ||
|   | dfde0e2310 | ||
|   | 54dd2f8337 | ||
|   | b5785f260e | ||
|   | 98b8ca31fa | ||
|   | 4b104b6252 | ||
|   | 83d12fcf2d | ||
|   | e4f9560655 | ||
|   | 8cfa818f04 | ||
|   | 81301f3a76 | ||
|   | 2976b4d352 | ||
|   | 9f03702e69 | ||
|   | 3300886120 | ||
|   | 0d10752b5a | ||
|   | 92886fb8d8 | ||
|   | 5916626399 | ||
|   | a7c025fd86 | ||
|   | b7a999bc2e | ||
|   | 3851065500 | ||
|   | 4b68fa771f | ||
|   | 1525aa15a6 | ||
|   | db1219d4a9 | ||
|   | b8efcc2ca5 | ||
|   | 0bae009189 | ||
|   | 3efec53f51 | ||
|   | 8b76ebb3ef | ||
|   | 467ea3a37e | ||
|   | 2b6235dc78 | ||
|   | cd5aa61834 | ||
|   | 5ccb17622e | ||
|   | 1c226c31aa | ||
|   | 9ec0d3e16a | ||
|   | cf6903d109 | ||
|   | 9fd356d290 | ||
|   | 989032fe0c | ||
|   | 05573ccc53 | ||
|   | c454fabc9d | ||
|   | 7d323ec62b | ||
|   | c7494e32ce | ||
|   | 1123c8884d | ||
|   | e1104f888d | ||
|   | 8c32da7f19 | ||
|   | d63494908d | ||
|   | b70b62cef5 | ||
|   | 868f486922 | ||
|   | b2a2b5999b | ||
|   | 595de88d96 | ||
|   | a6fdf05ee9 | ||
|   | f897d7c2e1 | ||
|   | b0f731bf84 | ||
|   | 302b8c0c34 | ||
|   | acd674f0a0 | ||
|   | b0f9e1e8b4 | ||
|   | ed2e19a150 | ||
|   | 296a169dd3 | ||
|   | 1591cb9197 | ||
|   | 0c9167512c | ||
|   | a673ab0fae | ||
|   | 6e5fdf4e9e | ||
|   | 93a5a94dab | ||
|   | d565df27b3 | ||
|   | 961f40f9a1 | ||
|   | e3ee4e515d | ||
|   | 94d6a01cca | ||
|   | 38bb66a776 | ||
|   | 68781a88ab | ||
|   | 910462bb72 | ||
|   | 6bd6adb977 | ||
|   | 0acdee15a0 | ||
|   | c3ce6bb31c | ||
|   | 0459ed093e | ||
|   | d5f29f01c5 | ||
|   | 595ba8b7ab | ||
|   | cec0a8e1fc | ||
|   | f9b2fd60e2 | ||
|   | 60cd9873bc | ||
|   | 273d56c39a | ||
|   | 5497dd2827 | ||
|   | bbfdadc463 | ||
|   | fde811ae5a | ||
|   | 07e831218e | ||
|   | 91c33596da | ||
|   | a8dcf941b9 | ||
|   | e7a51decb0 | ||
|   | 9ec19be113 | ||
|   | f776186480 | ||
|   | 0096d83387 | ||
|   | 20a24dbcbf | ||
|   | 502654d853 | ||
|   | d2103d7c44 | ||
|   | d96a361325 | ||
|   | 2e85d26b6b | ||
|   | 6431a3fb3d | ||
|   | ac3bfd7388 | ||
|   | 3ea86d18a0 | ||
|   | bbc792f9fb | ||
|   | 7e127cd5cc | ||
|   | c4fb92c658 | ||
|   | 8da1ac6cee | ||
|   | a18db57e6f | ||
|   | b915e393dd | ||
|   | 3a74c23d09 | ||
|   | fbebc46c58 | ||
|   | 5595a908d8 | ||
|   | 27e4abcfa3 | ||
|   | c1ab7485e2 | ||
|   | 29cd5d1a3c | ||
|   | 6d5d9333ad | ||
|   | 7cc40595dc | ||
|   | 80ae5ddfaa | ||
|   | 4f480d117e | ||
|   | 1f2af3a290 | ||
|   | 14cdfb6a69 | ||
|   | e2bf84392b | ||
|   | 946b7a7931 | ||
|   | 9a9018751e | ||
|   | 83b75e8254 | ||
|   | 35c3fa205d | ||
|   | 0b606f02fa | ||
|   | fb78a6e98e | ||
|   | 5de68a0400 | ||
|   | f0562049b6 | ||
|   | 0e1077bb50 | ||
|   | c978e9edf4 | ||
|   | 2714ac6be6 | ||
|   | 9b048a9cfc | ||
|   | 1c2b6bf994 | ||
|   | ee39aaf08b | ||
|   | 93e6329901 | ||
|   | f47b137b59 | ||
|   | 83ea15ee83 | ||
|   | 75471c46d1 | ||
|   | 1e0343bba6 | ||
|   | 0f5e55648b | ||
|   | 57259e21f4 | ||
|   | 4ce385b262 | ||
|   | 2d64409542 | ||
|   | fcb3884a8f | ||
|   | 9f6dc56a7b | ||
|   | 56ab473611 | ||
|   | 6426060804 | ||
|   | 49a0ca7a7c | ||
|   | f3a4663491 | ||
|   | ecdbca8fb6 | ||
|   | 9cbea5fe06 | ||
|   | ba3c7f85fa | 
							
								
								
									
										10
									
								
								.github/dependabot.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								.github/dependabot.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | |||||||
|  | version: 2 | ||||||
|  | updates: | ||||||
|  |   - package-ecosystem: github-actions | ||||||
|  |     directory: / | ||||||
|  |     schedule: | ||||||
|  |       interval: monthly | ||||||
|  |     groups: | ||||||
|  |       gh-actions: | ||||||
|  |         patterns: | ||||||
|  |           - "*" | ||||||
							
								
								
									
										85
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										85
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,85 @@ | |||||||
|  | name: build-and-release | ||||||
|  |  | ||||||
|  | on: [push, pull_request] | ||||||
|  |  | ||||||
|  | defaults: | ||||||
|  |   run: | ||||||
|  |     shell: bash | ||||||
|  |  | ||||||
|  | jobs: | ||||||
|  |  | ||||||
|  |   build: | ||||||
|  |     runs-on: ${{ matrix.os }} | ||||||
|  |     strategy: | ||||||
|  |       fail-fast: false | ||||||
|  |       matrix: | ||||||
|  |         os: [ubuntu-latest, windows-latest, macos-13, macos-latest] | ||||||
|  |         python: ["3.11"] | ||||||
|  |     steps: | ||||||
|  |       - uses: actions/checkout@v4 | ||||||
|  |  | ||||||
|  |       - uses: actions/setup-python@v5 | ||||||
|  |         with: | ||||||
|  |           python-version: ${{ matrix.python }} | ||||||
|  |  | ||||||
|  |       - name: Set up project | ||||||
|  |         if: matrix.os != 'windows-latest' | ||||||
|  |         run: ./scripts/setup | ||||||
|  |  | ||||||
|  |       - name: Set up project on windows | ||||||
|  |         if: matrix.os == 'windows-latest' | ||||||
|  |         # For some reason, `pip install --upgrade pip` doesn't work on | ||||||
|  |         # 'windows-latest'. The installed pip version works fine however. | ||||||
|  |         run: ./scripts/setup --no-pip | ||||||
|  |  | ||||||
|  |       - name: Run checks | ||||||
|  |         run: | | ||||||
|  |           ./scripts/check | ||||||
|  |           ./scripts/format | ||||||
|  |  | ||||||
|  |       - name: Assert no changes | ||||||
|  |         run: git diff --exit-code | ||||||
|  |  | ||||||
|  |       - name: Build | ||||||
|  |         run: ./scripts/build | ||||||
|  |  | ||||||
|  |       - name: Rename binary | ||||||
|  |         # Glob in source location because on windows pyinstaller creates a file | ||||||
|  |         # named "pferd.exe" | ||||||
|  |         run: mv dist/pferd* dist/pferd-${{ matrix.os }} | ||||||
|  |  | ||||||
|  |       - name: Upload binary | ||||||
|  |         uses: actions/upload-artifact@v4 | ||||||
|  |         with: | ||||||
|  |           name: pferd-${{ matrix.os }} | ||||||
|  |           path: dist/pferd-${{ matrix.os }} | ||||||
|  |  | ||||||
|  |   release: | ||||||
|  |     runs-on: ubuntu-latest | ||||||
|  |     if: startsWith(github.ref, 'refs/tags/v') | ||||||
|  |     needs: build | ||||||
|  |     steps: | ||||||
|  |  | ||||||
|  |       - name: Download binaries | ||||||
|  |         uses: actions/download-artifact@v4 | ||||||
|  |         with: | ||||||
|  |           pattern: pferd-* | ||||||
|  |           merge-multiple: true | ||||||
|  |  | ||||||
|  |       - name: Rename binaries | ||||||
|  |         run: | | ||||||
|  |           mv pferd-ubuntu-latest pferd-linux | ||||||
|  |           mv pferd-windows-latest pferd-windows.exe | ||||||
|  |           mv pferd-macos-13 pferd-mac-x86_64 | ||||||
|  |           mv pferd-macos-latest pferd-mac | ||||||
|  |  | ||||||
|  |       - name: Create release | ||||||
|  |         uses: softprops/action-gh-release@v2 | ||||||
|  |         env: | ||||||
|  |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |         with: | ||||||
|  |           files: | | ||||||
|  |             pferd-linux | ||||||
|  |             pferd-windows.exe | ||||||
|  |             pferd-mac | ||||||
|  |             pferd-mac-x86_64 | ||||||
							
								
								
									
										74
									
								
								.github/workflows/package.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										74
									
								
								.github/workflows/package.yml
									
									
									
									
										vendored
									
									
								
							| @@ -1,74 +0,0 @@ | |||||||
| name: Package Application with Pyinstaller |  | ||||||
|  |  | ||||||
| on: |  | ||||||
|   push: |  | ||||||
|     branches: |  | ||||||
|       - "*" |  | ||||||
|     tags: |  | ||||||
|       - "v*" |  | ||||||
|  |  | ||||||
| jobs: |  | ||||||
|   build: |  | ||||||
|  |  | ||||||
|     runs-on: ${{ matrix.os }} |  | ||||||
|     strategy: |  | ||||||
|       matrix: |  | ||||||
|         os: [ubuntu-latest, windows-latest, macos-latest] |  | ||||||
|  |  | ||||||
|     steps: |  | ||||||
|     - uses: actions/checkout@v2 |  | ||||||
|  |  | ||||||
|     - uses: actions/setup-python@v2 |  | ||||||
|       with: |  | ||||||
|         python-version: '3.x' |  | ||||||
|  |  | ||||||
|     - name: "Install dependencies" |  | ||||||
|       run: "pip install setuptools pyinstaller rich requests beautifulsoup4 -f --upgrade" |  | ||||||
|  |  | ||||||
|     - name: "Install sync_url.py" |  | ||||||
|       run: "pyinstaller sync_url.py -F" |  | ||||||
|  |  | ||||||
|     - name: "Move artifact" |  | ||||||
|       run: "mv dist/sync_url* dist/sync_url-${{ matrix.os }}" |  | ||||||
|  |  | ||||||
|     - uses: actions/upload-artifact@v2 |  | ||||||
|       with: |  | ||||||
|         name: "Pferd Sync URL" |  | ||||||
|         path: "dist/sync_url*" |  | ||||||
|  |  | ||||||
|   release: |  | ||||||
|     name: Release |  | ||||||
|  |  | ||||||
|     needs: [build] |  | ||||||
|     runs-on: ubuntu-latest |  | ||||||
|     if: startsWith(github.ref, 'refs/tags/') |  | ||||||
|  |  | ||||||
|     env: |  | ||||||
|       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} |  | ||||||
|  |  | ||||||
|     steps: |  | ||||||
|     - name: "Checkout" |  | ||||||
|       uses: actions/checkout@v2 |  | ||||||
|  |  | ||||||
|     - name: "Download artifacts" |  | ||||||
|       uses: actions/download-artifact@v2 |  | ||||||
|       with: |  | ||||||
|         name: "Pferd Sync URL" |  | ||||||
|  |  | ||||||
|     - name: "look at folder structure" |  | ||||||
|       run: "ls -lah" |  | ||||||
|  |  | ||||||
|     - name: "Rename releases" |  | ||||||
|       run: "mv sync_url-macos-latest pferd_sync_url_mac && mv sync_url-ubuntu-latest pferd_sync_url_linux && mv sync_url-windows-latest pferd_sync_url.exe" |  | ||||||
|  |  | ||||||
|     - name: "Create release" |  | ||||||
|       uses: softprops/action-gh-release@v1 |  | ||||||
|  |  | ||||||
|     - name: "Upload release artifacts" |  | ||||||
|       uses: softprops/action-gh-release@v1 |  | ||||||
|       with: |  | ||||||
|         body: "Download the correct sync_url for your platform and run it in the terminal or CMD. You might need to make it executable on Linux/Mac with `chmod +x <file>`. Also please enclose the *url you pass to the program in double quotes* or your shell might silently screw it up!" |  | ||||||
|         files: | |  | ||||||
|           pferd_sync_url_mac |  | ||||||
|           pferd_sync_url_linux |  | ||||||
|           pferd_sync_url.exe |  | ||||||
							
								
								
									
										21
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										21
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,14 +1,11 @@ | |||||||
| __pycache__/ |  | ||||||
| .venv/ |  | ||||||
| venv/ |  | ||||||
| .idea/ |  | ||||||
| build/ |  | ||||||
| .mypy_cache/ | .mypy_cache/ | ||||||
| .tmp/ | /.venv/ | ||||||
| .env | /PFERD.egg-info/ | ||||||
| .vscode | __pycache__/ | ||||||
| ilias_cookies.txt | /.vscode/ | ||||||
|  | /.idea/ | ||||||
|  |  | ||||||
| # PyInstaller | # pyinstaller | ||||||
| sync_url.spec | /pferd.spec | ||||||
| dist/ | /build/ | ||||||
|  | /dist/ | ||||||
|   | |||||||
							
								
								
									
										290
									
								
								CHANGELOG.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										290
									
								
								CHANGELOG.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,290 @@ | |||||||
|  | # Changelog | ||||||
|  |  | ||||||
|  | All notable changes to this project will be documented in this file. The format | ||||||
|  | is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). | ||||||
|  |  | ||||||
|  | This project has its own custom versioning scheme. Version numbers consist of | ||||||
|  | three parts (e. g. `3.1.5`). | ||||||
|  | - The first number is increased on major rewrites or changes. What classifies as | ||||||
|  |   a major change is up to the maintainers. This is pretty rare and a PFERD | ||||||
|  |   version 4 should hopefully not be necessary. | ||||||
|  | - The second number is increased on backwards-incompatible changes in behaviour. | ||||||
|  |   This refers to any change that would make an existing setup behave differently | ||||||
|  |   (e. g. renaming options or changing crawler behaviour). If this number is | ||||||
|  |   increased, it may be necessary for you to adapt your own setup. | ||||||
|  | - The third number is increased on backwards-compatible changes (e. g. adding | ||||||
|  |   new options or commands, changing documentation, fixing bugs). Updates that | ||||||
|  |   only increase this number should be safe and not require manual intervention. | ||||||
|  |  | ||||||
|  | We will try to correctly classify changes as backwards-compatible or | ||||||
|  | backwards-incompatible, but may occasionally make mistakes or stumble across | ||||||
|  | ambiguous situations. | ||||||
|  |  | ||||||
|  | ## Unreleased | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Crawling of exercises with instructions | ||||||
|  |  | ||||||
|  | ## 3.8.2 - 2025-04-29 | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Explicitly mention that wikis are not supported at the moment and ignore them | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Ilias-native login | ||||||
|  | - Exercise crawling | ||||||
|  |  | ||||||
|  | ## 3.8.1 - 2025-04-17 | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Description html files now specify a UTF-8 encoding | ||||||
|  | - Images in descriptions now always have a white background | ||||||
|  |  | ||||||
|  | ## 3.8.0 - 2025-04-16 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Support for ILIAS 9 | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Added prettier CSS to forum threads | ||||||
|  | - Downloaded forum threads now link to the forum instead of the ILIAS thread | ||||||
|  | - Increase minimum supported Python version to 3.11 | ||||||
|  | - Do not crawl nested courses (courses linked in other courses) | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - File links in report on Windows | ||||||
|  | - TOTP authentication in KIT Shibboleth | ||||||
|  | - Forum crawling only considering the first 20 entries | ||||||
|  |  | ||||||
|  | ## 3.7.0 - 2024-11-13 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Support for MOB videos in page descriptions | ||||||
|  | - Clickable links in the report to directly open new/modified/not-deleted files | ||||||
|  | - Support for non-KIT Shibboleth login | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Remove videos from description pages | ||||||
|  | - Perform ILIAS cycle detection after processing the transform to allow | ||||||
|  |   ignoring duplicated elements | ||||||
|  | - Parse headings (h1-h3) as folders in kit-ipd crawler | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Personal desktop/dashboard/favorites crawling | ||||||
|  | - Crawling of nested courses | ||||||
|  | - Downloading of links with no target URL | ||||||
|  | - Handle row flex on description pages | ||||||
|  | - Add `<!DOCTYPE html>` heading to forum threads to fix mime type detection | ||||||
|  | - Handle groups in cards | ||||||
|  |  | ||||||
|  | ## 3.6.0 - 2024-10-23 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Generic `ilias-web` crawler and `ilias-web` CLI command | ||||||
|  | - Support for the course overview page. Using this URL as a target might cause | ||||||
|  |   duplication warnings, as subgroups are listed separately. | ||||||
|  | - Support for named capture groups in regex transforms | ||||||
|  | - Crawl custom item groups as folders | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Normalization of meeting names in cards | ||||||
|  | - Sanitization of slashes in exercise container names | ||||||
|  |  | ||||||
|  | ## 3.5.2 - 2024-04-14 | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Crawling of personal desktop with ILIAS 8 | ||||||
|  | - Crawling of empty personal desktops | ||||||
|  |  | ||||||
|  | ## 3.5.1 - 2024-04-09 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Support for ILIAS 8 | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Video name deduplication | ||||||
|  |  | ||||||
|  | ## 3.5.0 - 2023-09-13 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - `no-delete-prompt-override` conflict resolution strategy | ||||||
|  | - Support for ILIAS learning modules | ||||||
|  | - `show_not_deleted` option to stop printing the "Not Deleted" status or report | ||||||
|  |   message. This combines nicely with the `no-delete-prompt-override` strategy, | ||||||
|  |   causing PFERD to mostly ignore local-only files. | ||||||
|  | - Support for mediacast video listings | ||||||
|  | - Crawling of files in info tab | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Remove size suffix for files in content pages | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Crawling of courses with the timeline view as the default tab | ||||||
|  | - Crawling of file and custom opencast cards | ||||||
|  | - Crawling of button cards without descriptions | ||||||
|  | - Abort crawling when encountering an unexpected ilias root page redirect | ||||||
|  | - Sanitize ascii control characters on Windows | ||||||
|  | - Crawling of paginated past meetings | ||||||
|  | - Ignore SCORM learning modules | ||||||
|  |  | ||||||
|  | ## 3.4.3 - 2022-11-29 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Missing documentation for `forums` option | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Clear up error message shown when multiple paths are found to an element | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - IPD crawler unnecessarily appending trailing slashes | ||||||
|  | - Crawling opencast when ILIAS is set to English | ||||||
|  |  | ||||||
|  | ## 3.4.2 - 2022-10-26 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Recognize and crawl content pages in cards | ||||||
|  | - Recognize and ignore surveys | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Forum crawling crashing when a thread has no messages at all | ||||||
|  | - Forum crawling crashing when a forum has no threads at all | ||||||
|  | - Ilias login failing in some cases | ||||||
|  | - Crawling of paginated future meetings | ||||||
|  | - IPD crawler handling of URLs without trailing slash | ||||||
|  |  | ||||||
|  | ## 3.4.1 - 2022-08-17 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Download of page descriptions | ||||||
|  | - Forum download support | ||||||
|  | - `pass` authenticator | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Add `cpp` extension to default `link_regex` of IPD crawler | ||||||
|  | - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option | ||||||
|  | - Simplify default IPD crawler `link_regex` | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - IPD crawler crashes on some sites | ||||||
|  | - Meeting name normalization for yesterday, today and tomorrow | ||||||
|  | - Crawling of meeting file previews | ||||||
|  | - Login with new login button html layout | ||||||
|  | - Descriptions for courses are now placed in the correct subfolder when | ||||||
|  |   downloading the whole desktop | ||||||
|  |  | ||||||
|  | ## 3.4.0 - 2022-05-01 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Message when Shibboleth entitlements need to be manually reviewed | ||||||
|  | - Links to unofficial packages and repology in the readme | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Increase minimum supported Python version to 3.9 | ||||||
|  | - Support video listings with more columns | ||||||
|  | - Use UTF-8 when reading/writing the config file | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Crash during authentication when the Shibboleth session is still valid | ||||||
|  |  | ||||||
|  | ## 3.3.1 - 2022-01-15 | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - ILIAS login | ||||||
|  | - Local video cache if `windows_paths` is enabled | ||||||
|  |  | ||||||
|  | ## 3.3.0 - 2022-01-09 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - A KIT IPD crawler | ||||||
|  | - Support for ILIAS cards | ||||||
|  | - (Rudimentary) support for content pages | ||||||
|  | - Support for multi-stream videos | ||||||
|  | - Support for ILIAS 7 | ||||||
|  |  | ||||||
|  | ### Removed | ||||||
|  | - [Interpolation](https://docs.python.org/3/library/configparser.html#interpolation-of-values) in config file | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Crawling of recursive courses | ||||||
|  | - Crawling files directly placed on the personal desktop | ||||||
|  | - Ignore timestamps at the Unix epoch as they crash on Windows | ||||||
|  |  | ||||||
|  | ## 3.2.0 - 2021-08-04 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - `--skip` command line option | ||||||
|  | - Support for ILIAS booking objects | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Using multiple path segments on left side of `-name->` now results in an | ||||||
|  |   error. This was already forbidden by the documentation but silently accepted | ||||||
|  |   by PFERD. | ||||||
|  | - More consistent path printing in some `--explain` messages | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Nondeterministic name deduplication due to ILIAS reordering elements | ||||||
|  | - More exceptions are handled properly | ||||||
|  |  | ||||||
|  | ## 3.1.0 - 2021-06-13 | ||||||
|  |  | ||||||
|  | If your config file doesn't do weird things with transforms, it should continue | ||||||
|  | to work. If your `-re->` arrows behave weirdly, try replacing them with | ||||||
|  | `-exact-re->` arrows. If you're on Windows, you might need to switch from `\` | ||||||
|  | path separators to `/` in your regex rules. | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - `skip` option for crawlers | ||||||
|  | - Rules with `>>` instead of `>` as arrow head | ||||||
|  | - `-exact-re->` arrow (behaves like `-re->` did previously) | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - The `-re->` arrow can now rename directories (like `-->`) | ||||||
|  | - Use `/` instead of `\` as path separator for (regex) rules on Windows | ||||||
|  | - Use the label to the left for exercises instead of the button name to | ||||||
|  |   determine the folder name | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Video pagination handling in ILIAS crawler | ||||||
|  |  | ||||||
|  | ## 3.0.1 - 2021-06-01 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - `credential-file` authenticator | ||||||
|  | - `--credential-file` option for `kit-ilias-web` command | ||||||
|  | - Warning if using concurrent tasks with `kit-ilias-web` | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Cookies are now stored in a text-based format | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Date parsing now also works correctly in non-group exercises | ||||||
|  |  | ||||||
|  | ## 3.0.0 - 2021-05-31 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Proper config files | ||||||
|  | - Concurrent crawling | ||||||
|  | - Crawl external ILIAS links | ||||||
|  | - Crawl uploaded exercise solutions | ||||||
|  | - Explain what PFERD is doing and why (`--explain`) | ||||||
|  | - More control over output (`--status`, `--report`) | ||||||
|  | - Debug transform rules with `--debug-transforms` | ||||||
|  | - Print report after exiting via Ctrl+C | ||||||
|  | - Store crawler reports in `.report` JSON file | ||||||
|  | - Extensive config file documentation (`CONFIG.md`) | ||||||
|  | - Documentation for developers (`DEV.md`) | ||||||
|  | - This changelog | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Rewrote almost everything | ||||||
|  | - Better error messages | ||||||
|  | - Redesigned CLI | ||||||
|  | - Redesigned transform rules | ||||||
|  | - ILIAS crawling logic (paths may be different) | ||||||
|  | - Better support for weird paths on Windows | ||||||
|  | - Set user agent (`PFERD/<version>`) | ||||||
|  |  | ||||||
|  | ### Removed | ||||||
|  | - Backwards compatibility with 2.x | ||||||
|  | - Python files as config files | ||||||
|  | - Some types of crawlers | ||||||
							
								
								
									
										537
									
								
								CONFIG.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										537
									
								
								CONFIG.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,537 @@ | |||||||
|  | # Config file format | ||||||
|  |  | ||||||
|  | A config file consists of sections. A section begins with a `[section]` header, | ||||||
|  | which is followed by a list of `key = value` pairs. Comments must be on their | ||||||
|  | own line and start with `#`. Multiline values must be indented beyond their key. | ||||||
|  | Boolean values can be `yes` or `no`. For more details and some examples on the | ||||||
|  | format, see the [configparser documentation][cp-file] | ||||||
|  | ([interpolation][cp-interp] is disabled). | ||||||
|  |  | ||||||
|  | [cp-file]: <https://docs.python.org/3/library/configparser.html#supported-ini-file-structure> "Supported INI File Structure" | ||||||
|  | [cp-interp]: <https://docs.python.org/3/library/configparser.html#interpolation-of-values> "Interpolation of values" | ||||||
|  |  | ||||||
|  | ## The `DEFAULT` section | ||||||
|  |  | ||||||
|  | This section contains global configuration values. It can also be used to set | ||||||
|  | default values for the other sections. | ||||||
|  |  | ||||||
|  | - `working_dir`: The directory PFERD operates in. Set to an absolute path to | ||||||
|  |   make PFERD operate the same regardless of where it is executed from. All other | ||||||
|  |   paths in the config file are interpreted relative to this path. If this path | ||||||
|  |   is relative, it is interpreted relative to the script's working dir. `~` is | ||||||
|  |   expanded to the current user's home directory. (Default: `.`) | ||||||
|  | - `explain`: Whether PFERD should log and explain its actions and decisions in | ||||||
|  |   detail. (Default: `no`) | ||||||
|  | - `status`: Whether PFERD should print status updates (like `Crawled ...`, | ||||||
|  |   `Added ...`) while running a crawler. (Default: `yes`) | ||||||
|  | - `report`: Whether PFERD should print a report of added, changed and deleted | ||||||
|  |    local files for all crawlers before exiting. (Default: `yes`) | ||||||
|  | - `show_not_deleted`: Whether PFERD should print messages in status and report | ||||||
|  |    when a local-only file wasn't deleted. Combines nicely with the | ||||||
|  |    `no-delete-prompt-overwrite` conflict resolution strategy. | ||||||
|  | - `share_cookies`: Whether crawlers should share cookies where applicable. For | ||||||
|  |   example, some crawlers share cookies if they crawl the same website using the | ||||||
|  |   same account. (Default: `yes`) | ||||||
|  |  | ||||||
|  | ## The `crawl:*` sections | ||||||
|  |  | ||||||
|  | Sections whose names start with `crawl:` are used to configure crawlers. The | ||||||
|  | rest of the section name specifies the name of the crawler. | ||||||
|  |  | ||||||
|  | A crawler synchronizes a remote resource to a local directory. There are | ||||||
|  | different types of crawlers for different kinds of resources, e.g. ILIAS | ||||||
|  | courses or lecture websites. | ||||||
|  |  | ||||||
|  | Each crawl section represents an instance of a specific type of crawler. The | ||||||
|  | `type` option is used to specify the crawler type. The crawler's name is usually | ||||||
|  | used as the output directory. New crawlers can be created simply by adding a new | ||||||
|  | crawl section to the config file. | ||||||
|  |  | ||||||
|  | Depending on a crawler's type, it may have different options. For more details, | ||||||
|  | see the type's [documentation](#crawler-types) below. The following options are | ||||||
|  | common to all crawlers: | ||||||
|  |  | ||||||
|  | - `type`: The available types are specified in [this section](#crawler-types). | ||||||
|  | - `skip`: Whether the crawler should be skipped during normal execution. The | ||||||
|  |   crawler can still be executed manually using the `--crawler` or `-C` flags. | ||||||
|  |   (Default: `no`) | ||||||
|  | - `output_dir`: The directory the crawler synchronizes files to. A crawler will | ||||||
|  |   never place any files outside this directory. (Default: the crawler's name) | ||||||
|  | - `redownload`: When to download a file that is already present locally. | ||||||
|  |   (Default: `never-smart`) | ||||||
|  |     - `never`: If a file is present locally, it is not downloaded again. | ||||||
|  |     - `never-smart`: Like `never`, but PFERD tries to detect if an already | ||||||
|  |       downloaded file has changed via some (unreliable) heuristics. | ||||||
|  |     - `always`: All files are always downloaded, regardless of whether they are | ||||||
|  |       already present locally. | ||||||
|  |     - `always-smart`: Like `always`, but PFERD tries to avoid unnecessary | ||||||
|  |       downloads via some (unreliable) heuristics. | ||||||
|  | - `on_conflict`: What to do when the local and remote versions of a file or | ||||||
|  |   directory differ, including when a file is replaced by a directory or a | ||||||
|  |   directory by a file. (Default: `prompt`) | ||||||
|  |     - `prompt`: Always ask the user before overwriting or deleting local files | ||||||
|  |       and directories. | ||||||
|  |     - `local-first`: Always keep the local file or directory. Equivalent to | ||||||
|  |       using `prompt` and always choosing "no". Implies that `redownload` is set | ||||||
|  |       to `never`. | ||||||
|  |     - `remote-first`: Always keep the remote file or directory. Equivalent to | ||||||
|  |       using `prompt` and always choosing "yes". | ||||||
|  |     - `no-delete`: Never delete local files, but overwrite local files if the | ||||||
|  |       remote file is different. | ||||||
|  |     - `no-delete-prompt-overwrite`: Never delete local files, but prompt to | ||||||
|  |       overwrite local files if the remote file is different. Combines nicely | ||||||
|  |       with the `show_not_deleted` option. | ||||||
|  | - `transform`: Rules for renaming and excluding certain files and directories. | ||||||
|  |   For more details, see [this section](#transformation-rules). (Default: empty) | ||||||
|  | - `tasks`: The maximum number of concurrent tasks (such as crawling or | ||||||
|  |   downloading). (Default: `1`) | ||||||
|  | - `downloads`: How many of those tasks can be download tasks at the same time. | ||||||
|  |   Must not be greater than `tasks`. (Default: Same as `tasks`) | ||||||
|  | - `task_delay`: Time (in seconds) that the crawler should wait between | ||||||
|  |   subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary | ||||||
|  |   load for the crawl target. (Default: `0.0`) | ||||||
|  | - `windows_paths`: Whether PFERD should find alternative names for paths that | ||||||
|  |   are invalid on Windows. (Default: `yes` on Windows, `no` otherwise) | ||||||
|  |  | ||||||
|  | Some crawlers may also require credentials for authentication. To configure how | ||||||
|  | the crawler obtains its credentials, the `auth` option is used. It is set to the | ||||||
|  | full name of an auth section (including the `auth:` prefix). | ||||||
|  |  | ||||||
|  | Here is a simple example: | ||||||
|  |  | ||||||
|  | ```ini | ||||||
|  | [auth:example] | ||||||
|  | type = simple | ||||||
|  | username = foo | ||||||
|  | password = bar | ||||||
|  |  | ||||||
|  | [crawl:something] | ||||||
|  | type = some-complex-crawler | ||||||
|  | auth = auth:example | ||||||
|  | on_conflict = no-delete | ||||||
|  | tasks = 3 | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | ## The `auth:*` sections | ||||||
|  |  | ||||||
|  | Sections whose names start with `auth:` are used to configure authenticators. An | ||||||
|  | authenticator provides a username and a password to one or more crawlers. | ||||||
|  |  | ||||||
|  | Authenticators work similarly to crawlers: A section represents an authenticator | ||||||
|  | instance whose name is the rest of the section name. The type is specified by | ||||||
|  | the `type` option. | ||||||
|  |  | ||||||
|  | Depending on an authenticator's type, it may have different options. For more | ||||||
|  | details, see the type's [documentation](#authenticator-types) below. The only | ||||||
|  | option common to all authenticators is `type`: | ||||||
|  |  | ||||||
|  | - `type`: The types are specified in [this section](#authenticator-types). | ||||||
|  |  | ||||||
|  | ## Crawler types | ||||||
|  |  | ||||||
|  | ### The `local` crawler | ||||||
|  |  | ||||||
|  | This crawler crawls a local directory. It is really simple and mostly useful for | ||||||
|  | testing different setups. The various delay options are meant to make the | ||||||
|  | crawler simulate a slower, network-based crawler. | ||||||
|  |  | ||||||
|  | - `target`: Path to the local directory to crawl. (Required) | ||||||
|  | - `crawl_delay`: Artificial delay (in seconds) to simulate for crawl requests. | ||||||
|  |   (Default: `0.0`) | ||||||
|  | - `download_delay`: Artificial delay (in seconds) to simulate for download | ||||||
|  |   requests. (Default: `0.0`) | ||||||
|  | - `download_speed`: Download speed (in bytes per second) to simulate. (Optional) | ||||||
|  |  | ||||||
|  | ### The `kit-ipd` crawler | ||||||
|  |  | ||||||
|  | This crawler crawls a KIT-IPD page by url. The root page can be crawled from | ||||||
|  | outside the KIT network so you will be informed about any new/deleted files, | ||||||
|  | but downloading files requires you to be within. Adding a short delay between | ||||||
|  | requests is likely a good idea. | ||||||
|  |  | ||||||
|  | - `target`: URL to a KIT-IPD page | ||||||
|  | - `link_regex`: A regex that is matched against the `href` part of links. If it | ||||||
|  |   matches, the given link is downloaded as a file. This is used to extract | ||||||
|  |   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`) | ||||||
|  |  | ||||||
|  | ### The `ilias-web` crawler | ||||||
|  |  | ||||||
|  | This crawler crawls a generic ILIAS instance. | ||||||
|  |  | ||||||
|  | Inspired by [this ILIAS downloader][ilias-dl], the following configurations should work | ||||||
|  | out of the box for the corresponding universities: | ||||||
|  |  | ||||||
|  | [ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs" | ||||||
|  |  | ||||||
|  | | University    | `base_url`                              | `login_type` | `client_id`   | | ||||||
|  | |---------------|-----------------------------------------|--------------|---------------| | ||||||
|  | | FH Aachen     | https://www.ili.fh-aachen.de            | local        | elearning     | | ||||||
|  | | Uni Köln      | https://www.ilias.uni-koeln.de/ilias    | local        | uk            | | ||||||
|  | | Uni Konstanz  | https://ilias.uni-konstanz.de           | local        | ILIASKONSTANZ | | ||||||
|  | | Uni Stuttgart | https://ilias3.uni-stuttgart.de         | local        | Uni_Stuttgart | | ||||||
|  | | Uni Tübingen  | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth   |               | | ||||||
|  |  | ||||||
|  | If your university isn't listed, try navigating to your instance's login page. | ||||||
|  | Assuming no custom login service is used, the URL will look something like this: | ||||||
|  |  | ||||||
|  | ```jinja | ||||||
|  | {{ base_url }}/login.php?client_id={{ client_id }}&cmd=force_login&lang= | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | If the values work, feel free to submit a PR and add them to the table above. | ||||||
|  |  | ||||||
|  | - `base_url`: The URL where the ILIAS instance is located. (Required) | ||||||
|  | - `login_type`: How you authenticate. (Required) | ||||||
|  |     - `local`: Use `client_id` for authentication. | ||||||
|  |     - `shibboleth`: Use shibboleth for authentication. | ||||||
|  | - `client_id`: An ID used for authentication if `login_type` is `local`. Is | ||||||
|  |   ignored if `login_type` is `shibboleth`. | ||||||
|  | - `target`: The ILIAS element to crawl. (Required) | ||||||
|  |     - `desktop`: Crawl your personal desktop / dashboard | ||||||
|  |     - `<course id>`: Crawl the course with the given id | ||||||
|  |     - `<url>`: Crawl a given element by URL (preferably the permanent URL linked | ||||||
|  |       at the bottom of its ILIAS page).   | ||||||
|  |       This also supports the "My Courses" overview page to download *all* | ||||||
|  |       courses. Note that this might produce confusing local directory layouts | ||||||
|  |       and duplication warnings if you are a member of an ILIAS group. The | ||||||
|  |       `desktop` target is generally preferable. | ||||||
|  | - `auth`: Name of auth section to use for login. (Required) | ||||||
|  | - `tfa_auth`: Name of auth section to use for two-factor authentication. Only | ||||||
|  |   uses the auth section's password. (Default: Anonymous `tfa` authenticator) | ||||||
|  | - `links`: How to represent external links. (Default: `fancy`) | ||||||
|  |     - `ignore`: Don't download links. | ||||||
|  |     - `plaintext`: A text file containing only the URL. | ||||||
|  |     - `fancy`: An HTML file looking like the ILIAS link element. | ||||||
|  |     - `internet-shortcut`: An internet shortcut file (`.url` file). | ||||||
|  | - `link_redirect_delay`: Time (in seconds) until `fancy` link files will | ||||||
|  |   redirect to the actual URL. Set to a negative value to disable the automatic | ||||||
|  |   redirect. (Default: `-1`) | ||||||
|  | - `videos`: Whether to download videos. (Default: `no`) | ||||||
|  | - `forums`: Whether to download forum threads. (Default: `no`) | ||||||
|  | - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: | ||||||
|  |   `20.0`) | ||||||
|  |  | ||||||
|  | ### The `kit-ilias-web` crawler | ||||||
|  |  | ||||||
|  | This crawler crawls the KIT ILIAS instance. | ||||||
|  |  | ||||||
|  | ILIAS is not great at handling too many concurrent requests. To avoid | ||||||
|  | unnecessary load, please limit `tasks` to `1`. | ||||||
|  |  | ||||||
|  | There is a spike in ILIAS usage at the beginning of lectures, so please don't | ||||||
|  | run PFERD during those times. | ||||||
|  |  | ||||||
|  | If you're automatically running PFERD periodically (e. g. via cron or a systemd | ||||||
|  | timer), please randomize the start time or at least don't use the full hour. For | ||||||
|  | systemd timers, this can be accomplished using the `RandomizedDelaySec` option. | ||||||
|  | Also, please schedule the script to run in periods of low activity. Running the | ||||||
|  | script once per day should be fine. | ||||||
|  |  | ||||||
|  | - `target`: The ILIAS element to crawl. (Required) | ||||||
|  |     - `desktop`: Crawl your personal desktop | ||||||
|  |     - `<course id>`: Crawl the course with the given id | ||||||
|  |     - `<url>`: Crawl a given element by URL (preferably the permanent URL linked | ||||||
|  |       at the bottom of its ILIAS page) | ||||||
|  | - `auth`: Name of auth section to use for login. (Required) | ||||||
|  | - `tfa_auth`: Name of auth section to use for two-factor authentication. Only | ||||||
|  |   uses the auth section's password. (Default: Anonymous `tfa` authenticator) | ||||||
|  | - `links`: How to represent external links. (Default: `fancy`) | ||||||
|  |     - `ignore`: Don't download links. | ||||||
|  |     - `plaintext`: A text file containing only the URL. | ||||||
|  |     - `fancy`: An HTML file looking like the ILIAS link element. | ||||||
|  |     - `internet-shortcut`: An internet shortcut file (`.url` file). | ||||||
|  | - `link_redirect_delay`: Time (in seconds) until `fancy` link files will | ||||||
|  |   redirect to the actual URL. Set to a negative value to disable the automatic | ||||||
|  |   redirect. (Default: `-1`) | ||||||
|  | - `videos`: Whether to download videos. (Default: `no`) | ||||||
|  | - `forums`: Whether to download forum threads. (Default: `no`) | ||||||
|  | - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: | ||||||
|  |   `20.0`) | ||||||
|  |  | ||||||
|  | ## Authenticator types | ||||||
|  |  | ||||||
|  | ### The `simple` authenticator | ||||||
|  |  | ||||||
|  | With this authenticator, the username and password can be set directly in the | ||||||
|  | config file. If the username or password are not specified, the user is prompted | ||||||
|  | via the terminal. | ||||||
|  |  | ||||||
|  | - `username`: The username. (Optional) | ||||||
|  | - `password`: The password. (Optional) | ||||||
|  |  | ||||||
|  | ### The `credential-file` authenticator | ||||||
|  |  | ||||||
|  | This authenticator reads a username and a password from a credential file. | ||||||
|  |  | ||||||
|  | - `path`: Path to the credential file. (Required) | ||||||
|  |  | ||||||
|  | The credential file has exactly two lines (trailing newline optional). The first | ||||||
|  | line starts with `username=` and contains the username, the second line starts | ||||||
|  | with `password=` and contains the password. The username and password may | ||||||
|  | contain any characters except a line break. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | username=AzureDiamond | ||||||
|  | password=hunter2 | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | ### The `keyring` authenticator | ||||||
|  |  | ||||||
|  | This authenticator uses the system keyring to store passwords. The username can | ||||||
|  | be set directly in the config file. If the username is not specified, the user | ||||||
|  | is prompted via the terminal. If the keyring contains no entry or the entry is | ||||||
|  | incorrect, the user is prompted for a password via the terminal and the password | ||||||
|  | is stored in the keyring. | ||||||
|  |  | ||||||
|  | - `username`: The username. (Optional) | ||||||
|  | - `keyring_name`: The service name PFERD uses for storing credentials. (Default: | ||||||
|  |   `PFERD`) | ||||||
|  |  | ||||||
|  | ### The `pass` authenticator | ||||||
|  |  | ||||||
|  | This authenticator queries the [`pass` password manager][pass] for a username | ||||||
|  | and password. It tries to be mostly compatible with [browserpass][browserpass] | ||||||
|  | and [passff][passff], so see those links for an overview of the format. If PFERD | ||||||
|  | fails to load your password, you can use the `--explain` flag to see why. | ||||||
|  |  | ||||||
|  | - `passname`: The name of the password to use (Required) | ||||||
|  | - `username_prefixes`: A comma-separated list of username line prefixes | ||||||
|  |   (Default: `login,username,user`) | ||||||
|  | - `password_prefixes`: A comma-separated list of password line prefixes | ||||||
|  |   (Default: `password,pass,secret`) | ||||||
|  |  | ||||||
|  | [pass]: <https://www.passwordstore.org/> "Pass: The Standard Unix Password Manager" | ||||||
|  | [browserpass]: <https://github.com/browserpass/browserpass-extension#organizing-password-store> "Organizing password store" | ||||||
|  | [passff]: <https://github.com/passff/passff#multi-line-format> "Multi-line format" | ||||||
|  |  | ||||||
|  | ### The `tfa` authenticator | ||||||
|  |  | ||||||
|  | This authenticator prompts the user on the console for a two-factor | ||||||
|  | authentication token. The token is provided as password and it is not cached. | ||||||
|  | This authenticator does not support usernames. | ||||||
|  |  | ||||||
|  | ## Transformation rules | ||||||
|  |  | ||||||
|  | Transformation rules are rules for renaming and excluding files and directories. | ||||||
|  | They are specified line-by-line in a crawler's `transform` option. When a | ||||||
|  | crawler needs to apply a rule to a path, it goes through this list top-to-bottom | ||||||
|  | and applies the first matching rule. | ||||||
|  |  | ||||||
|  | To see this process in action, you can use the `--debug-transforms` flag or | ||||||
|  | the `--explain` flag. | ||||||
|  |  | ||||||
|  | Each rule has the format `SOURCE ARROW TARGET` (e. g. `foo/bar --> foo/baz`). | ||||||
|  | The arrow specifies how the source and target are interpreted. The different | ||||||
|  | kinds of arrows are documented below. | ||||||
|  |  | ||||||
|  | `SOURCE` and `TARGET` are either a bunch of characters without spaces (e. g. | ||||||
|  | `foo/bar`) or string literals (e. g. `"foo/b a r"`). The former syntax has no | ||||||
|  | concept of escaping characters, so the backslash is just another character. The | ||||||
|  | string literals however support Python's escape syntax (e. g. | ||||||
|  | `"foo\\bar\tbaz"`). This also means that in string literals, backslashes must be | ||||||
|  | escaped. | ||||||
|  |  | ||||||
|  | `TARGET` can additionally be a single exclamation mark `!` (*not* `"!"`). When a | ||||||
|  | rule with a `!` as target matches a path, the corresponding file or directory is | ||||||
|  | ignored by the crawler instead of renamed. | ||||||
|  |  | ||||||
|  | `TARGET` can also be omitted entirely. When a rule without target matches a | ||||||
|  | path, the path is returned unmodified. This is useful to prevent rules further | ||||||
|  | down from matching instead. | ||||||
|  |  | ||||||
|  | Each arrow's behaviour can be modified slightly by changing the arrow's head | ||||||
|  | from `>` to `>>`. When a rule with a `>>` arrow head matches a path, it doesn't | ||||||
|  | return immediately like a normal arrow. Instead, it replaces the current path | ||||||
|  | with its output and continues on to the next rule. In effect, this means that | ||||||
|  | multiple rules can be applied sequentially. | ||||||
|  |  | ||||||
|  | ### The `-->` arrow | ||||||
|  |  | ||||||
|  | The `-->` arrow is a basic renaming operation for files and directories. If a | ||||||
|  | path matches `SOURCE`, it is renamed to `TARGET`. | ||||||
|  |  | ||||||
|  | Example: `foo/bar --> baz` | ||||||
|  | - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||||
|  | - Converts `foo/bar` into `baz` | ||||||
|  | - Converts `foo/bar/wargl` into `baz/wargl` | ||||||
|  |  | ||||||
|  | Example: `foo/bar --> !` | ||||||
|  | - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||||
|  | - Ignores `foo/bar` and any of its children | ||||||
|  |  | ||||||
|  | ### The `-name->` arrow | ||||||
|  |  | ||||||
|  | The `-name->` arrow lets you rename files and directories by their name, | ||||||
|  | regardless of where they appear in the file tree. Because of this, its `SOURCE` | ||||||
|  | must not contain multiple path segments, only a single name. This restriction | ||||||
|  | does not apply to its `TARGET`. | ||||||
|  |  | ||||||
|  | Example: `foo -name-> bar/baz` | ||||||
|  | - Doesn't match `a/foobar/b` or `x/Foo/y/z` | ||||||
|  | - Converts `hello/foo` into `hello/bar/baz` | ||||||
|  | - Converts `foo/world` into `bar/baz/world` | ||||||
|  | - Converts `a/foo/b/c/foo` into `a/bar/baz/b/c/bar/baz` | ||||||
|  |  | ||||||
|  | Example: `foo -name-> !` | ||||||
|  | - Doesn't match `a/foobar/b` or `x/Foo/y/z` | ||||||
|  | - Ignores any path containing a segment `foo` | ||||||
|  |  | ||||||
|  | ### The `-exact->` arrow | ||||||
|  |  | ||||||
|  | The `-exact->` arrow requires the path to match `SOURCE` exactly. The examples | ||||||
|  | below show why this is useful. | ||||||
|  |  | ||||||
|  | Example: `foo/bar -exact-> baz` | ||||||
|  | - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||||
|  | - Converts `foo/bar` into `baz` | ||||||
|  | - Doesn't match `foo/bar/wargl` | ||||||
|  |  | ||||||
|  | Example: `foo/bar -exact-> !` | ||||||
|  | - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||||
|  | - Ignores only `foo/bar`, not its children | ||||||
|  |  | ||||||
|  | ### The `-re->` arrow | ||||||
|  |  | ||||||
|  | The `-re->` arrow is like the `-->` arrow but with regular expressions. `SOURCE` | ||||||
|  | is a regular expression and `TARGET` an f-string based template. If a path | ||||||
|  | matches `SOURCE`, the output path is created using `TARGET` as template. | ||||||
|  | `SOURCE` is automatically anchored. | ||||||
|  |  | ||||||
|  | `TARGET` uses Python's [format string syntax][6]. The *n*-th capturing group can | ||||||
|  | be referred to as `{g<n>}` (e.g. `{g3}`). `{g0}` refers to the original path. | ||||||
|  | If capturing group *n*'s contents are a valid integer, the integer value is | ||||||
|  | available as `{i<n>}` (e.g. `{i3}`). If capturing group *n*'s contents are a | ||||||
|  | valid float, the float value is available as `{f<n>}` (e.g. `{f3}`). Named capture | ||||||
|  | groups (e.g. `(?P<name>)`) are available by their name (e.g. `{name}`). If a | ||||||
|  | capturing group is not present (e.g. when matching the string `cd` with the | ||||||
|  | regex `(ab)?cd`), the corresponding variables are not defined. | ||||||
|  |  | ||||||
|  | Python's format string syntax has rich options for formatting its arguments. For | ||||||
|  | example, to left-pad the capturing group 3 with the digit `0` to width 5, you | ||||||
|  | can use `{i3:05}`. | ||||||
|  |  | ||||||
|  | PFERD even allows you to write entire expressions inside the curly braces, for | ||||||
|  | example `{g2.lower()}` or `{g3.replace(' ', '_')}`. | ||||||
|  |  | ||||||
|  | Example: `f(oo+)/be?ar -re-> B{g1.upper()}H/fear` | ||||||
|  | - Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` | ||||||
|  | - Converts `foo/bar` into `BOOH/fear` | ||||||
|  | - Converts `fooooo/bear` into `BOOOOOH/fear` | ||||||
|  | - Converts `foo/bar/baz` into `BOOH/fear/baz` | ||||||
|  |  | ||||||
|  | [6]: <https://docs.python.org/3/library/string.html#format-string-syntax> "Format String Syntax" | ||||||
|  |  | ||||||
|  | ### The `-name-re->` arrow | ||||||
|  |  | ||||||
|  | The `-name-re->` arrow is like a combination of the `-name->` and `-re->` arrows. | ||||||
|  |  | ||||||
|  | Example: `(.*)\.jpeg -name-re-> {g1}.jpg` | ||||||
|  | - Doesn't match `foo/bar.png`, `baz.JPEG` or `hello,jpeg` | ||||||
|  | - Converts `foo/bar.jpeg` into `foo/bar.jpg` | ||||||
|  | - Converts `foo.jpeg/bar/baz.jpeg` into `foo.jpg/bar/baz.jpg` | ||||||
|  |  | ||||||
|  | Example: `\..+ -name-re-> !` | ||||||
|  | - Doesn't match `.`, `test`, `a.b` | ||||||
|  | - Ignores all files and directories starting with `.`. | ||||||
|  |  | ||||||
|  | ### The `-exact-re->` arrow | ||||||
|  |  | ||||||
|  | The `-exact-re->` arrow is like a combination of the `-exact->` and `-re->` | ||||||
|  | arrows. | ||||||
|  |  | ||||||
|  | Example: `f(oo+)/be?ar -exact-re-> B{g1.upper()}H/fear` | ||||||
|  | - Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` | ||||||
|  | - Converts `foo/bar` into `BOOH/fear` | ||||||
|  | - Converts `fooooo/bear` into `BOOOOOH/fear` | ||||||
|  | - Doesn't match `foo/bar/baz` | ||||||
|  |  | ||||||
|  | ### Example: Tutorials | ||||||
|  |  | ||||||
|  | You have an ILIAS course with lots of tutorials, but are only interested in a | ||||||
|  | single one. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | tutorials/ | ||||||
|  |   |- tut_01/ | ||||||
|  |   |- tut_02/ | ||||||
|  |   |- tut_03/ | ||||||
|  |   ... | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | You can use a mix of normal and exact arrows to get rid of the other ones and | ||||||
|  | move the `tutorials/tut_02/` folder to `my_tut/`: | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | tutorials/tut_02 --> my_tut | ||||||
|  | tutorials -exact-> | ||||||
|  | tutorials --> ! | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | The second rule is required for many crawlers since they use the rules to decide | ||||||
|  | which directories to crawl. If it was missing when the crawler looks at | ||||||
|  | `tutorials/`, the third rule would match. This means the crawler would not crawl | ||||||
|  | the `tutorials/` directory and thus not discover that `tutorials/tut_02/` exists. | ||||||
|  |  | ||||||
|  | Since the second rule is only relevant for crawling, the `TARGET` is left out. | ||||||
|  |  | ||||||
|  | ### Example: Lecture slides | ||||||
|  |  | ||||||
|  | You have a course with slides like `Lecture 3: Linear functions.PDF` and you | ||||||
|  | would like to rename them to `03_linear_functions.pdf`. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | Lectures/ | ||||||
|  |   |- Lecture 1: Introduction.PDF | ||||||
|  |   |- Lecture 2: Vectors and matrices.PDF | ||||||
|  |   |- Lecture 3: Linear functions.PDF | ||||||
|  |   ... | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | To do this, you can use the most powerful of arrows: The regex arrow. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | "Lectures/Lecture (\\d+): (.*)\\.PDF" -re-> "Lectures/{i1:02}_{g2.lower().replace(' ', '_')}.pdf" | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | Note the escaped backslashes on the `SOURCE` side. | ||||||
|  |  | ||||||
|  | ### Example: Crawl a Python project | ||||||
|  |  | ||||||
|  | You are crawling a Python project and want to ignore all hidden files (files | ||||||
|  | whose name starts with a `.`), all `__pycache__` directories and all markdown | ||||||
|  | files (for some weird reason). | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | .gitignore | ||||||
|  | .mypy_cache/ | ||||||
|  | .venv/ | ||||||
|  | CONFIG.md | ||||||
|  | PFERD/ | ||||||
|  |   |- __init__.py | ||||||
|  |   |- __main__.py | ||||||
|  |   |- __pycache__/ | ||||||
|  |   |- authenticator.py | ||||||
|  |   |- config.py | ||||||
|  |   ... | ||||||
|  | README.md | ||||||
|  | ... | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | For this task, the name arrows can be used. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | \..*        -name-re-> ! | ||||||
|  | __pycache__ -name->    ! | ||||||
|  | .*\.md      -name-re-> ! | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | ### Example: Clean up names | ||||||
|  |  | ||||||
|  | You want to convert all paths into lowercase and replace spaces with underscores | ||||||
|  | before applying any rules. This can be achieved using the `>>` arrow heads. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | (.*) -re->> "{g1.lower().replace(' ', '_')}" | ||||||
|  |  | ||||||
|  | <other rules go here> | ||||||
|  | ``` | ||||||
							
								
								
									
										89
									
								
								DEV.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										89
									
								
								DEV.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,89 @@ | |||||||
|  | # PFERD Development Guide | ||||||
|  |  | ||||||
|  | PFERD is packaged following the [Python Packaging User Guide][ppug] (in | ||||||
|  | particular [this][ppug-1] and [this][ppug-2] guide). | ||||||
|  |  | ||||||
|  | [ppug]: <https://packaging.python.org/> "Python Packaging User Guide" | ||||||
|  | [ppug-1]: <https://packaging.python.org/tutorials/packaging-projects/> "Packaging Python Projects" | ||||||
|  | [ppug-2]: <https://packaging.python.org/guides/distributing-packages-using-setuptools/> "Packaging and distributing projects" | ||||||
|  |  | ||||||
|  | ## Setting up a dev environment | ||||||
|  |  | ||||||
|  | The use of [venv][venv] is recommended. To initially set up a development | ||||||
|  | environment, run these commands in the same directory as this file: | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | $ python -m venv .venv | ||||||
|  | $ . .venv/bin/activate | ||||||
|  | $ ./scripts/setup | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | The setup script installs a few required dependencies and tools. It also | ||||||
|  | installs PFERD via `pip install --editable .`, which means that you can just run | ||||||
|  | `pferd` as if it was installed normally. Since PFERD was installed with | ||||||
|  | `--editable`, there is no need to re-run `pip install` when the source code is | ||||||
|  | changed. | ||||||
|  |  | ||||||
|  | If you get any errors because pip can't update itself, try running | ||||||
|  | `./scripts/setup --no-pip` instead of `./scripts/setup`. | ||||||
|  |  | ||||||
|  | For more details, see [this part of the Python Tutorial][venv-tut] and | ||||||
|  | [this section on "development mode"][ppug-dev]. | ||||||
|  |  | ||||||
|  | [venv]: <https://docs.python.org/3/library/venv.html> "venv - Creation of virtual environments" | ||||||
|  | [venv-tut]: <https://docs.python.org/3/tutorial/venv.html> "12. Virtual Environments and Packages" | ||||||
|  | [ppug-dev]: <https://packaging.python.org/guides/distributing-packages-using-setuptools/#working-in-development-mode> "Working in “development mode”" | ||||||
|  |  | ||||||
|  | ## Checking and formatting the code | ||||||
|  |  | ||||||
|  | To run a set of checks against the code, run `./scripts/check` in the repo's | ||||||
|  | root directory. This script will run a few tools installed by `./scripts/setup` | ||||||
|  | against the entire project. | ||||||
|  |  | ||||||
|  | To format the code, run `./scripts/format` in the repo's root directory. | ||||||
|  |  | ||||||
|  | Before committing changes, please make sure the checks return no warnings and | ||||||
|  | the code is formatted. | ||||||
|  |  | ||||||
|  | ## Contributing | ||||||
|  |  | ||||||
|  | When submitting a PR that adds, changes or removes a feature, please ensure | ||||||
|  | that the corresponding documentation is updated as well. Also, please ensure | ||||||
|  | that `./scripts/check` returns no warnings and the code has been run through | ||||||
|  | `./scripts/format`. | ||||||
|  |  | ||||||
|  | In your first PR, please add your name to the `LICENSE` file. | ||||||
|  |  | ||||||
|  | ## Releasing a new version | ||||||
|  |  | ||||||
|  | This section describes the steps required to release a new version of PFERD. | ||||||
|  | Usually, they don't need to be performed manually and `scripts/bump-version` can be | ||||||
|  | used instead. | ||||||
|  |  | ||||||
|  | 1. Update the version number in `PFERD/version.py` | ||||||
|  | 2. Update `CHANGELOG.md` | ||||||
|  | 3. Commit changes to `master` with message `Bump version to <version>` (e. g. `Bump version to 3.2.5`) | ||||||
|  | 4. Create annotated tag named `v<version>` (e. g. `v3.2.5`) | ||||||
|  |     - Copy changes from changelog | ||||||
|  |     - Remove `#` symbols (which git would interpret as comments) | ||||||
|  |     - As the first line, add `Version <version> - <date>` (e. g. `Version 3.2.5 - 2021-05-24`) | ||||||
|  |     - Leave the second line empty | ||||||
|  | 5. Fast-forward `latest` to `master` | ||||||
|  | 6. Push `master`, `latest` and the new tag | ||||||
|  |  | ||||||
|  | Example tag annotation: | ||||||
|  | ``` | ||||||
|  | Version 3.2.5 - 2021-05-24 | ||||||
|  |  | ||||||
|  | Added | ||||||
|  | - Support for concurrent downloads | ||||||
|  | - Support for proper config files | ||||||
|  | - This changelog | ||||||
|  |  | ||||||
|  | Changed | ||||||
|  | - Rewrote almost everything | ||||||
|  | - Redesigned CLI | ||||||
|  |  | ||||||
|  | Removed | ||||||
|  | - Backwards compatibility with 2.x | ||||||
|  | ``` | ||||||
							
								
								
									
										4
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								LICENSE
									
									
									
									
									
								
							| @@ -1,4 +1,6 @@ | |||||||
| Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw | Copyright 2019-2024 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, | ||||||
|  |                     TheChristophe, Scriptim, thelukasprobst, Toorero, | ||||||
|  |                     Mr-Pine, p-fruck, PinieP | ||||||
|  |  | ||||||
| Permission is hereby granted, free of charge, to any person obtaining a copy of | Permission is hereby granted, free of charge, to any person obtaining a copy of | ||||||
| this software and associated documentation files (the "Software"), to deal in | this software and associated documentation files (the "Software"), to deal in | ||||||
|   | |||||||
| @@ -1,8 +0,0 @@ | |||||||
| # pylint: disable=invalid-name |  | ||||||
|  |  | ||||||
| """ |  | ||||||
| This module exports only what you need for a basic configuration. If you want a |  | ||||||
| more complex configuration, you need to import the other submodules manually. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| from .pferd import Pferd |  | ||||||
|   | |||||||
							
								
								
									
										169
									
								
								PFERD/__main__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										169
									
								
								PFERD/__main__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,169 @@ | |||||||
|  | import argparse | ||||||
|  | import asyncio | ||||||
|  | import configparser | ||||||
|  | import os | ||||||
|  | import sys | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | from .auth import AuthLoadError | ||||||
|  | from .cli import PARSER, ParserLoadError, load_default_section | ||||||
|  | from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError | ||||||
|  | from .logging import log | ||||||
|  | from .pferd import Pferd, PferdLoadError | ||||||
|  | from .transformer import RuleParseError | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser: | ||||||
|  |     log.explain_topic("Loading config") | ||||||
|  |     parser = configparser.ConfigParser(interpolation=None) | ||||||
|  |  | ||||||
|  |     if args.command is None: | ||||||
|  |         log.explain("No CLI command specified, loading config from file") | ||||||
|  |         Config.load_parser(parser, path=args.config) | ||||||
|  |     else: | ||||||
|  |         log.explain("CLI command specified, loading config from its arguments") | ||||||
|  |         if args.command: | ||||||
|  |             args.command(args, parser) | ||||||
|  |  | ||||||
|  |     load_default_section(args, parser) | ||||||
|  |  | ||||||
|  |     return parser | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def load_config(args: argparse.Namespace) -> Config: | ||||||
|  |     try: | ||||||
|  |         return Config(load_config_parser(args)) | ||||||
|  |     except ConfigLoadError as e: | ||||||
|  |         log.error(str(e)) | ||||||
|  |         log.error_contd(e.reason) | ||||||
|  |         sys.exit(1) | ||||||
|  |     except ParserLoadError as e: | ||||||
|  |         log.error(str(e)) | ||||||
|  |         sys.exit(1) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def configure_logging_from_args(args: argparse.Namespace) -> None: | ||||||
|  |     if args.explain is not None: | ||||||
|  |         log.output_explain = args.explain | ||||||
|  |     if args.status is not None: | ||||||
|  |         log.output_status = args.status | ||||||
|  |     if args.show_not_deleted is not None: | ||||||
|  |         log.output_not_deleted = args.show_not_deleted | ||||||
|  |     if args.report is not None: | ||||||
|  |         log.output_report = args.report | ||||||
|  |  | ||||||
|  |     # We want to prevent any unnecessary output if we're printing the config to | ||||||
|  |     # stdout, otherwise it would not be a valid config file. | ||||||
|  |     if args.dump_config_to == "-": | ||||||
|  |         log.output_explain = False | ||||||
|  |         log.output_status = False | ||||||
|  |         log.output_report = False | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def configure_logging_from_config(args: argparse.Namespace, config: Config) -> None: | ||||||
|  |     # In configure_logging_from_args(), all normal logging is already disabled | ||||||
|  |     # whenever we dump the config. We don't want to override that decision with | ||||||
|  |     # values from the config file. | ||||||
|  |     if args.dump_config_to == "-": | ||||||
|  |         return | ||||||
|  |  | ||||||
|  |     try: | ||||||
|  |         if args.explain is None: | ||||||
|  |             log.output_explain = config.default_section.explain() | ||||||
|  |         if args.status is None: | ||||||
|  |             log.output_status = config.default_section.status() | ||||||
|  |         if args.report is None: | ||||||
|  |             log.output_report = config.default_section.report() | ||||||
|  |         if args.show_not_deleted is None: | ||||||
|  |             log.output_not_deleted = config.default_section.show_not_deleted() | ||||||
|  |     except ConfigOptionError as e: | ||||||
|  |         log.error(str(e)) | ||||||
|  |         sys.exit(1) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def dump_config(args: argparse.Namespace, config: Config) -> None: | ||||||
|  |     log.explain_topic("Dumping config") | ||||||
|  |  | ||||||
|  |     if args.dump_config and args.dump_config_to is not None: | ||||||
|  |         log.error("--dump-config and --dump-config-to can't be specified at the same time") | ||||||
|  |         sys.exit(1) | ||||||
|  |  | ||||||
|  |     try: | ||||||
|  |         if args.dump_config: | ||||||
|  |             config.dump() | ||||||
|  |         elif args.dump_config_to == "-": | ||||||
|  |             config.dump_to_stdout() | ||||||
|  |         else: | ||||||
|  |             config.dump(Path(args.dump_config_to)) | ||||||
|  |     except ConfigDumpError as e: | ||||||
|  |         log.error(str(e)) | ||||||
|  |         log.error_contd(e.reason) | ||||||
|  |         sys.exit(1) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def main() -> None: | ||||||
|  |     args = PARSER.parse_args() | ||||||
|  |  | ||||||
|  |     # Configuring logging happens in two stages because CLI args have | ||||||
|  |     # precedence over config file options and loading the config already | ||||||
|  |     # produces some kinds of log messages (usually only explain()-s). | ||||||
|  |     configure_logging_from_args(args) | ||||||
|  |  | ||||||
|  |     config = load_config(args) | ||||||
|  |  | ||||||
|  |     # Now, after loading the config file, we can apply its logging settings in | ||||||
|  |     # all places that were not already covered by CLI args. | ||||||
|  |     configure_logging_from_config(args, config) | ||||||
|  |  | ||||||
|  |     if args.dump_config or args.dump_config_to is not None: | ||||||
|  |         dump_config(args, config) | ||||||
|  |         sys.exit() | ||||||
|  |  | ||||||
|  |     try: | ||||||
|  |         pferd = Pferd(config, args.crawler, args.skip) | ||||||
|  |     except PferdLoadError as e: | ||||||
|  |         log.unlock() | ||||||
|  |         log.error(str(e)) | ||||||
|  |         sys.exit(1) | ||||||
|  |  | ||||||
|  |     try: | ||||||
|  |         if os.name == "nt": | ||||||
|  |             # A "workaround" for the windows event loop somehow crashing after | ||||||
|  |             # asyncio.run() completes. See: | ||||||
|  |             # https://bugs.python.org/issue39232 | ||||||
|  |             # https://github.com/encode/httpx/issues/914#issuecomment-780023632 | ||||||
|  |             # TODO Fix this properly | ||||||
|  |             loop = asyncio.get_event_loop() | ||||||
|  |             loop.run_until_complete(pferd.run(args.debug_transforms)) | ||||||
|  |             loop.run_until_complete(asyncio.sleep(1)) | ||||||
|  |             loop.close() | ||||||
|  |         else: | ||||||
|  |             asyncio.run(pferd.run(args.debug_transforms)) | ||||||
|  |     except (ConfigOptionError, AuthLoadError) as e: | ||||||
|  |         log.unlock() | ||||||
|  |         log.error(str(e)) | ||||||
|  |         sys.exit(1) | ||||||
|  |     except RuleParseError as e: | ||||||
|  |         log.unlock() | ||||||
|  |         e.pretty_print() | ||||||
|  |         sys.exit(1) | ||||||
|  |     except KeyboardInterrupt: | ||||||
|  |         log.unlock() | ||||||
|  |         log.explain_topic("Interrupted, exiting immediately") | ||||||
|  |         log.explain("Open files and connections are left for the OS to clean up") | ||||||
|  |         pferd.print_report() | ||||||
|  |         # TODO Clean up tmp files | ||||||
|  |         # And when those files *do* actually get cleaned up properly, | ||||||
|  |         # reconsider if this should really exit with 1 | ||||||
|  |         sys.exit(1) | ||||||
|  |     except Exception: | ||||||
|  |         log.unlock() | ||||||
|  |         log.unexpected_exception() | ||||||
|  |         pferd.print_report() | ||||||
|  |         sys.exit(1) | ||||||
|  |     else: | ||||||
|  |         pferd.print_report() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | if __name__ == "__main__": | ||||||
|  |     main() | ||||||
							
								
								
									
										29
									
								
								PFERD/auth/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								PFERD/auth/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | |||||||
|  | from configparser import SectionProxy | ||||||
|  | from typing import Callable, Dict | ||||||
|  |  | ||||||
|  | from ..config import Config | ||||||
|  | from .authenticator import Authenticator, AuthError, AuthLoadError, AuthSection  # noqa: F401 | ||||||
|  | from .credential_file import CredentialFileAuthenticator, CredentialFileAuthSection | ||||||
|  | from .keyring import KeyringAuthenticator, KeyringAuthSection | ||||||
|  | from .pass_ import PassAuthenticator, PassAuthSection | ||||||
|  | from .simple import SimpleAuthenticator, SimpleAuthSection | ||||||
|  | from .tfa import TfaAuthenticator | ||||||
|  |  | ||||||
|  | AuthConstructor = Callable[[ | ||||||
|  |     str,                # Name (without the "auth:" prefix) | ||||||
|  |     SectionProxy,       # Authenticator's section of global config | ||||||
|  |     Config,             # Global config | ||||||
|  | ], Authenticator] | ||||||
|  |  | ||||||
|  | AUTHENTICATORS: Dict[str, AuthConstructor] = { | ||||||
|  |     "credential-file": lambda n, s, c: | ||||||
|  |         CredentialFileAuthenticator(n, CredentialFileAuthSection(s), c), | ||||||
|  |     "keyring": lambda n, s, c: | ||||||
|  |         KeyringAuthenticator(n, KeyringAuthSection(s)), | ||||||
|  |     "pass": lambda n, s, c: | ||||||
|  |         PassAuthenticator(n, PassAuthSection(s)), | ||||||
|  |     "simple": lambda n, s, c: | ||||||
|  |         SimpleAuthenticator(n, SimpleAuthSection(s)), | ||||||
|  |     "tfa": lambda n, s, c: | ||||||
|  |         TfaAuthenticator(n), | ||||||
|  | } | ||||||
							
								
								
									
										80
									
								
								PFERD/auth/authenticator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								PFERD/auth/authenticator.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | |||||||
|  | from abc import ABC, abstractmethod | ||||||
|  | from typing import Tuple | ||||||
|  |  | ||||||
|  | from ..config import Section | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class AuthLoadError(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class AuthError(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class AuthSection(Section): | ||||||
|  |     def type(self) -> str: | ||||||
|  |         value = self.s.get("type") | ||||||
|  |         if value is None: | ||||||
|  |             self.missing_value("type") | ||||||
|  |         return value | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class Authenticator(ABC): | ||||||
|  |     def __init__(self, name: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Initialize an authenticator from its name and its section in the config | ||||||
|  |         file. | ||||||
|  |  | ||||||
|  |         If you are writing your own constructor for your own authenticator, | ||||||
|  |         make sure to call this constructor first (via super().__init__). | ||||||
|  |  | ||||||
|  |         May throw an AuthLoadError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.name = name | ||||||
|  |  | ||||||
|  |     @abstractmethod | ||||||
|  |     async def credentials(self) -> Tuple[str, str]: | ||||||
|  |         pass | ||||||
|  |  | ||||||
|  |     async def username(self) -> str: | ||||||
|  |         username, _ = await self.credentials() | ||||||
|  |         return username | ||||||
|  |  | ||||||
|  |     async def password(self) -> str: | ||||||
|  |         _, password = await self.credentials() | ||||||
|  |         return password | ||||||
|  |  | ||||||
|  |     def invalidate_credentials(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Tell the authenticator that some or all of its credentials are invalid. | ||||||
|  |  | ||||||
|  |         Authenticators should override this function if they have a way to | ||||||
|  |         deal with this issue that is likely to result in valid credentials | ||||||
|  |         (e. g. prompting the user). | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         raise AuthError("Invalid credentials") | ||||||
|  |  | ||||||
|  |     def invalidate_username(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Tell the authenticator that specifically its username is invalid. | ||||||
|  |  | ||||||
|  |         Authenticators should override this function if they have a way to | ||||||
|  |         deal with this issue that is likely to result in valid credentials | ||||||
|  |         (e. g. prompting the user). | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         raise AuthError("Invalid username") | ||||||
|  |  | ||||||
|  |     def invalidate_password(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Tell the authenticator that specifically its password is invalid. | ||||||
|  |  | ||||||
|  |         Authenticators should override this function if they have a way to | ||||||
|  |         deal with this issue that is likely to result in valid credentials | ||||||
|  |         (e. g. prompting the user). | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         raise AuthError("Invalid password") | ||||||
							
								
								
									
										46
									
								
								PFERD/auth/credential_file.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								PFERD/auth/credential_file.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,46 @@ | |||||||
|  | from pathlib import Path | ||||||
|  | from typing import Tuple | ||||||
|  |  | ||||||
|  | from ..config import Config | ||||||
|  | from ..utils import fmt_real_path | ||||||
|  | from .authenticator import Authenticator, AuthLoadError, AuthSection | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class CredentialFileAuthSection(AuthSection): | ||||||
|  |     def path(self) -> Path: | ||||||
|  |         value = self.s.get("path") | ||||||
|  |         if value is None: | ||||||
|  |             self.missing_value("path") | ||||||
|  |         return Path(value) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class CredentialFileAuthenticator(Authenticator): | ||||||
|  |     def __init__(self, name: str, section: CredentialFileAuthSection, config: Config) -> None: | ||||||
|  |         super().__init__(name) | ||||||
|  |  | ||||||
|  |         path = config.default_section.working_dir() / section.path() | ||||||
|  |         try: | ||||||
|  |             with open(path, encoding="utf-8") as f: | ||||||
|  |                 lines = list(f) | ||||||
|  |         except UnicodeDecodeError: | ||||||
|  |             raise AuthLoadError(f"Credential file at {fmt_real_path(path)} is not encoded using UTF-8") | ||||||
|  |         except OSError as e: | ||||||
|  |             raise AuthLoadError(f"No credential file at {fmt_real_path(path)}") from e | ||||||
|  |  | ||||||
|  |         if len(lines) != 2: | ||||||
|  |             raise AuthLoadError("Credential file must be two lines long") | ||||||
|  |         [uline, pline] = lines | ||||||
|  |         uline = uline[:-1]  # Remove trailing newline | ||||||
|  |         if pline.endswith("\n"): | ||||||
|  |             pline = pline[:-1] | ||||||
|  |  | ||||||
|  |         if not uline.startswith("username="): | ||||||
|  |             raise AuthLoadError("First line must start with 'username='") | ||||||
|  |         if not pline.startswith("password="): | ||||||
|  |             raise AuthLoadError("Second line must start with 'password='") | ||||||
|  |  | ||||||
|  |         self._username = uline[9:] | ||||||
|  |         self._password = pline[9:] | ||||||
|  |  | ||||||
|  |     async def credentials(self) -> Tuple[str, str]: | ||||||
|  |         return self._username, self._password | ||||||
							
								
								
									
										65
									
								
								PFERD/auth/keyring.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										65
									
								
								PFERD/auth/keyring.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,65 @@ | |||||||
|  | from typing import Optional, Tuple, cast | ||||||
|  |  | ||||||
|  | import keyring | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from ..utils import agetpass, ainput | ||||||
|  | from ..version import NAME | ||||||
|  | from .authenticator import Authenticator, AuthError, AuthSection | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class KeyringAuthSection(AuthSection): | ||||||
|  |     def username(self) -> Optional[str]: | ||||||
|  |         return self.s.get("username") | ||||||
|  |  | ||||||
|  |     def keyring_name(self) -> str: | ||||||
|  |         return cast(str, self.s.get("keyring_name", fallback=NAME)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class KeyringAuthenticator(Authenticator): | ||||||
|  |  | ||||||
|  |     def __init__(self, name: str, section: KeyringAuthSection) -> None: | ||||||
|  |         super().__init__(name) | ||||||
|  |  | ||||||
|  |         self._username = section.username() | ||||||
|  |         self._password: Optional[str] = None | ||||||
|  |         self._keyring_name = section.keyring_name() | ||||||
|  |  | ||||||
|  |         self._password_invalidated = False | ||||||
|  |         self._username_fixed = section.username() is not None | ||||||
|  |  | ||||||
|  |     async def credentials(self) -> Tuple[str, str]: | ||||||
|  |         # Request the username | ||||||
|  |         if self._username is None: | ||||||
|  |             async with log.exclusive_output(): | ||||||
|  |                 self._username = await ainput("Username: ") | ||||||
|  |  | ||||||
|  |         # First try looking it up in the keyring. | ||||||
|  |         # Do not look it up if it was invalidated - we want to re-prompt in this case | ||||||
|  |         if self._password is None and not self._password_invalidated: | ||||||
|  |             self._password = keyring.get_password(self._keyring_name, self._username) | ||||||
|  |  | ||||||
|  |         # If that fails it wasn't saved in the keyring - we need to | ||||||
|  |         # read it from the user and store it | ||||||
|  |         if self._password is None: | ||||||
|  |             async with log.exclusive_output(): | ||||||
|  |                 self._password = await agetpass("Password: ") | ||||||
|  |                 keyring.set_password(self._keyring_name, self._username, self._password) | ||||||
|  |  | ||||||
|  |         self._password_invalidated = False | ||||||
|  |         return self._username, self._password | ||||||
|  |  | ||||||
|  |     def invalidate_credentials(self) -> None: | ||||||
|  |         if not self._username_fixed: | ||||||
|  |             self.invalidate_username() | ||||||
|  |         self.invalidate_password() | ||||||
|  |  | ||||||
|  |     def invalidate_username(self) -> None: | ||||||
|  |         if self._username_fixed: | ||||||
|  |             raise AuthError("Configured username is invalid") | ||||||
|  |         else: | ||||||
|  |             self._username = None | ||||||
|  |  | ||||||
|  |     def invalidate_password(self) -> None: | ||||||
|  |         self._password = None | ||||||
|  |         self._password_invalidated = True | ||||||
							
								
								
									
										98
									
								
								PFERD/auth/pass_.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								PFERD/auth/pass_.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,98 @@ | |||||||
|  | import re | ||||||
|  | import subprocess | ||||||
|  | from typing import List, Tuple | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from .authenticator import Authenticator, AuthError, AuthSection | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class PassAuthSection(AuthSection): | ||||||
|  |     def passname(self) -> str: | ||||||
|  |         if (value := self.s.get("passname")) is None: | ||||||
|  |             self.missing_value("passname") | ||||||
|  |         return value | ||||||
|  |  | ||||||
|  |     def username_prefixes(self) -> List[str]: | ||||||
|  |         value = self.s.get("username_prefixes", "login,username,user") | ||||||
|  |         return [prefix.lower() for prefix in value.split(",")] | ||||||
|  |  | ||||||
|  |     def password_prefixes(self) -> List[str]: | ||||||
|  |         value = self.s.get("password_prefixes", "password,pass,secret") | ||||||
|  |         return [prefix.lower() for prefix in value.split(",")] | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class PassAuthenticator(Authenticator): | ||||||
|  |     PREFIXED_LINE_RE = r"([a-zA-Z]+):\s?(.*)"  # to be used with fullmatch | ||||||
|  |  | ||||||
|  |     def __init__(self, name: str, section: PassAuthSection) -> None: | ||||||
|  |         super().__init__(name) | ||||||
|  |  | ||||||
|  |         self._passname = section.passname() | ||||||
|  |         self._username_prefixes = section.username_prefixes() | ||||||
|  |         self._password_prefixes = section.password_prefixes() | ||||||
|  |  | ||||||
|  |     async def credentials(self) -> Tuple[str, str]: | ||||||
|  |         log.explain_topic("Obtaining credentials from pass") | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             log.explain(f"Calling 'pass show {self._passname}'") | ||||||
|  |             result = subprocess.check_output(["pass", "show", self._passname], text=True) | ||||||
|  |         except subprocess.CalledProcessError as e: | ||||||
|  |             raise AuthError(f"Failed to get password info from {self._passname}: {e}") | ||||||
|  |  | ||||||
|  |         prefixed = {} | ||||||
|  |         unprefixed = [] | ||||||
|  |         for line in result.strip().splitlines(): | ||||||
|  |             if match := re.fullmatch(self.PREFIXED_LINE_RE, line): | ||||||
|  |                 prefix = match.group(1).lower() | ||||||
|  |                 value = match.group(2) | ||||||
|  |                 log.explain(f"Found prefixed line {line!r} with prefix {prefix!r}, value {value!r}") | ||||||
|  |                 if prefix in prefixed: | ||||||
|  |                     raise AuthError(f"Prefix {prefix} specified multiple times") | ||||||
|  |                 prefixed[prefix] = value | ||||||
|  |             else: | ||||||
|  |                 log.explain(f"Found unprefixed line {line!r}") | ||||||
|  |                 unprefixed.append(line) | ||||||
|  |  | ||||||
|  |         username = None | ||||||
|  |         for prefix in self._username_prefixes: | ||||||
|  |             log.explain(f"Looking for username at prefix {prefix!r}") | ||||||
|  |             if prefix in prefixed: | ||||||
|  |                 username = prefixed[prefix] | ||||||
|  |                 log.explain(f"Found username {username!r}") | ||||||
|  |                 break | ||||||
|  |  | ||||||
|  |         password = None | ||||||
|  |         for prefix in self._password_prefixes: | ||||||
|  |             log.explain(f"Looking for password at prefix {prefix!r}") | ||||||
|  |             if prefix in prefixed: | ||||||
|  |                 password = prefixed[prefix] | ||||||
|  |                 log.explain(f"Found password {password!r}") | ||||||
|  |                 break | ||||||
|  |  | ||||||
|  |         if password is None and username is None: | ||||||
|  |             log.explain("No username and password found so far") | ||||||
|  |             log.explain("Using first unprefixed line as password") | ||||||
|  |             log.explain("Using second unprefixed line as username") | ||||||
|  |         elif password is None: | ||||||
|  |             log.explain("No password found so far") | ||||||
|  |             log.explain("Using first unprefixed line as password") | ||||||
|  |         elif username is None: | ||||||
|  |             log.explain("No username found so far") | ||||||
|  |             log.explain("Using first unprefixed line as username") | ||||||
|  |  | ||||||
|  |         if password is None: | ||||||
|  |             if not unprefixed: | ||||||
|  |                 log.explain("Not enough unprefixed lines left") | ||||||
|  |                 raise AuthError("Password could not be determined") | ||||||
|  |             password = unprefixed.pop(0) | ||||||
|  |             log.explain(f"Found password {password!r}") | ||||||
|  |  | ||||||
|  |         if username is None: | ||||||
|  |             if not unprefixed: | ||||||
|  |                 log.explain("Not enough unprefixed lines left") | ||||||
|  |                 raise AuthError("Username could not be determined") | ||||||
|  |             username = unprefixed.pop(0) | ||||||
|  |             log.explain(f"Found username {username!r}") | ||||||
|  |  | ||||||
|  |         return username, password | ||||||
							
								
								
									
										62
									
								
								PFERD/auth/simple.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								PFERD/auth/simple.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,62 @@ | |||||||
|  | from typing import Optional, Tuple | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from ..utils import agetpass, ainput | ||||||
|  | from .authenticator import Authenticator, AuthError, AuthSection | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class SimpleAuthSection(AuthSection): | ||||||
|  |     def username(self) -> Optional[str]: | ||||||
|  |         return self.s.get("username") | ||||||
|  |  | ||||||
|  |     def password(self) -> Optional[str]: | ||||||
|  |         return self.s.get("password") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class SimpleAuthenticator(Authenticator): | ||||||
|  |     def __init__(self, name: str, section: SimpleAuthSection) -> None: | ||||||
|  |         super().__init__(name) | ||||||
|  |  | ||||||
|  |         self._username = section.username() | ||||||
|  |         self._password = section.password() | ||||||
|  |  | ||||||
|  |         self._username_fixed = self.username is not None | ||||||
|  |         self._password_fixed = self.password is not None | ||||||
|  |  | ||||||
|  |     async def credentials(self) -> Tuple[str, str]: | ||||||
|  |         if self._username is not None and self._password is not None: | ||||||
|  |             return self._username, self._password | ||||||
|  |  | ||||||
|  |         async with log.exclusive_output(): | ||||||
|  |             if self._username is None: | ||||||
|  |                 self._username = await ainput("Username: ") | ||||||
|  |             else: | ||||||
|  |                 print(f"Username: {self._username}") | ||||||
|  |  | ||||||
|  |             if self._password is None: | ||||||
|  |                 self._password = await agetpass("Password: ") | ||||||
|  |  | ||||||
|  |             # Intentionally returned inside the context manager so we know | ||||||
|  |             # they're both not None | ||||||
|  |             return self._username, self._password | ||||||
|  |  | ||||||
|  |     def invalidate_credentials(self) -> None: | ||||||
|  |         if self._username_fixed and self._password_fixed: | ||||||
|  |             raise AuthError("Configured credentials are invalid") | ||||||
|  |  | ||||||
|  |         if not self._username_fixed: | ||||||
|  |             self._username = None | ||||||
|  |         if not self._password_fixed: | ||||||
|  |             self._password = None | ||||||
|  |  | ||||||
|  |     def invalidate_username(self) -> None: | ||||||
|  |         if self._username_fixed: | ||||||
|  |             raise AuthError("Configured username is invalid") | ||||||
|  |         else: | ||||||
|  |             self._username = None | ||||||
|  |  | ||||||
|  |     def invalidate_password(self) -> None: | ||||||
|  |         if self._password_fixed: | ||||||
|  |             raise AuthError("Configured password is invalid") | ||||||
|  |         else: | ||||||
|  |             self._password = None | ||||||
							
								
								
									
										30
									
								
								PFERD/auth/tfa.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								PFERD/auth/tfa.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | |||||||
|  | from typing import Tuple | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from ..utils import ainput | ||||||
|  | from .authenticator import Authenticator, AuthError | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TfaAuthenticator(Authenticator): | ||||||
|  |     def __init__(self, name: str) -> None: | ||||||
|  |         super().__init__(name) | ||||||
|  |  | ||||||
|  |     async def username(self) -> str: | ||||||
|  |         raise AuthError("TFA authenticator does not support usernames") | ||||||
|  |  | ||||||
|  |     async def password(self) -> str: | ||||||
|  |         async with log.exclusive_output(): | ||||||
|  |             code = await ainput("TFA code: ") | ||||||
|  |             return code | ||||||
|  |  | ||||||
|  |     async def credentials(self) -> Tuple[str, str]: | ||||||
|  |         raise AuthError("TFA authenticator does not support usernames") | ||||||
|  |  | ||||||
|  |     def invalidate_username(self) -> None: | ||||||
|  |         raise AuthError("TFA authenticator does not support usernames") | ||||||
|  |  | ||||||
|  |     def invalidate_password(self) -> None: | ||||||
|  |         pass | ||||||
|  |  | ||||||
|  |     def invalidate_credentials(self) -> None: | ||||||
|  |         pass | ||||||
| @@ -1,125 +0,0 @@ | |||||||
| """ |  | ||||||
| General authenticators useful in many situations |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import getpass |  | ||||||
| from typing import Optional, Tuple |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TfaAuthenticator: |  | ||||||
|     # pylint: disable=too-few-public-methods |  | ||||||
|     """ |  | ||||||
|     An authenticator for a TFA token. Always prompts the user, as the token can not be cached. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, reason: str): |  | ||||||
|         """ |  | ||||||
|         Create a new tfa authenticator. |  | ||||||
|  |  | ||||||
|         Arguments: |  | ||||||
|             reason {str} -- the reason for obtaining the credentials |  | ||||||
|         """ |  | ||||||
|         self._reason = reason |  | ||||||
|  |  | ||||||
|     def get_token(self) -> str: |  | ||||||
|         # pylint: disable=no-self-use |  | ||||||
|         """ |  | ||||||
|         Prompts the user for the token and returns it. |  | ||||||
|         """ |  | ||||||
|         print(f"Enter credentials ({self._reason})") |  | ||||||
|         return getpass.getpass("TFA Token: ") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class UserPassAuthenticator: |  | ||||||
|     """ |  | ||||||
|     An authenticator for username-password combinations that prompts the user |  | ||||||
|     for missing information. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__( |  | ||||||
|             self, |  | ||||||
|             reason: str, |  | ||||||
|             username: Optional[str] = None, |  | ||||||
|             password: Optional[str] = None, |  | ||||||
|     ) -> None: |  | ||||||
|         """ |  | ||||||
|         reason   - what the credentials are used for |  | ||||||
|         username - the username (if already known) |  | ||||||
|         password - the password (if already known) |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self._reason = reason |  | ||||||
|  |  | ||||||
|         self._given_username = username |  | ||||||
|         self._given_password = password |  | ||||||
|  |  | ||||||
|         self._username = username |  | ||||||
|         self._password = password |  | ||||||
|  |  | ||||||
|     def get_credentials(self) -> Tuple[str, str]: |  | ||||||
|         """ |  | ||||||
|         Returns a tuple (username, password). Prompts user for username or |  | ||||||
|         password when necessary. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         if self._username is None and self._given_username is not None: |  | ||||||
|             self._username = self._given_username |  | ||||||
|  |  | ||||||
|         if self._password is None and self._given_password is not None: |  | ||||||
|             self._password = self._given_password |  | ||||||
|  |  | ||||||
|         if self._username is None or self._password is None: |  | ||||||
|             print(f"Enter credentials ({self._reason})") |  | ||||||
|  |  | ||||||
|         username: str |  | ||||||
|         if self._username is None: |  | ||||||
|             username = input("Username: ") |  | ||||||
|             self._username = username |  | ||||||
|         else: |  | ||||||
|             username = self._username |  | ||||||
|  |  | ||||||
|         password: str |  | ||||||
|         if self._password is None: |  | ||||||
|             password = getpass.getpass(prompt="Password: ") |  | ||||||
|             self._password = password |  | ||||||
|         else: |  | ||||||
|             password = self._password |  | ||||||
|  |  | ||||||
|         return (username, password) |  | ||||||
|  |  | ||||||
|     @property |  | ||||||
|     def username(self) -> str: |  | ||||||
|         """ |  | ||||||
|         The username. Accessing this property may cause the authenticator to |  | ||||||
|         prompt the user. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         (username, _) = self.get_credentials() |  | ||||||
|         return username |  | ||||||
|  |  | ||||||
|     @property |  | ||||||
|     def password(self) -> str: |  | ||||||
|         """ |  | ||||||
|         The password. Accessing this property may cause the authenticator to |  | ||||||
|         prompt the user. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         (_, password) = self.get_credentials() |  | ||||||
|         return password |  | ||||||
|  |  | ||||||
|     def invalidate_credentials(self) -> None: |  | ||||||
|         """ |  | ||||||
|         Marks the credentials as invalid. If only a username was supplied in |  | ||||||
|         the constructor, assumes that the username is valid and only the |  | ||||||
|         password is invalid. If only a password was supplied in the |  | ||||||
|         constructor, assumes that the password is valid and only the username |  | ||||||
|         is invalid. Otherwise, assumes that username and password are both |  | ||||||
|         invalid. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self._username = None |  | ||||||
|         self._password = None |  | ||||||
|  |  | ||||||
|         if self._given_username is not None and self._given_password is not None: |  | ||||||
|             self._given_username = None |  | ||||||
|             self._given_password = None |  | ||||||
							
								
								
									
										14
									
								
								PFERD/cli/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								PFERD/cli/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | |||||||
|  | # isort: skip_file | ||||||
|  |  | ||||||
|  | # The order of imports matters because each command module registers itself | ||||||
|  | # with the parser from ".parser" and the import order affects the order in | ||||||
|  | # which they appear in the help. Because of this, isort is disabled for this | ||||||
|  | # file. Also, since we're reexporting or just using the side effect of | ||||||
|  | # importing itself, we get a few linting warnings, which we're disabling as | ||||||
|  | # well. | ||||||
|  |  | ||||||
|  | from . import command_local  # noqa: F401 imported but unused | ||||||
|  | from . import command_ilias_web  # noqa: F401 imported but unused | ||||||
|  | from . import command_kit_ilias_web  # noqa: F401 imported but unused | ||||||
|  | from . import command_kit_ipd  # noqa: F401 imported but unused | ||||||
|  | from .parser import PARSER, ParserLoadError, load_default_section  # noqa: F401 imported but unused | ||||||
							
								
								
									
										56
									
								
								PFERD/cli/command_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								PFERD/cli/command_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,56 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from .common_ilias_args import configure_common_group_args, load_common | ||||||
|  | from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||||
|  |  | ||||||
|  | # Name of the subcommand; also written as the crawler "type" by load(). | ||||||
|  | COMMAND_NAME = "ilias-web" | ||||||
|  |  | ||||||
|  | # Subcommand parser; inherits the general crawler options via CRAWLER_PARSER. | ||||||
|  | SUBPARSER = SUBPARSERS.add_parser( | ||||||
|  |     COMMAND_NAME, | ||||||
|  |     parents=[CRAWLER_PARSER], | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | GROUP = SUBPARSER.add_argument_group( | ||||||
|  |     title=f"{COMMAND_NAME} crawler arguments", | ||||||
|  |     description=f"arguments for the '{COMMAND_NAME}' crawler", | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | # argparse derives the namespace attribute "base_url" from this option. | ||||||
|  | GROUP.add_argument( | ||||||
|  |     "--base-url", | ||||||
|  |     type=str, | ||||||
|  |     metavar="BASE_URL", | ||||||
|  |     help="The base url of the ilias instance" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | # argparse derives the namespace attribute "client_id" from this option. | ||||||
|  | GROUP.add_argument( | ||||||
|  |     "--client-id", | ||||||
|  |     type=str, | ||||||
|  |     metavar="CLIENT_ID", | ||||||
|  |     help="The client id of the ilias instance" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | # Registered last so the options shared with kit-ilias-web follow the | ||||||
|  | # instance-specific ones. | ||||||
|  | configure_common_group_args(GROUP) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def load( | ||||||
|  |         args: argparse.Namespace, | ||||||
|  |         parser: configparser.ConfigParser, | ||||||
|  | ) -> None: | ||||||
|  |     log.explain(f"Creating config for command '{COMMAND_NAME}'") | ||||||
|  |  | ||||||
|  |     parser["crawl:ilias"] = {} | ||||||
|  |     section = parser["crawl:ilias"] | ||||||
|  |     load_crawler(args, section) | ||||||
|  |  | ||||||
|  |     section["type"] = COMMAND_NAME | ||||||
|  |     if args.ilias_url is not None: | ||||||
|  |         section["base_url"] = args.ilias_url | ||||||
|  |     if args.client_id is not None: | ||||||
|  |         section["client_id"] = args.client_id | ||||||
|  |  | ||||||
|  |     load_common(section, args, parser) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | SUBPARSER.set_defaults(command=load) | ||||||
							
								
								
									
										37
									
								
								PFERD/cli/command_kit_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								PFERD/cli/command_kit_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,37 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from .common_ilias_args import configure_common_group_args, load_common | ||||||
|  | from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||||
|  |  | ||||||
|  | # Name of the subcommand; also written as the crawler "type" by load(). | ||||||
|  | COMMAND_NAME = "kit-ilias-web" | ||||||
|  |  | ||||||
|  | # Subcommand parser; inherits the general crawler options via CRAWLER_PARSER. | ||||||
|  | SUBPARSER = SUBPARSERS.add_parser( | ||||||
|  |     COMMAND_NAME, | ||||||
|  |     parents=[CRAWLER_PARSER], | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | GROUP = SUBPARSER.add_argument_group( | ||||||
|  |     title=f"{COMMAND_NAME} crawler arguments", | ||||||
|  |     description=f"arguments for the '{COMMAND_NAME}' crawler", | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | # This command only has the arguments shared with the generic ilias-web command. | ||||||
|  | configure_common_group_args(GROUP) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def load( | ||||||
|  |         args: argparse.Namespace, | ||||||
|  |         parser: configparser.ConfigParser, | ||||||
|  | ) -> None: | ||||||
|  |     log.explain(f"Creating config for command '{COMMAND_NAME}'") | ||||||
|  |  | ||||||
|  |     parser["crawl:ilias"] = {} | ||||||
|  |     section = parser["crawl:ilias"] | ||||||
|  |     load_crawler(args, section) | ||||||
|  |  | ||||||
|  |     section["type"] = COMMAND_NAME | ||||||
|  |     load_common(section, args, parser) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | SUBPARSER.set_defaults(command=load) | ||||||
							
								
								
									
										54
									
								
								PFERD/cli/command_kit_ipd.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								PFERD/cli/command_kit_ipd.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,54 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||||
|  |  | ||||||
|  | # Subcommand parser for 'kit-ipd'; inherits the general crawler options. | ||||||
|  | SUBPARSER = SUBPARSERS.add_parser( | ||||||
|  |     "kit-ipd", | ||||||
|  |     parents=[CRAWLER_PARSER], | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | GROUP = SUBPARSER.add_argument_group( | ||||||
|  |     title="kit ipd crawler arguments", | ||||||
|  |     description="arguments for the 'kit-ipd' crawler", | ||||||
|  | ) | ||||||
|  | # Optional; load() only writes it to the config when it is non-empty. | ||||||
|  | GROUP.add_argument( | ||||||
|  |     "--link-regex", | ||||||
|  |     type=str, | ||||||
|  |     metavar="REGEX", | ||||||
|  |     help="href-matching regex to identify downloadable files" | ||||||
|  | ) | ||||||
|  | # Positionals are consumed in registration order: TARGET first, then OUTPUT. | ||||||
|  | GROUP.add_argument( | ||||||
|  |     "target", | ||||||
|  |     type=str, | ||||||
|  |     metavar="TARGET", | ||||||
|  |     help="url to crawl" | ||||||
|  | ) | ||||||
|  | GROUP.add_argument( | ||||||
|  |     "output", | ||||||
|  |     type=Path, | ||||||
|  |     metavar="OUTPUT", | ||||||
|  |     help="output directory" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def load( | ||||||
|  |         args: argparse.Namespace, | ||||||
|  |         parser: configparser.ConfigParser, | ||||||
|  | ) -> None: | ||||||
|  |     log.explain("Creating config for command 'kit-ipd'") | ||||||
|  |  | ||||||
|  |     parser["crawl:kit-ipd"] = {} | ||||||
|  |     section = parser["crawl:kit-ipd"] | ||||||
|  |     load_crawler(args, section) | ||||||
|  |  | ||||||
|  |     section["type"] = "kit-ipd" | ||||||
|  |     section["target"] = str(args.target) | ||||||
|  |     section["output_dir"] = str(args.output) | ||||||
|  |     if args.link_regex: | ||||||
|  |         section["link_regex"] = str(args.link_regex) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | SUBPARSER.set_defaults(command=load) | ||||||
							
								
								
									
										70
									
								
								PFERD/cli/command_local.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										70
									
								
								PFERD/cli/command_local.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,70 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||||
|  |  | ||||||
|  | # Subcommand parser for 'local'; inherits the general crawler options. | ||||||
|  | SUBPARSER = SUBPARSERS.add_parser( | ||||||
|  |     "local", | ||||||
|  |     parents=[CRAWLER_PARSER], | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | GROUP = SUBPARSER.add_argument_group( | ||||||
|  |     title="local crawler arguments", | ||||||
|  |     description="arguments for the 'local' crawler", | ||||||
|  | ) | ||||||
|  | # Positionals are consumed in registration order: TARGET first, then OUTPUT. | ||||||
|  | GROUP.add_argument( | ||||||
|  |     "target", | ||||||
|  |     type=Path, | ||||||
|  |     metavar="TARGET", | ||||||
|  |     help="directory to crawl" | ||||||
|  | ) | ||||||
|  | GROUP.add_argument( | ||||||
|  |     "output", | ||||||
|  |     type=Path, | ||||||
|  |     metavar="OUTPUT", | ||||||
|  |     help="output directory" | ||||||
|  | ) | ||||||
|  | # The remaining options are optional; load() copies them only when given. | ||||||
|  | GROUP.add_argument( | ||||||
|  |     "--crawl-delay", | ||||||
|  |     type=float, | ||||||
|  |     metavar="SECONDS", | ||||||
|  |     help="artificial delay to simulate for crawl requests" | ||||||
|  | ) | ||||||
|  | GROUP.add_argument( | ||||||
|  |     "--download-delay", | ||||||
|  |     type=float, | ||||||
|  |     metavar="SECONDS", | ||||||
|  |     help="artificial delay to simulate for download requests" | ||||||
|  | ) | ||||||
|  | GROUP.add_argument( | ||||||
|  |     "--download-speed", | ||||||
|  |     type=int, | ||||||
|  |     metavar="BYTES_PER_SECOND", | ||||||
|  |     help="download speed to simulate" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def load( | ||||||
|  |         args: argparse.Namespace, | ||||||
|  |         parser: configparser.ConfigParser, | ||||||
|  | ) -> None: | ||||||
|  |     log.explain("Creating config for command 'local'") | ||||||
|  |  | ||||||
|  |     parser["crawl:local"] = {} | ||||||
|  |     section = parser["crawl:local"] | ||||||
|  |     load_crawler(args, section) | ||||||
|  |  | ||||||
|  |     section["type"] = "local" | ||||||
|  |     section["target"] = str(args.target) | ||||||
|  |     section["output_dir"] = str(args.output) | ||||||
|  |     if args.crawl_delay is not None: | ||||||
|  |         section["crawl_delay"] = str(args.crawl_delay) | ||||||
|  |     if args.download_delay is not None: | ||||||
|  |         section["download_delay"] = str(args.download_delay) | ||||||
|  |     if args.download_speed is not None: | ||||||
|  |         section["download_speed"] = str(args.download_speed) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | SUBPARSER.set_defaults(command=load) | ||||||
							
								
								
									
										104
									
								
								PFERD/cli/common_ilias_args.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										104
									
								
								PFERD/cli/common_ilias_args.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,104 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | from ..crawl.ilias.file_templates import Links | ||||||
|  | from .parser import BooleanOptionalAction, ParserLoadError, show_value_error | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def configure_common_group_args(group: argparse._ArgumentGroup) -> None: | ||||||
|  |     """These arguments are shared between the KIT and generic Ilias web command.""" | ||||||
|  |     group.add_argument( | ||||||
|  |         "target", | ||||||
|  |         type=str, | ||||||
|  |         metavar="TARGET", | ||||||
|  |         help="course id, 'desktop', or ILIAS URL to crawl" | ||||||
|  |     ) | ||||||
|  |     group.add_argument( | ||||||
|  |         "output", | ||||||
|  |         type=Path, | ||||||
|  |         metavar="OUTPUT", | ||||||
|  |         help="output directory" | ||||||
|  |     ) | ||||||
|  |     group.add_argument( | ||||||
|  |         "--username", "-u", | ||||||
|  |         type=str, | ||||||
|  |         metavar="USERNAME", | ||||||
|  |         help="user name for authentication" | ||||||
|  |     ) | ||||||
|  |     group.add_argument( | ||||||
|  |         "--keyring", | ||||||
|  |         action=BooleanOptionalAction, | ||||||
|  |         help="use the system keyring to store and retrieve passwords" | ||||||
|  |     ) | ||||||
|  |     group.add_argument( | ||||||
|  |         "--credential-file", | ||||||
|  |         type=Path, | ||||||
|  |         metavar="PATH", | ||||||
|  |         help="read username and password from a credential file" | ||||||
|  |     ) | ||||||
|  |     group.add_argument( | ||||||
|  |         "--links", | ||||||
|  |         type=show_value_error(Links.from_string), | ||||||
|  |         metavar="OPTION", | ||||||
|  |         help="how to represent external links" | ||||||
|  |     ) | ||||||
|  |     group.add_argument( | ||||||
|  |         "--link-redirect-delay", | ||||||
|  |         type=int, | ||||||
|  |         metavar="SECONDS", | ||||||
|  |         help="time before 'fancy' links redirect to to their target (-1 to disable)" | ||||||
|  |     ) | ||||||
|  |     group.add_argument( | ||||||
|  |         "--videos", | ||||||
|  |         action=BooleanOptionalAction, | ||||||
|  |         help="crawl and download videos" | ||||||
|  |     ) | ||||||
|  |     group.add_argument( | ||||||
|  |         "--forums", | ||||||
|  |         action=BooleanOptionalAction, | ||||||
|  |         help="crawl and download forum posts" | ||||||
|  |     ) | ||||||
|  |     group.add_argument( | ||||||
|  |         "--http-timeout", "-t", | ||||||
|  |         type=float, | ||||||
|  |         metavar="SECONDS", | ||||||
|  |         help="timeout for all HTTP requests" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def load_common( | ||||||
|  |     section: configparser.SectionProxy, | ||||||
|  |     args: argparse.Namespace, | ||||||
|  |     parser: configparser.ConfigParser, | ||||||
|  | ) -> None: | ||||||
|  |     """Load common config between generic and KIT ilias web command""" | ||||||
|  |     section["target"] = str(args.target) | ||||||
|  |     section["output_dir"] = str(args.output) | ||||||
|  |     section["auth"] = "auth:ilias" | ||||||
|  |     if args.links is not None: | ||||||
|  |         section["links"] = str(args.links.value) | ||||||
|  |     if args.link_redirect_delay is not None: | ||||||
|  |         section["link_redirect_delay"] = str(args.link_redirect_delay) | ||||||
|  |     if args.videos is not None: | ||||||
|  |         section["videos"] = "yes" if args.videos else "no" | ||||||
|  |     if args.forums is not None: | ||||||
|  |         section["forums"] = "yes" if args.forums else "no" | ||||||
|  |     if args.http_timeout is not None: | ||||||
|  |         section["http_timeout"] = str(args.http_timeout) | ||||||
|  |  | ||||||
|  |     parser["auth:ilias"] = {} | ||||||
|  |     auth_section = parser["auth:ilias"] | ||||||
|  |     if args.credential_file is not None: | ||||||
|  |         if args.username is not None: | ||||||
|  |             raise ParserLoadError("--credential-file and --username can't be used together") | ||||||
|  |         if args.keyring: | ||||||
|  |             raise ParserLoadError("--credential-file and --keyring can't be used together") | ||||||
|  |         auth_section["type"] = "credential-file" | ||||||
|  |         auth_section["path"] = str(args.credential_file) | ||||||
|  |     elif args.keyring: | ||||||
|  |         auth_section["type"] = "keyring" | ||||||
|  |     else: | ||||||
|  |         auth_section["type"] = "simple" | ||||||
|  |     if args.username is not None: | ||||||
|  |         auth_section["username"] = args.username | ||||||
							
								
								
									
										245
									
								
								PFERD/cli/parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										245
									
								
								PFERD/cli/parser.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,245 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  | from argparse import ArgumentTypeError | ||||||
|  | from pathlib import Path | ||||||
|  | from typing import Any, Callable, List, Optional, Sequence, Union | ||||||
|  |  | ||||||
|  | from ..output_dir import OnConflict, Redownload | ||||||
|  | from ..version import NAME, VERSION | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ParserLoadError(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # TODO Replace with argparse version when updating to 3.9? | ||||||
|  | class BooleanOptionalAction(argparse.Action): | ||||||
|  |     def __init__( | ||||||
|  |             self, | ||||||
|  |             option_strings: List[str], | ||||||
|  |             dest: Any, | ||||||
|  |             default: Any = None, | ||||||
|  |             type: Any = None, | ||||||
|  |             choices: Any = None, | ||||||
|  |             required: Any = False, | ||||||
|  |             help: Any = None, | ||||||
|  |             metavar: Any = None, | ||||||
|  |     ): | ||||||
|  |         if len(option_strings) != 1: | ||||||
|  |             raise ValueError("There must be exactly one option string") | ||||||
|  |         [self.name] = option_strings | ||||||
|  |         if not self.name.startswith("--"): | ||||||
|  |             raise ValueError(f"{self.name!r} doesn't start with '--'") | ||||||
|  |         if self.name.startswith("--no-"): | ||||||
|  |             raise ValueError(f"{self.name!r} starts with '--no-'") | ||||||
|  |  | ||||||
|  |         options = [self.name, "--no-" + self.name[2:]] | ||||||
|  |  | ||||||
|  |         super().__init__( | ||||||
|  |             options, | ||||||
|  |             dest, | ||||||
|  |             nargs=0, | ||||||
|  |             default=default, | ||||||
|  |             type=type, | ||||||
|  |             choices=choices, | ||||||
|  |             required=required, | ||||||
|  |             help=help, | ||||||
|  |             metavar=metavar, | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def __call__( | ||||||
|  |             self, | ||||||
|  |             parser: argparse.ArgumentParser, | ||||||
|  |             namespace: argparse.Namespace, | ||||||
|  |             values: Union[str, Sequence[Any], None], | ||||||
|  |             option_string: Optional[str] = None, | ||||||
|  |     ) -> None: | ||||||
|  |         if option_string and option_string in self.option_strings: | ||||||
|  |             value = not option_string.startswith("--no-") | ||||||
|  |             setattr(namespace, self.dest, value) | ||||||
|  |  | ||||||
|  |     def format_usage(self) -> str: | ||||||
|  |         return "--[no-]" + self.name[2:] | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def show_value_error(inner: Callable[[str], Any]) -> Callable[[str], Any]: | ||||||
|  |     """ | ||||||
|  |     Some validation functions (like the from_string in our enums) raise a ValueError. | ||||||
|  |     Argparse only pretty-prints ArgumentTypeErrors though, so we need to wrap our ValueErrors. | ||||||
|  |     """ | ||||||
|  |     def wrapper(input: str) -> Any: | ||||||
|  |         try: | ||||||
|  |             return inner(input) | ||||||
|  |         except ValueError as e: | ||||||
|  |             raise ArgumentTypeError(e) | ||||||
|  |     return wrapper | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # Parent parser holding the options common to all crawlers. The command | ||||||
|  | # modules pass it via parents=[CRAWLER_PARSER], so it must not add its own | ||||||
|  | # help option. | ||||||
|  | CRAWLER_PARSER = argparse.ArgumentParser(add_help=False) | ||||||
|  | CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( | ||||||
|  |     title="general crawler arguments", | ||||||
|  |     description="arguments common to all crawlers", | ||||||
|  | ) | ||||||
|  | # The enum parsers raise ValueError; show_value_error converts that into the | ||||||
|  | # ArgumentTypeError argparse knows how to report. | ||||||
|  | CRAWLER_PARSER_GROUP.add_argument( | ||||||
|  |     "--redownload", "-r", | ||||||
|  |     type=show_value_error(Redownload.from_string), | ||||||
|  |     metavar="OPTION", | ||||||
|  |     help="when to download a file that's already present locally" | ||||||
|  | ) | ||||||
|  | CRAWLER_PARSER_GROUP.add_argument( | ||||||
|  |     "--on-conflict", | ||||||
|  |     type=show_value_error(OnConflict.from_string), | ||||||
|  |     metavar="OPTION", | ||||||
|  |     help="what to do when local and remote files or directories differ" | ||||||
|  | ) | ||||||
|  | # action="append" collects every occurrence into a list (None if never given). | ||||||
|  | CRAWLER_PARSER_GROUP.add_argument( | ||||||
|  |     "--transform", "-T", | ||||||
|  |     action="append", | ||||||
|  |     type=str, | ||||||
|  |     metavar="RULE", | ||||||
|  |     help="add a single transformation rule. Can be specified multiple times" | ||||||
|  | ) | ||||||
|  | CRAWLER_PARSER_GROUP.add_argument( | ||||||
|  |     "--tasks", "-n", | ||||||
|  |     type=int, | ||||||
|  |     metavar="N", | ||||||
|  |     help="maximum number of concurrent tasks (crawling, downloading)" | ||||||
|  | ) | ||||||
|  | CRAWLER_PARSER_GROUP.add_argument( | ||||||
|  |     "--downloads", "-N", | ||||||
|  |     type=int, | ||||||
|  |     metavar="N", | ||||||
|  |     help="maximum number of tasks that may download data at the same time" | ||||||
|  | ) | ||||||
|  | CRAWLER_PARSER_GROUP.add_argument( | ||||||
|  |     "--task-delay", "-d", | ||||||
|  |     type=float, | ||||||
|  |     metavar="SECONDS", | ||||||
|  |     help="time the crawler should wait between subsequent tasks" | ||||||
|  | ) | ||||||
|  | # Registers both --windows-paths and --no-windows-paths. | ||||||
|  | CRAWLER_PARSER_GROUP.add_argument( | ||||||
|  |     "--windows-paths", | ||||||
|  |     action=BooleanOptionalAction, | ||||||
|  |     help="whether to repair invalid paths on windows" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  |  | ||||||
def load_crawler(
        args: argparse.Namespace,
        section: configparser.SectionProxy,
) -> None:
    """
    Copy the general crawler options given on the command line into a section.

    Every option that is None on args is skipped, so the corresponding key in
    the section is left untouched. All other values are converted to the
    string form stored in the config section.
    """

    def yes_no(flag: Any) -> str:
        return "yes" if flag else "no"

    # (attribute on args == key in section, conversion to the stored string).
    # The order is the order in which keys are written to the section.
    converters = (
        ("redownload", lambda option: option.value),
        ("on_conflict", lambda option: option.value),
        ("transform", lambda rules: "\n" + "\n".join(rules)),
        ("tasks", str),
        ("downloads", str),
        ("task_delay", str),
        ("windows_paths", yes_no),
    )

    for key, to_config_value in converters:
        cli_value = getattr(args, key)
        if cli_value is not None:
            section[key] = to_config_value(cli_value)
|  |  | ||||||
|  |  | ||||||
# Top-level command line parser. "command" defaults to None so that it is
# always present on the parsed namespace.
PARSER = argparse.ArgumentParser()
PARSER.set_defaults(command=None)

PARSER.add_argument(
    "--version",
    version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
    action="version",
)

# Config file handling
PARSER.add_argument(
    "--config", "-c",
    metavar="PATH",
    help="custom config file",
    type=Path,
)
PARSER.add_argument(
    "--dump-config",
    help="dump current configuration to the default config path and exit",
    action="store_true",
)
PARSER.add_argument(
    "--dump-config-to",
    metavar="PATH",
    help="dump current configuration to a file and exit. Use '-' as path to print to stdout instead",
)
PARSER.add_argument(
    "--debug-transforms",
    help="apply transform rules to files of previous run",
    action="store_true",
)

# Crawler selection; both options may be repeated and collect into lists.
PARSER.add_argument(
    "--crawler", "-C",
    metavar="NAME",
    help="only execute a single crawler. Can be specified multiple times to execute multiple crawlers",
    type=str,
    action="append",
)
PARSER.add_argument(
    "--skip", "-S",
    metavar="NAME",
    help="don't execute this particular crawler. Can be specified multiple times to skip multiple crawlers",
    type=str,
    action="append",
)

# Options mirrored into the default config section by load_default_section.
# The boolean ones stay None unless explicitly given on the command line.
PARSER.add_argument(
    "--working-dir",
    metavar="PATH",
    help="custom working directory",
    type=Path,
)
PARSER.add_argument(
    "--explain",
    help="log and explain in detail what PFERD is doing",
    action=BooleanOptionalAction,
)
PARSER.add_argument(
    "--status",
    help="print status updates while PFERD is crawling",
    action=BooleanOptionalAction,
)
PARSER.add_argument(
    "--report",
    help="print a report of all local changes before exiting",
    action=BooleanOptionalAction,
)
PARSER.add_argument(
    "--share-cookies",
    help="whether crawlers should share cookies where applicable",
    action=BooleanOptionalAction,
)
PARSER.add_argument(
    "--show-not-deleted",
    help="print messages in status and report when PFERD did not delete a local only file",
    action=BooleanOptionalAction,
)
|  |  | ||||||
|  |  | ||||||
def load_default_section(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """
    Copy the global command line options into the parser's default section.

    Options that are None on args are skipped, leaving any existing value in
    the default section untouched. Boolean options are stored as "yes"/"no".
    """

    defaults = parser[parser.default_section]

    working_dir = args.working_dir
    if working_dir is not None:
        defaults["working_dir"] = str(working_dir)

    # Attribute names on args are identical to the keys in the config section.
    for key in ("explain", "status", "report", "share_cookies", "show_not_deleted"):
        flag = getattr(args, key)
        if flag is None:
            continue
        defaults[key] = "yes" if flag else "no"
|  |  | ||||||
|  |  | ||||||
# Subparser container of the main parser, listed under the "crawlers" heading
# in the help output. Presumably each crawler type registers its own
# subcommand on it elsewhere - verify against the modules that use SUBPARSERS.
SUBPARSERS = PARSER.add_subparsers(title="crawlers")
							
								
								
									
										193
									
								
								PFERD/config.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										193
									
								
								PFERD/config.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,193 @@ | |||||||
|  | import asyncio | ||||||
|  | import os | ||||||
|  | import sys | ||||||
|  | from configparser import ConfigParser, SectionProxy | ||||||
|  | from pathlib import Path | ||||||
|  | from typing import Any, List, NoReturn, Optional, Tuple | ||||||
|  |  | ||||||
|  | from rich.markup import escape | ||||||
|  |  | ||||||
|  | from .logging import log | ||||||
|  | from .utils import fmt_real_path, prompt_yes_no | ||||||
|  |  | ||||||
|  |  | ||||||
class ConfigLoadError(Exception):
    """
    Raised when the config could not be loaded from a file.

    The exception message only names the file; the human readable cause is
    available separately as the reason attribute.
    """

    def __init__(self, path: Path, reason: str):
        message = f"Failed to load config from {fmt_real_path(path)}"
        super().__init__(message)

        # File that could not be loaded
        self.path = path
        # Short explanation of what went wrong
        self.reason = reason
|  |  | ||||||
|  |  | ||||||
class ConfigOptionError(Exception):
    """
    Raised when a config option is missing or has an invalid value.

    The message has the form "Section <section>, key <key>: <desc>", with
    section and key shown in their repr form. All three parts are also kept
    as attributes.
    """

    def __init__(self, section: str, key: str, desc: str):
        self.section, self.key, self.desc = section, key, desc
        super().__init__(f"Section {section!r}, key {key!r}: {desc}")
|  |  | ||||||
|  |  | ||||||
class ConfigDumpError(Exception):
    """
    Raised when the config could not be written to a file.

    The exception message only names the target file; the human readable
    cause is available separately as the reason attribute.
    """

    def __init__(self, path: Path, reason: str):
        message = f"Failed to dump config to {fmt_real_path(path)}"
        super().__init__(message)

        # File the config should have been written to
        self.path = path
        # Short explanation of what went wrong
        self.reason = reason
|  |  | ||||||
|  |  | ||||||
class Section:
    """
    Common base of the crawler and auth section classes.

    Wraps a single config section and offers helpers that raise a
    ConfigOptionError naming that section.
    """

    def __init__(self, section: SectionProxy):
        # The wrapped section; subclasses read their options from it.
        self.s = section

    def error(self, key: str, desc: str) -> NoReturn:
        """
        Raise a ConfigOptionError for the given key of this section.
        """
        raise ConfigOptionError(self.s.name, key, desc)

    def invalid_value(
            self,
            key: str,
            value: Any,
            reason: Optional[str],
    ) -> NoReturn:
        """
        Report that a key has an invalid value, optionally explaining why.
        """
        description = f"Invalid value {value!r}"
        if reason is not None:
            description = f"{description}: {reason}"
        self.error(key, description)

    def missing_value(self, key: str) -> NoReturn:
        """
        Report that a required key has no value.
        """
        self.error(key, "Missing value")
|  |  | ||||||
|  |  | ||||||
class DefaultSection(Section):
    """
    Typed accessors for the global options stored in the default section.
    """

    def _flag(self, key: str, default: bool) -> bool:
        # Shared lookup for the boolean options below.
        return self.s.getboolean(key, fallback=default)

    def working_dir(self) -> Path:
        """
        Return the configured working directory ("." if unset), with a
        leading "~" expanded.
        """
        # TODO Change to working dir instead of manually prepending it to paths
        configured = self.s.get("working_dir", fallback=".")
        return Path(configured).expanduser()

    def explain(self) -> bool:
        """Value of the "explain" option; False if unset."""
        return self._flag("explain", False)

    def status(self) -> bool:
        """Value of the "status" option; True if unset."""
        return self._flag("status", True)

    def report(self) -> bool:
        """Value of the "report" option; True if unset."""
        return self._flag("report", True)

    def show_not_deleted(self) -> bool:
        """Value of the "show_not_deleted" option; True if unset."""
        return self._flag("show_not_deleted", True)

    def share_cookies(self) -> bool:
        """Value of the "share_cookies" option; True if unset."""
        return self._flag("share_cookies", True)
|  |  | ||||||
|  |  | ||||||
class Config:
    """
    Wrapper around a ConfigParser that knows how to load and dump the config
    file and how to find the crawler and auth sections in it.
    """

    @staticmethod
    def _default_path() -> Path:
        """
        Return the default config file location for the current os.name.
        """
        known_locations = {
            "posix": "~/.config/PFERD/pferd.cfg",
            "nt": "~/AppData/Roaming/PFERD/pferd.cfg",
        }
        location = known_locations.get(os.name, "~/.pferd.cfg")
        return Path(location).expanduser()

    def __init__(self, parser: ConfigParser):
        self._parser = parser
        self._default_section = DefaultSection(parser[parser.default_section])

    @property
    def default_section(self) -> DefaultSection:
        """The parser's default section, wrapped in a DefaultSection."""
        return self._default_section

    @staticmethod
    def load_parser(parser: ConfigParser, path: Optional[Path] = None) -> None:
        """
        Read the config file at path (or at the default path if none is
        given) into the parser.

        May throw a ConfigLoadError.
        """

        if not path:
            log.explain("Using default path")
            path = Config._default_path()
        else:
            log.explain("Path specified on CLI")
        log.explain(f"Loading {fmt_real_path(path)}")

        # parser.read would silently skip a missing file, so the file is
        # opened explicitly and handed to parser.read_file instead.
        try:
            with open(path, encoding="utf-8") as config_file:
                parser.read_file(config_file, source=str(path))
        except FileNotFoundError:
            raise ConfigLoadError(path, "File does not exist")
        except IsADirectoryError:
            raise ConfigLoadError(path, "That's a directory, not a file")
        except PermissionError:
            raise ConfigLoadError(path, "Insufficient permissions")
        except UnicodeDecodeError:
            raise ConfigLoadError(path, "File is not encoded using UTF-8")

    def _write_to(self, path: Path, mode: str) -> None:
        # Serialize the parser into the file at path, opened with the given mode.
        with open(path, mode, encoding="utf-8") as config_file:
            self._parser.write(config_file)

    def dump(self, path: Optional[Path] = None) -> None:
        """
        Write the config to path (or to the default path if none is given).
        An already existing file is only overwritten after the user confirmed
        it interactively.

        May throw a ConfigDumpError.
        """

        if not path:
            log.explain("Using default path")
            path = self._default_path()
        else:
            log.explain("Using custom path")

        log.explain(f"Dumping to {fmt_real_path(path)}")
        log.print(f"[bold bright_cyan]Dumping[/] to {escape(fmt_real_path(path))}")

        try:
            path.parent.mkdir(parents=True, exist_ok=True)
        except PermissionError:
            raise ConfigDumpError(path, "Could not create parent directory")

        try:
            try:
                # Mode "x" creates the file exclusively and fails if it
                # already exists, so an existing file is never overwritten
                # without asking first.
                self._write_to(path, "x")
            except FileExistsError:
                print("That file already exists.")
                overwrite = asyncio.run(prompt_yes_no("Overwrite it?", default=False))
                if not overwrite:
                    raise ConfigDumpError(path, "File already exists")
                self._write_to(path, "w")
        except IsADirectoryError:
            raise ConfigDumpError(path, "That's a directory, not a file")
        except PermissionError:
            raise ConfigDumpError(path, "Insufficient permissions")

    def dump_to_stdout(self) -> None:
        """Write the config to standard output."""
        self._parser.write(sys.stdout)

    def _sections_with_prefix(self, prefix: str) -> List[Tuple[str, SectionProxy]]:
        # All (name, section) pairs whose name starts with prefix, in parser order.
        return [
            (name, proxy)
            for name, proxy in self._parser.items()
            if name.startswith(prefix)
        ]

    def crawl_sections(self) -> List[Tuple[str, SectionProxy]]:
        """Return all sections whose name starts with "crawl:"."""
        return self._sections_with_prefix("crawl:")

    def auth_sections(self) -> List[Tuple[str, SectionProxy]]:
        """Return all sections whose name starts with "auth:"."""
        return self._sections_with_prefix("auth:")
| @@ -1,69 +0,0 @@ | |||||||
| """A helper for requests cookies.""" |  | ||||||
|  |  | ||||||
| import logging |  | ||||||
| from http.cookiejar import LoadError, LWPCookieJar |  | ||||||
| from pathlib import Path |  | ||||||
| from typing import Optional |  | ||||||
|  |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
|  |  | ||||||
class CookieJar:
    """A cookie jar that can optionally be persisted to a file."""

    def __init__(self, cookie_file: Optional[Path] = None) -> None:
        """Create a new cookie jar, backed by the given file if there is one.

        Without a file (None), the cookies only live in memory.
        """
        if cookie_file is not None:
            jar = LWPCookieJar(str(cookie_file.resolve()))
        else:
            jar = LWPCookieJar()
        self._cookies: LWPCookieJar = jar

    @property
    def cookies(self) -> LWPCookieJar:
        """Return the underlying cookie jar."""
        return self._cookies

    def load_cookies(self) -> None:
        """Load the cookies from the file passed to the constructor.

        Does nothing for a jar without a file. A missing or unreadable cookie
        file is logged as a warning and otherwise ignored.
        """
        filename = self._cookies.filename
        if filename is None:
            return

        try:
            LOGGER.info("Loading old cookies from %s", filename)
            self._cookies.load(ignore_discard=True)
        except (FileNotFoundError, LoadError):
            LOGGER.warning(
                "No valid cookie file found at %s, continuing with no cookies",
                filename
            )

    def save_cookies(self, reason: Optional[str] = None) -> None:
        """Write the cookies to the file passed to the constructor.

        Does nothing for a jar without a file. The optional reason only shows
        up in the log message.
        """
        if self._cookies.filename is None:
            return

        if reason is not None:
            LOGGER.info("Saving cookies (%s)", reason)
        else:
            LOGGER.info("Saving cookies")

        # TODO figure out why ignore_discard is set
        # TODO possibly catch a few more exceptions
        self._cookies.save(ignore_discard=True)

    def create_session(self) -> requests.Session:
        """Create a new requests session that uses this cookie jar."""
        session = requests.Session()

        # The requests docs state: "All requests code should work out of the
        # box with externally provided instances of CookieJar, e.g.
        # LWPCookieJar and FileCookieJar."
        session.cookies = self.cookies  # type: ignore

        return session
							
								
								
									
										27
									
								
								PFERD/crawl/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								PFERD/crawl/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | |||||||
|  | from configparser import SectionProxy | ||||||
|  | from typing import Callable, Dict | ||||||
|  |  | ||||||
|  | from ..auth import Authenticator | ||||||
|  | from ..config import Config | ||||||
|  | from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401 | ||||||
|  | from .ilias import IliasWebCrawler, IliasWebCrawlerSection, KitIliasWebCrawler, KitIliasWebCrawlerSection | ||||||
|  | from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection | ||||||
|  | from .local_crawler import LocalCrawler, LocalCrawlerSection | ||||||
|  |  | ||||||
# Signature shared by all entries of CRAWLERS. The arguments are, in order:
#   1. the crawler's name (without the "crawl:" prefix)
#   2. the crawler's section of the global config
#   3. the global config
#   4. the loaded authenticators, by name
CrawlerConstructor = Callable[
    [str, SectionProxy, Config, Dict[str, Authenticator]],
    Crawler,
]

# Maps a crawler type name to a constructor for that kind of crawler. Each
# constructor wraps the raw section in the section class belonging to the
# crawler. Crawlers that take no authenticators simply ignore that argument.
CRAWLERS: Dict[str, CrawlerConstructor] = {
    "local": lambda name, section, config, authenticators:
        LocalCrawler(name, LocalCrawlerSection(section), config),
    "ilias-web": lambda name, section, config, authenticators:
        IliasWebCrawler(name, IliasWebCrawlerSection(section), config, authenticators),
    "kit-ilias-web": lambda name, section, config, authenticators:
        KitIliasWebCrawler(name, KitIliasWebCrawlerSection(section), config, authenticators),
    "kit-ipd": lambda name, section, config, authenticators:
        KitIpdCrawler(name, KitIpdCrawlerSection(section), config),
}
							
								
								
									
										409
									
								
								PFERD/crawl/crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										409
									
								
								PFERD/crawl/crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,409 @@ | |||||||
|  | import asyncio | ||||||
|  | import os | ||||||
|  | from abc import ABC, abstractmethod | ||||||
|  | from collections.abc import Awaitable, Coroutine | ||||||
|  | from datetime import datetime | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar | ||||||
|  |  | ||||||
|  | from ..auth import Authenticator | ||||||
|  | from ..config import Config, Section | ||||||
|  | from ..deduplicator import Deduplicator | ||||||
|  | from ..limiter import Limiter | ||||||
|  | from ..logging import ProgressBar, log | ||||||
|  | from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload | ||||||
|  | from ..report import MarkConflictError, MarkDuplicateError, Report | ||||||
|  | from ..transformer import Transformer | ||||||
|  | from ..utils import ReusableAsyncContextManager, fmt_path | ||||||
|  |  | ||||||
|  |  | ||||||
class CrawlWarning(Exception):
    """
    A noncritical problem that occurred while crawling.

    The @noncritical and @anoncritical decorators catch this exception, log
    it and record it as a warning instead of letting it propagate.
    """
|  |  | ||||||
|  |  | ||||||
class CrawlError(Exception):
    """
    A critical problem that occurred while crawling.

    Unlike CrawlWarning, this exception is not swallowed by the @noncritical
    and @anoncritical decorators: they record it as an error and re-raise it.
    """
|  |  | ||||||
|  |  | ||||||
Wrapped = TypeVar("Wrapped", bound=Callable[..., None])


def noncritical(f: Wrapped) -> Wrapped:
    """
    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    Noncritical exceptions (CrawlWarning, OutputDirError, MarkDuplicateError
    and MarkConflictError) are added to the crawler's report as a warning,
    logged and then swallowed. Any other exception is added to the report as
    an error and re-raised.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    The returned wrapper keeps the name, docstring and other metadata of the
    wrapped function. The wrapped function's return value is discarded.

    Warning: Must only be applied to member functions of the Crawler class!
    Calling the wrapper with anything but a Crawler as first argument raises a
    RuntimeError.
    """

    # Imported locally so that this decorator does not rely on the module's
    # import list.
    from functools import wraps

    @wraps(f)
    def wrapper(*args: Any, **kwargs: Any) -> None:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@noncritical must only be applied to Crawler methods")

        crawler = args[0]

        try:
            f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            crawler.report.add_warning(str(e))
            log.warn(str(e))
            crawler.error_free = False
        except Exception as e:
            # Unexpected errors are recorded but must still stop the caller.
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

    return wrapper  # type: ignore
|  |  | ||||||
|  |  | ||||||
AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]])


def anoncritical(f: AWrapped) -> AWrapped:
    """
    An async version of @noncritical.

    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    Noncritical exceptions (CrawlWarning, OutputDirError, MarkDuplicateError
    and MarkConflictError) are logged, added to the crawler's report as a
    warning and then swallowed; the wrapper returns None in that case. Any
    other exception is added to the report as an error and re-raised. If no
    exception occurs, the wrapped coroutine's result is returned.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    The returned wrapper keeps the name, docstring and other metadata of the
    wrapped function.

    Warning: Must only be applied to member functions of the Crawler class!
    Calling the wrapper with anything but a Crawler as first argument raises a
    RuntimeError.
    """

    # Imported locally so that this decorator does not rely on the module's
    # import list.
    from functools import wraps

    @wraps(f)
    async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@anoncritical must only be applied to Crawler methods")

        crawler = args[0]

        try:
            return await f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            log.warn(str(e))
            crawler.error_free = False
            crawler.report.add_warning(str(e))
        except Exception as e:
            # Unexpected errors are recorded but must still stop the caller.
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

        return None

    return wrapper  # type: ignore
|  |  | ||||||
|  |  | ||||||
class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
    """
    Async context manager for crawling a single path. Entering it yields the
    progress bar shown while the path is being crawled.
    """

    def __init__(self, limiter: Limiter, path: PurePath):
        super().__init__()

        self._limiter = limiter
        self._path = path

    @property
    def path(self) -> PurePath:
        """The path this token was created for."""
        return self._path

    async def _on_aenter(self) -> ProgressBar:
        def announce_crawled() -> None:
            log.status("[bold cyan]", "Crawled", fmt_path(self._path))

        # The callback is registered before the limiter and the bar are
        # entered. Presumably the stack unwinds in reverse order, so the
        # "Crawled" message would come last - confirm against
        # ReusableAsyncContextManager.
        self._stack.callback(announce_crawled)
        await self._stack.enter_async_context(self._limiter.limit_crawl())
        return self._stack.enter_context(
            log.crawl_bar("[bold bright_cyan]", "Crawling", fmt_path(self._path))
        )
|  |  | ||||||
|  |  | ||||||
class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
    """
    Async context manager for downloading a single file. Entering it yields
    the progress bar shown during the download and the sink to write to.
    """

    def __init__(self, limiter: Limiter, fs_token: FileSinkToken, path: PurePath):
        super().__init__()

        self._limiter = limiter
        self._fs_token = fs_token
        self._path = path

    @property
    def path(self) -> PurePath:
        """The path this token was created for."""
        return self._path

    async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
        stack = self._stack

        await stack.enter_async_context(self._limiter.limit_download())
        file_sink = await stack.enter_async_context(self._fs_token)

        # No "Downloaded ..." message is registered here because the output
        # dir prints it.
        progress = stack.enter_context(
            log.download_bar("[bold bright_cyan]", "Downloading", fmt_path(self._path))
        )

        return progress, file_sink
|  |  | ||||||
|  |  | ||||||
class CrawlerSection(Section):
    """
    Typed accessors for the options common to all crawler sections. Invalid
    or missing values are reported via the helpers inherited from Section.
    """

    def type(self) -> str:
        """Return the mandatory "type" option."""
        crawler_type = self.s.get("type")
        if crawler_type is not None:
            return crawler_type
        self.missing_value("type")

    def skip(self) -> bool:
        """Value of the "skip" option; False if unset."""
        return self.s.getboolean("skip", fallback=False)

    def output_dir(self, name: str) -> Path:
        """
        Return the "output_dir" option with a leading "~" expanded. If unset,
        the given name without its "crawl:" prefix is used instead.
        """
        default_dir = name.removeprefix("crawl:")
        configured = self.s.get("output_dir", default_dir)
        return Path(configured).expanduser()

    def redownload(self) -> Redownload:
        """Parse the "redownload" option; "never-smart" if unset."""
        raw = self.s.get("redownload", "never-smart")
        try:
            return Redownload.from_string(raw)
        except ValueError as error:
            reason = str(error).capitalize()
            self.invalid_value("redownload", raw, reason)

    def on_conflict(self) -> OnConflict:
        """Parse the "on_conflict" option; "prompt" if unset."""
        raw = self.s.get("on_conflict", "prompt")
        try:
            return OnConflict.from_string(raw)
        except ValueError as error:
            reason = str(error).capitalize()
            self.invalid_value("on_conflict", raw, reason)

    def transform(self) -> str:
        """Return the raw "transform" option; empty if unset."""
        return self.s.get("transform", "")

    def tasks(self) -> int:
        """Return the "tasks" option (1 if unset). It must be positive."""
        count = self.s.getint("tasks", fallback=1)
        if count > 0:
            return count
        self.invalid_value("tasks", count, "Must be greater than 0")

    def downloads(self) -> int:
        """
        Return the "downloads" option, which defaults to the value of tasks.
        It must be positive and must not exceed tasks.
        """
        # tasks is validated first, even if downloads turns out to be unset
        task_limit = self.tasks()
        download_limit = self.s.getint("downloads", fallback=None)
        if download_limit is None:
            return task_limit
        if download_limit <= 0:
            self.invalid_value("downloads", download_limit, "Must be greater than 0")
        if download_limit > task_limit:
            self.invalid_value("downloads", download_limit, "Must not be greater than tasks")
        return download_limit

    def task_delay(self) -> float:
        """Return the "task_delay" option (0.0 if unset). It must not be negative."""
        delay = self.s.getfloat("task_delay", fallback=0.0)
        if delay >= 0:
            return delay
        self.invalid_value("task_delay", delay, "Must not be negative")

    def windows_paths(self) -> bool:
        """
        Value of the "windows_paths" option. If unset, it is True exactly when
        os.name is "nt".
        """
        return self.s.getboolean("windows_paths", fallback=(os.name == "nt"))

    def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
        """
        Look up the authenticator named by the mandatory "auth" option among
        the given authenticators.
        """
        auth_name = self.s.get("auth")
        if auth_name is None:
            self.missing_value("auth")
        authenticator = authenticators.get(auth_name)
        if authenticator is None:
            self.invalid_value("auth", auth_name, "No such auth section exists")
        return authenticator
|  |  | ||||||
|  |  | ||||||
|  | class Crawler(ABC): | ||||||
|  |     def __init__( | ||||||
|  |             self, | ||||||
|  |             name: str, | ||||||
|  |             section: CrawlerSection, | ||||||
|  |             config: Config, | ||||||
|  |     ) -> None: | ||||||
|  |         """ | ||||||
|  |         Initialize a crawler from its name and its section in the config file. | ||||||
|  |  | ||||||
|  |         If you are writing your own constructor for your own crawler, make sure | ||||||
|  |         to call this constructor first (via super().__init__). | ||||||
|  |  | ||||||
|  |         May throw a CrawlerLoadException. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.name = name | ||||||
|  |         self.error_free = True | ||||||
|  |  | ||||||
|  |         self._limiter = Limiter( | ||||||
|  |             task_limit=section.tasks(), | ||||||
|  |             download_limit=section.downloads(), | ||||||
|  |             task_delay=section.task_delay(), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         self._deduplicator = Deduplicator(section.windows_paths()) | ||||||
|  |         self._transformer = Transformer(section.transform()) | ||||||
|  |  | ||||||
|  |         self._output_dir = OutputDirectory( | ||||||
|  |             config.default_section.working_dir() / section.output_dir(name), | ||||||
|  |             section.redownload(), | ||||||
|  |             section.on_conflict(), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def report(self) -> Report: | ||||||
|  |         return self._output_dir.report | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def prev_report(self) -> Optional[Report]: | ||||||
|  |         return self._output_dir.prev_report | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def output_dir(self) -> OutputDirectory: | ||||||
|  |         return self._output_dir | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]: | ||||||
|  |         """ | ||||||
|  |         Similar to asyncio.gather. However, in the case of an exception, all | ||||||
|  |         still running tasks are cancelled and the exception is rethrown. | ||||||
|  |  | ||||||
|  |         This should always be preferred over asyncio.gather in crawler code so | ||||||
|  |         that an exception like CrawlError may actually stop the crawler. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         tasks = [asyncio.ensure_future(aw) for aw in awaitables] | ||||||
|  |         result = asyncio.gather(*tasks) | ||||||
|  |         try: | ||||||
|  |             return await result | ||||||
|  |         except:  # noqa: E722 | ||||||
|  |             for task in tasks: | ||||||
|  |                 task.cancel() | ||||||
|  |             raise | ||||||
|  |  | ||||||
|  |     async def crawl(self, path: PurePath) -> Optional[CrawlToken]: | ||||||
|  |         log.explain_topic(f"Decision: Crawl {fmt_path(path)}") | ||||||
|  |         path = self._deduplicator.mark(path) | ||||||
|  |         self._output_dir.report.found(path) | ||||||
|  |  | ||||||
|  |         if self._transformer.transform(path) is None: | ||||||
|  |             log.explain("Answer: No") | ||||||
|  |             log.status("[bold bright_black]", "Ignored", fmt_path(path)) | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         log.explain("Answer: Yes") | ||||||
|  |         return CrawlToken(self._limiter, path) | ||||||
|  |  | ||||||
|  |     def should_try_download( | ||||||
|  |             self, | ||||||
|  |             path: PurePath, | ||||||
|  |             *, | ||||||
|  |             etag_differs: Optional[bool] = None, | ||||||
|  |             mtime: Optional[datetime] = None, | ||||||
|  |             redownload: Optional[Redownload] = None, | ||||||
|  |             on_conflict: Optional[OnConflict] = None, | ||||||
|  |     ) -> bool: | ||||||
|  |         log.explain_topic(f"Decision: Should Download {fmt_path(path)}") | ||||||
|  |  | ||||||
|  |         if self._transformer.transform(path) is None: | ||||||
|  |             log.explain("Answer: No (ignored)") | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |         should_download = self._output_dir.should_try_download( | ||||||
|  |             path, | ||||||
|  |             etag_differs=etag_differs, | ||||||
|  |             mtime=mtime, | ||||||
|  |             redownload=redownload, | ||||||
|  |             on_conflict=on_conflict | ||||||
|  |         ) | ||||||
|  |         if should_download: | ||||||
|  |             log.explain("Answer: Yes") | ||||||
|  |             return True | ||||||
|  |         else: | ||||||
|  |             log.explain("Answer: No") | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |     async def download( | ||||||
|  |             self, | ||||||
|  |             path: PurePath, | ||||||
|  |             *, | ||||||
|  |             etag_differs: Optional[bool] = None, | ||||||
|  |             mtime: Optional[datetime] = None, | ||||||
|  |             redownload: Optional[Redownload] = None, | ||||||
|  |             on_conflict: Optional[OnConflict] = None, | ||||||
|  |     ) -> Optional[DownloadToken]: | ||||||
|  |         log.explain_topic(f"Decision: Download {fmt_path(path)}") | ||||||
|  |         path = self._deduplicator.mark(path) | ||||||
|  |         self._output_dir.report.found(path) | ||||||
|  |  | ||||||
|  |         transformed_path = self._transformer.transform(path) | ||||||
|  |         if transformed_path is None: | ||||||
|  |             log.explain("Answer: No") | ||||||
|  |             log.status("[bold bright_black]", "Ignored", fmt_path(path)) | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         fs_token = await self._output_dir.download( | ||||||
|  |             path, | ||||||
|  |             transformed_path, | ||||||
|  |             etag_differs=etag_differs, | ||||||
|  |             mtime=mtime, | ||||||
|  |             redownload=redownload, | ||||||
|  |             on_conflict=on_conflict | ||||||
|  |         ) | ||||||
|  |         if fs_token is None: | ||||||
|  |             log.explain("Answer: No") | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         log.explain("Answer: Yes") | ||||||
|  |         return DownloadToken(self._limiter, fs_token, path) | ||||||
|  |  | ||||||
|  |     async def _cleanup(self) -> None: | ||||||
|  |         log.explain_topic("Decision: Clean up files") | ||||||
|  |         if self.error_free: | ||||||
|  |             log.explain("No warnings or errors occurred during this run") | ||||||
|  |             log.explain("Answer: Yes") | ||||||
|  |             await self._output_dir.cleanup() | ||||||
|  |         else: | ||||||
|  |             log.explain("Warnings or errors occurred during this run") | ||||||
|  |             log.explain("Answer: No") | ||||||
|  |  | ||||||
|  |     @anoncritical | ||||||
|  |     async def run(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Start the crawling process. Call this function if you want to use a | ||||||
|  |         crawler. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         with log.show_progress(): | ||||||
|  |             self._output_dir.prepare() | ||||||
|  |             self._output_dir.load_prev_report() | ||||||
|  |             await self._run() | ||||||
|  |             await self._cleanup() | ||||||
|  |             self._output_dir.store_report() | ||||||
|  |  | ||||||
|  |     @abstractmethod | ||||||
|  |     async def _run(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Override this function if you are writing a crawler. | ||||||
|  |  | ||||||
|  |         This function must not return before all crawling is complete. To crawl | ||||||
|  |         multiple things concurrently, use Crawler.gather, which should be preferred | ||||||
|  |         over asyncio.gather in crawler code. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         pass | ||||||
|  |  | ||||||
|  |     def debug_transforms(self) -> None: | ||||||
|  |         self._output_dir.load_prev_report() | ||||||
|  |  | ||||||
|  |         if not self.prev_report: | ||||||
|  |             log.warn("Couldn't find or load old report") | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         seen: Set[PurePath] = set() | ||||||
|  |         for known in sorted(self.prev_report.found_paths): | ||||||
|  |             looking_at = list(reversed(known.parents)) + [known] | ||||||
|  |             for path in looking_at: | ||||||
|  |                 if path in seen: | ||||||
|  |                     continue | ||||||
|  |  | ||||||
|  |                 log.explain_topic(f"Transforming {fmt_path(path)}") | ||||||
|  |                 self._transformer.transform(path) | ||||||
|  |                 seen.add(path) | ||||||
							
								
								
									
										281
									
								
								PFERD/crawl/http_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										281
									
								
								PFERD/crawl/http_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,281 @@ | |||||||
|  | import asyncio | ||||||
|  | import http.cookies | ||||||
|  | import ssl | ||||||
|  | from datetime import datetime | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import Any, Dict, List, Optional, Tuple, cast | ||||||
|  |  | ||||||
|  | import aiohttp | ||||||
|  | import certifi | ||||||
|  | from aiohttp.client import ClientTimeout | ||||||
|  | from bs4 import Tag | ||||||
|  |  | ||||||
|  | from ..auth import Authenticator | ||||||
|  | from ..config import Config | ||||||
|  | from ..logging import log | ||||||
|  | from ..utils import fmt_real_path | ||||||
|  | from ..version import NAME, VERSION | ||||||
|  | from .crawler import Crawler, CrawlerSection | ||||||
|  |  | ||||||
|  | ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags" | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class HttpCrawlerSection(CrawlerSection): | ||||||
|  |     def http_timeout(self) -> float: | ||||||
|  |         return self.s.getfloat("http_timeout", fallback=30) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class HttpCrawler(Crawler): | ||||||
|  |     COOKIE_FILE = PurePath(".cookies") | ||||||
|  |  | ||||||
|  |     def __init__( | ||||||
|  |             self, | ||||||
|  |             name: str, | ||||||
|  |             section: HttpCrawlerSection, | ||||||
|  |             config: Config, | ||||||
|  |             shared_auth: Optional[Authenticator] = None, | ||||||
|  |     ) -> None: | ||||||
|  |         super().__init__(name, section, config) | ||||||
|  |  | ||||||
|  |         self._authentication_id = 0 | ||||||
|  |         self._authentication_lock = asyncio.Lock() | ||||||
|  |         self._request_count = 0 | ||||||
|  |         self._http_timeout = section.http_timeout() | ||||||
|  |  | ||||||
|  |         self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) | ||||||
|  |         self._shared_cookie_jar_paths: Optional[List[Path]] = None | ||||||
|  |         self._shared_auth = shared_auth | ||||||
|  |  | ||||||
|  |         self._output_dir.register_reserved(self.COOKIE_FILE) | ||||||
|  |  | ||||||
|  |     async def _current_auth_id(self) -> int: | ||||||
|  |         """ | ||||||
|  |         Returns the id for the current authentication, i.e. an identifier for the last | ||||||
|  |         successful call to [authenticate]. | ||||||
|  |  | ||||||
|  |         This method must be called before any request that might authenticate is made, so the | ||||||
|  |         HttpCrawler can properly track when [authenticate] can return early and when actual | ||||||
|  |         authentication is necessary. | ||||||
|  |         """ | ||||||
|  |         # We acquire the lock here to ensure we wait for any concurrent authenticate to finish. | ||||||
|  |         # This should reduce the amount of requests we make: If an authentication is in progress | ||||||
|  |         # all future requests wait for authentication to complete. | ||||||
|  |         async with self._authentication_lock: | ||||||
|  |             self._request_count += 1 | ||||||
|  |             return self._authentication_id | ||||||
|  |  | ||||||
|  |     async def authenticate(self, caller_auth_id: int) -> None: | ||||||
|  |         """ | ||||||
|  |         Starts the authentication process. The main work is offloaded to _authenticate, which | ||||||
|  |         you should override in a subclass if needed. This method should *NOT* be overridden. | ||||||
|  |  | ||||||
|  |         The [caller_auth_id] should be the result of a [_current_auth_id] call made *before* | ||||||
|  |         the request was made. This ensures that authentication is not performed needlessly. | ||||||
|  |         """ | ||||||
|  |         async with self._authentication_lock: | ||||||
|  |             log.explain_topic("Authenticating") | ||||||
|  |             # Another task successfully called authenticate in the meantime. | ||||||
|  |             # We do not want to perform auth again, so we return here. We can | ||||||
|  |             # assume the other task succeeded, as authenticate throws an error | ||||||
|  |             # and aborts the crawl process if it fails. | ||||||
|  |             if caller_auth_id != self._authentication_id: | ||||||
|  |                 log.explain( | ||||||
|  |                     "Authentication skipped due to auth id mismatch." | ||||||
|  |                     "A previous authentication beat us to the race." | ||||||
|  |                 ) | ||||||
|  |                 return | ||||||
|  |             log.explain("Calling crawler-specific authenticate") | ||||||
|  |             await self._authenticate() | ||||||
|  |             self._authentication_id += 1 | ||||||
|  |             # Saving the cookies after the first auth ensures we won't need to re-authenticate | ||||||
|  |             # on the next run, should this one be aborted or crash | ||||||
|  |             self._save_cookies() | ||||||
|  |  | ||||||
|  |     async def _authenticate(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Performs authentication. This method must only return normally if authentication succeeded. | ||||||
|  |         In all other cases it must either retry internally or throw a terminal exception. | ||||||
|  |         """ | ||||||
|  |         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation") | ||||||
|  |  | ||||||
|  |     def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None: | ||||||
|  |         if not self._shared_auth: | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         if self._shared_auth in shared: | ||||||
|  |             self._shared_cookie_jar_paths = shared[self._shared_auth] | ||||||
|  |         else: | ||||||
|  |             self._shared_cookie_jar_paths = [] | ||||||
|  |             shared[self._shared_auth] = self._shared_cookie_jar_paths | ||||||
|  |  | ||||||
|  |         self._shared_cookie_jar_paths.append(self._cookie_jar_path) | ||||||
|  |  | ||||||
|  |     def _load_cookies_from_file(self, path: Path) -> None: | ||||||
|  |         jar: Any = http.cookies.SimpleCookie() | ||||||
|  |         with open(path, encoding="utf-8") as f: | ||||||
|  |             for i, line in enumerate(f): | ||||||
|  |                 # Names of headers are case insensitive | ||||||
|  |                 if line[:11].lower() == "set-cookie:": | ||||||
|  |                     jar.load(line[11:]) | ||||||
|  |                 else: | ||||||
|  |                     log.explain(f"Line {i} doesn't start with 'Set-Cookie:', ignoring it") | ||||||
|  |         self._cookie_jar.update_cookies(jar) | ||||||
|  |  | ||||||
|  |     def _save_cookies_to_file(self, path: Path) -> None: | ||||||
|  |         jar: Any = http.cookies.SimpleCookie() | ||||||
|  |         for morsel in self._cookie_jar: | ||||||
|  |             jar[morsel.key] = morsel | ||||||
|  |         with open(path, "w", encoding="utf-8") as f: | ||||||
|  |             f.write(jar.output(sep="\n")) | ||||||
|  |             f.write("\n")  # A trailing newline is just common courtesy | ||||||
|  |  | ||||||
|  |     def _load_cookies(self) -> None: | ||||||
|  |         log.explain_topic("Loading cookies") | ||||||
|  |  | ||||||
|  |         cookie_jar_path: Optional[Path] = None | ||||||
|  |  | ||||||
|  |         if self._shared_cookie_jar_paths is None: | ||||||
|  |             log.explain("Not sharing any cookies") | ||||||
|  |             cookie_jar_path = self._cookie_jar_path | ||||||
|  |         else: | ||||||
|  |             log.explain("Sharing cookies") | ||||||
|  |             max_mtime: Optional[float] = None | ||||||
|  |             for path in self._shared_cookie_jar_paths: | ||||||
|  |                 if not path.is_file(): | ||||||
|  |                     log.explain(f"{fmt_real_path(path)} is not a file") | ||||||
|  |                     continue | ||||||
|  |                 mtime = path.stat().st_mtime | ||||||
|  |                 if max_mtime is None or mtime > max_mtime: | ||||||
|  |                     log.explain(f"{fmt_real_path(path)} has newest mtime so far") | ||||||
|  |                     max_mtime = mtime | ||||||
|  |                     cookie_jar_path = path | ||||||
|  |                 else: | ||||||
|  |                     log.explain(f"{fmt_real_path(path)} has older mtime") | ||||||
|  |  | ||||||
|  |         if cookie_jar_path is None: | ||||||
|  |             log.explain("Couldn't find a suitable cookie file") | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}") | ||||||
|  |         try: | ||||||
|  |             self._load_cookies_from_file(cookie_jar_path) | ||||||
|  |         except Exception as e: | ||||||
|  |             log.explain("Failed to load cookies") | ||||||
|  |             log.explain(str(e)) | ||||||
|  |  | ||||||
|  |     def _save_cookies(self) -> None: | ||||||
|  |         log.explain_topic("Saving cookies") | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}") | ||||||
|  |             self._save_cookies_to_file(self._cookie_jar_path) | ||||||
|  |         except Exception as e: | ||||||
|  |             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}") | ||||||
|  |             log.warn(str(e)) | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     def get_folder_structure_from_heading_hierarchy(file_link: Tag, drop_h1: bool = False) -> PurePath: | ||||||
|  |         """ | ||||||
|  |         Retrieves the hierarchy of headings associated with the given file link and constructs a folder | ||||||
|  |         structure from them. | ||||||
|  |  | ||||||
|  |         <h1> level headings usually only appear once and serve as the page title, so they would introduce | ||||||
|  |         redundant nesting. To avoid this, <h1> headings can be ignored by setting the drop_h1 parameter to True. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         def find_associated_headings(tag: Tag, level: int) -> PurePath: | ||||||
|  |             if level == 0 or (level == 1 and drop_h1): | ||||||
|  |                 return PurePath() | ||||||
|  |  | ||||||
|  |             level_heading = cast(Optional[Tag], tag.find_previous(name=f"h{level}")) | ||||||
|  |  | ||||||
|  |             if level_heading is None: | ||||||
|  |                 return find_associated_headings(tag, level - 1) | ||||||
|  |  | ||||||
|  |             folder_name = level_heading.get_text().strip() | ||||||
|  |             return find_associated_headings(level_heading, level - 1) / folder_name | ||||||
|  |  | ||||||
|  |         # start at level <h3> because paragraph-level headings are usually too granular for folder names | ||||||
|  |         return find_associated_headings(file_link, 3) | ||||||
|  |  | ||||||
|  |     def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]: | ||||||
|  |         """ | ||||||
|  |         If available, retrieves the entity tag for a given path which was stored in the previous report. | ||||||
|  |         """ | ||||||
|  |         if not self._output_dir.prev_report: | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         etags = self._output_dir.prev_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {} | ||||||
|  |         return etags.get(str(path)) | ||||||
|  |  | ||||||
|  |     def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None: | ||||||
|  |         """ | ||||||
|  |         Adds an entity tag for a given path to the report's custom values. | ||||||
|  |         """ | ||||||
|  |         if not etag: | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         etags = self._output_dir.report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {} | ||||||
|  |         etags[str(path)] = etag | ||||||
|  |         self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags) | ||||||
|  |  | ||||||
|  |     async def _request_resource_version(self, resource_url: str) -> Tuple[Optional[str], Optional[datetime]]: | ||||||
|  |         """ | ||||||
|  |         Requests the ETag and Last-Modified headers of a resource via a HEAD request. | ||||||
|  |         If no entity tag / modification date can be obtained, the corresponding value will be None. | ||||||
|  |         """ | ||||||
|  |         try: | ||||||
|  |             async with self.session.head(resource_url) as resp: | ||||||
|  |                 if resp.status != 200: | ||||||
|  |                     return None, None | ||||||
|  |  | ||||||
|  |                 etag_header = resp.headers.get("ETag") | ||||||
|  |                 last_modified_header = resp.headers.get("Last-Modified") | ||||||
|  |                 last_modified = None | ||||||
|  |  | ||||||
|  |                 if last_modified_header: | ||||||
|  |                     try: | ||||||
|  |                         # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives | ||||||
|  |                         datetime_format = "%a, %d %b %Y %H:%M:%S GMT" | ||||||
|  |                         last_modified = datetime.strptime(last_modified_header, datetime_format) | ||||||
|  |                     except ValueError: | ||||||
|  |                         # last_modified remains None | ||||||
|  |                         pass | ||||||
|  |  | ||||||
|  |                 return etag_header, last_modified | ||||||
|  |         except aiohttp.ClientError: | ||||||
|  |             return None, None | ||||||
|  |  | ||||||
|  |     async def run(self) -> None: | ||||||
|  |         self._request_count = 0 | ||||||
|  |         self._cookie_jar = aiohttp.CookieJar() | ||||||
|  |         self._load_cookies() | ||||||
|  |  | ||||||
|  |         async with aiohttp.ClientSession( | ||||||
|  |                 headers={"User-Agent": f"{NAME}/{VERSION}"}, | ||||||
|  |                 cookie_jar=self._cookie_jar, | ||||||
|  |                 connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())), | ||||||
|  |                 timeout=ClientTimeout( | ||||||
|  |                     # 15 minutes (15 * 60 seconds) in total. NOTE: this comment used to say 30 minutes, | ||||||
|  |                     # which is enough to transfer a 600 MB file over a 3 Mib/s connection; with 15 minutes | ||||||
|  |                     # the same file needs roughly 5.5 Mib/s. TODO: confirm which limit is intended. | ||||||
|  |                     # Allowing an arbitrary value could be annoying for overnight batch jobs | ||||||
|  |                     total=15 * 60, | ||||||
|  |                     connect=self._http_timeout, | ||||||
|  |                     sock_connect=self._http_timeout, | ||||||
|  |                     sock_read=self._http_timeout, | ||||||
|  |                 ), | ||||||
|  |                 # See https://github.com/aio-libs/aiohttp/issues/6626 | ||||||
|  |                 # Without this aiohttp will mangle the redirect header from Shibboleth, invalidating the | ||||||
|  |                 # passed signature. Shibboleth will not accept the broken signature and authentication will | ||||||
|  |                 # fail. | ||||||
|  |                 requote_redirect_url=False | ||||||
|  |         ) as session: | ||||||
|  |             self.session = session | ||||||
|  |             try: | ||||||
|  |                 await super().run() | ||||||
|  |             finally: | ||||||
|  |                 del self.session | ||||||
|  |         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}") | ||||||
|  |  | ||||||
|  |         # They are saved in authenticate, but a final save won't hurt | ||||||
|  |         self._save_cookies() | ||||||
							
								
								
									
										9
									
								
								PFERD/crawl/ilias/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								PFERD/crawl/ilias/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,9 @@ | |||||||
|  | from .kit_ilias_web_crawler import (IliasWebCrawler, IliasWebCrawlerSection, KitIliasWebCrawler, | ||||||
|  |                                     KitIliasWebCrawlerSection) | ||||||
|  |  | ||||||
|  | __all__ = [ | ||||||
|  |     "IliasWebCrawler", | ||||||
|  |     "IliasWebCrawlerSection", | ||||||
|  |     "KitIliasWebCrawler", | ||||||
|  |     "KitIliasWebCrawlerSection", | ||||||
|  | ] | ||||||
							
								
								
									
										40
									
								
								PFERD/crawl/ilias/async_helper.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								PFERD/crawl/ilias/async_helper.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,40 @@ | |||||||
|  | import asyncio | ||||||
|  | from typing import Any, Callable, Optional | ||||||
|  |  | ||||||
|  | import aiohttp | ||||||
|  |  | ||||||
|  | from ...logging import log | ||||||
|  | from ..crawler import AWrapped, CrawlError, CrawlWarning | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: | ||||||
|  |     def decorator(f: AWrapped) -> AWrapped: | ||||||
|  |         async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: | ||||||
|  |             last_exception: Optional[BaseException] = None | ||||||
|  |             for round in range(attempts): | ||||||
|  |                 try: | ||||||
|  |                     return await f(*args, **kwargs) | ||||||
|  |                 except aiohttp.ContentTypeError:  # invalid content type | ||||||
|  |                     raise CrawlWarning("ILIAS returned an invalid content type") | ||||||
|  |                 except aiohttp.TooManyRedirects: | ||||||
|  |                     raise CrawlWarning("Got stuck in a redirect loop") | ||||||
|  |                 except aiohttp.ClientPayloadError as e:  # encoding or not enough bytes | ||||||
|  |                     last_exception = e | ||||||
|  |                 except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc. | ||||||
|  |                     last_exception = e | ||||||
|  |                 except asyncio.exceptions.TimeoutError as e:  # explicit http timeouts in HttpCrawler | ||||||
|  |                     last_exception = e | ||||||
|  |                 log.explain_topic(f"Retrying operation {name}. Retries left: {attempts - 1 - round}") | ||||||
|  |                 log.explain(f"Last exception: {last_exception!r}") | ||||||
|  |  | ||||||
|  |             if last_exception: | ||||||
|  |                 message = f"Error in I/O Operation: {last_exception!r}" | ||||||
|  |                 if failure_is_error: | ||||||
|  |                     raise CrawlError(message) from last_exception | ||||||
|  |                 else: | ||||||
|  |                     raise CrawlWarning(message) from last_exception | ||||||
|  |             raise CrawlError("Impossible return in ilias _iorepeat") | ||||||
|  |  | ||||||
|  |         return wrapper  # type: ignore | ||||||
|  |  | ||||||
|  |     return decorator | ||||||
							
								
								
									
										292
									
								
								PFERD/crawl/ilias/file_templates.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										292
									
								
								PFERD/crawl/ilias/file_templates.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,292 @@ | |||||||
|  | from enum import Enum | ||||||
|  | from typing import Optional, cast | ||||||
|  |  | ||||||
|  | import bs4 | ||||||
|  |  | ||||||
|  | from PFERD.utils import soupify | ||||||
|  |  | ||||||
|  | _link_template_plain = "{{link}}" | ||||||
|  | _link_template_fancy = """ | ||||||
|  | <!DOCTYPE html> | ||||||
|  | <html lang="en"> | ||||||
|  |     <head> | ||||||
|  |         <meta charset="UTF-8"> | ||||||
|  |         <title>ILIAS - Link: {{name}}</title> | ||||||
|  |         <meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" /> | ||||||
|  |     </head> | ||||||
|  |  | ||||||
|  |     <style> | ||||||
|  |     * { | ||||||
|  |         box-sizing: border-box; | ||||||
|  |     } | ||||||
|  |     .center-flex { | ||||||
|  |         display: flex; | ||||||
|  |         align-items: center; | ||||||
|  |         justify-content: center; | ||||||
|  |     } | ||||||
|  |     body { | ||||||
|  |         padding: 0; | ||||||
|  |         margin: 0; | ||||||
|  |         background-color: #f0f0f0; | ||||||
|  |         font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif; | ||||||
|  |         height: 100vh; | ||||||
|  |     } | ||||||
|  |     .row { | ||||||
|  |         background-color: white; | ||||||
|  |         min-width: 500px; | ||||||
|  |         max-width: 90vw; | ||||||
|  |         display: flex; | ||||||
|  |         padding: 1em; | ||||||
|  |     } | ||||||
|  |     .logo { | ||||||
|  |         flex: 0 1; | ||||||
|  |         margin-right: 1em; | ||||||
|  |         fill: #009682; | ||||||
|  |     } | ||||||
|  |     .tile { | ||||||
|  |         flex: 1 0; | ||||||
|  |         display: flex; | ||||||
|  |         flex-direction: column; | ||||||
|  |         justify-content: center; | ||||||
|  |     } | ||||||
|  |     .top-row { | ||||||
|  |         padding-bottom: 5px; | ||||||
|  |         font-size: 15px; | ||||||
|  |     } | ||||||
|  |     a { | ||||||
|  |         color: #009682; | ||||||
|  |         text-decoration: none; | ||||||
|  |     } | ||||||
|  |     a:hover { | ||||||
|  |         text-decoration: underline; | ||||||
|  |     } | ||||||
|  |     .bottom-row { | ||||||
|  |         font-size: 13px; | ||||||
|  |     } | ||||||
|  |     .menu-button { | ||||||
|  |         border: 1px solid black; | ||||||
|  |         margin-left: 4em; | ||||||
|  |         width: 25px; | ||||||
|  |         height: 25px; | ||||||
|  |         flex: 0 0 25px; | ||||||
|  |         background-color: #b3e0da; | ||||||
|  |         font-size: 13px; | ||||||
|  |         color: #222; | ||||||
|  |     } | ||||||
|  |     </style> | ||||||
|  |     <body class="center-flex"> | ||||||
|  |         <div class="row"> | ||||||
|  |             <div class="logo center-flex"> | ||||||
|  |                 <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24"> | ||||||
|  |                     <path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 
.225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/> | ||||||
|  |                 </svg> | ||||||
|  |             </div> | ||||||
|  |             <div class="tile"> | ||||||
|  |                 <div class="top-row"> | ||||||
|  |                     <a href="{{link}}">{{name}}</a> | ||||||
|  |                 </div> | ||||||
|  |                 <div class="bottom-row">{{description}}</div> | ||||||
|  |             </div> | ||||||
|  |             <div class="menu-button center-flex"> ⯆ </div> | ||||||
|  |         </div> | ||||||
|  |     </body> | ||||||
|  | </html> | ||||||
|  | """.strip()  # noqa: E501 line too long | ||||||
|  |  | ||||||
# Template for ".url" internet shortcut files (see Links.extension / Links.template).
# "{{link}}" is a placeholder; the code that substitutes it is not part of this
# section of the file.
_link_template_internet_shortcut = """
[InternetShortcut]
URL={{link}}
""".strip()
|  |  | ||||||
# HTML skeleton used by learning_module_template(): "{{body}}" is replaced by the
# prettified page body and "{{name}}" by the page title.
# NOTE(review): the <style> element sits between </head> and <body> rather than
# inside <head>; unlike the other templates this string is not .strip()-ed.
_learning_module_template = """
<!DOCTYPE html>
<html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>{{name}}</title>
    </head>

    <style>
    * {
        box-sizing: border-box;
    }
    .center-flex {
        display: flex;
        align-items: center;
        justify-content: center;
    }
    .nav {
        display: flex;
        justify-content: space-between;
    }
    </style>
    <body class="center-flex">
{{body}}
    </body>
</html>
"""
|  |  | ||||||
# HTML skeleton used by forum_thread_template(): "{{name}}" goes into the page
# title, "{{heading}}" and "{{content}}" are replaced by prettified markup.
# The CSS targets the ILIAS forum class names (ilFrmPost*, ilUserIcon, ...).
_forum_thread_template = """
<!DOCTYPE html>
<html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>ILIAS - Forum: {{name}}</title>
        <style>
            * {
                box-sizing: border-box;
            }
            body {
                font-family: 'Open Sans', Verdana, Arial, Helvetica, sans-serif;
                padding: 8px;
            }
            ul, ol, p {
                margin: 1.2em 0;
            }
            p {
                margin-top: 8px;
                margin-bottom: 8px;
            }
            a {
                color: #00876c;
                text-decoration: none;
                cursor: pointer;
            }
            a:hover {
                text-decoration: underline;
            }
            body > p:first-child > span:first-child {
                font-size: 1.6em;
            }
            body > p:first-child > span:first-child ~ span.default {
                display: inline-block;
                font-size: 1.2em;
                padding-bottom: 8px;
            }
            .ilFrmPostContent {
                margin-top: 8px;
                max-width: 64em;
            }
            .ilFrmPostContent > *:first-child {
                margin-top: 0px;
            }
            .ilFrmPostTitle {
                margin-top: 24px;
                color: #00876c;
                font-weight: bold;
            }
            #ilFrmPostList {
                list-style: none;
                padding-left: 0;
            }
            li.ilFrmPostRow {
                padding: 3px 0 3px 3px;
                margin-bottom: 24px;
                border-left: 6px solid #dddddd;
            }
            .ilFrmPostRow > div {
                display: flex;
            }
            .ilFrmPostImage img {
                margin: 0 !important;
                padding: 6px 9px 9px 6px;
            }
            .ilUserIcon {
                width: 115px;
            }
            .small {
                text-decoration: none;
                font-size: 0.75rem;
                color: #6f6f6f;
            }
        </style>
    </head>
    <body>
    {{heading}}
    {{content}}
    </body>
</html>
""".strip()  # noqa: E501 line too long
|  |  | ||||||
|  |  | ||||||
def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str:
    """
    Render a learning module page as a standalone HTML document.

    The passed ``body`` is modified in place: fullscreen-modal elements are
    removed and the top/bottom navigation bars are replaced by a simple
    two-link bar pointing at ``prev`` and ``next``.

    :param body: the page content; mutated by this function
    :param name: plain-text page title; HTML-escaped before insertion
    :param prev: link target for the "previous" link, or None for no link
    :param next: link target for the "next" link, or None for no link
        (the name shadows the builtin, but is part of the public signature)
    :return: the complete HTML document as a string
    """
    # Function-scope imports so this block does not depend on the file header.
    import html
    import re

    # Seems to be comments, ignore those.
    for elem in body.select(".il-copg-mob-fullscreen-modal"):
        elem.decompose()

    def nav_link(target: Optional[str], selector: str) -> str:
        # A link is only rendered if we have a target AND the original page had
        # a matching navigation element to take the label from.
        label_source = body.select_one(selector)
        if not target or label_source is None:
            return "<span></span>"
        # get_text() yields decoded text, so it must be re-escaped before it is
        # embedded in markup again; the target goes into a quoted attribute.
        label = html.escape(label_source.get_text().strip())
        return f'<a href="{html.escape(str(target))}">{label}</a>'

    # Labels must be read before the navigation bars are replaced below.
    left = nav_link(prev, ".ilc_page_lnav_LeftNavigation")
    right = nav_link(next, ".ilc_page_rnav_RightNavigation")

    nav_html = f"""
        <div class="nav">
            {left}
            {right}
        </div>
    """

    for selector in (".ilc_page_tnav_TopNavigation", ".ilc_page_bnav_BottomNavigation"):
        nav = body.select_one(selector)
        if nav is not None:
            # Parse a fresh fragment for every replacement, a node can only
            # live at one position in the tree.
            nav.replace_with(soupify(nav_html.encode()))

    substitutions = {
        "body": cast(str, body.prettify()),
        "name": html.escape(name),
    }
    # Single pass over the template: text inside the substituted body that
    # happens to look like a placeholder is left untouched.
    return re.sub(
        r"\{\{(body|name)\}\}",
        lambda match: substitutions[match.group(1)],
        _learning_module_template,
    )
|  |  | ||||||
|  |  | ||||||
def forum_thread_template(name: str, url: str, heading: bs4.Tag, content: bs4.Tag) -> str:
    """
    Render a forum thread as a standalone HTML document.

    If ``heading`` contains a ``<b>`` element, that element is wrapped in a
    link to ``url`` (``heading`` is modified in place).

    :param name: plain-text thread name; HTML-escaped and put into the title
    :param url: link target for the thread title
    :param heading: markup rendered at the top of the page
    :param content: markup rendered below the heading
    :return: the complete HTML document as a string
    """
    # Function-scope imports so this block does not depend on the file header.
    import html
    import re

    if title := cast(Optional[bs4.Tag], heading.find(name="b")):
        title.wrap(bs4.Tag(name="a", attrs={"href": url}))

    substitutions = {
        "name": html.escape(name),
        "heading": cast(str, heading.prettify()),
        "content": cast(str, content.prettify()),
    }
    # Single pass over the template: user-written heading/content text that
    # happens to contain "{{content}}" or similar is not substituted again.
    return re.sub(
        r"\{\{(name|heading|content)\}\}",
        lambda match: substitutions[match.group(1)],
        _forum_thread_template,
    )
|  |  | ||||||
|  |  | ||||||
class Links(Enum):
    """
    How link elements should be stored.

    Each member's value is the string accepted by :meth:`from_string`.
    """

    IGNORE = "ignore"
    PLAINTEXT = "plaintext"
    FANCY = "fancy"
    INTERNET_SHORTCUT = "internet-shortcut"

    def template(self) -> Optional[str]:
        """
        Return the file template for this link kind, or None for IGNORE.

        :raises ValueError: if a member has no branch here
        """
        if self == Links.FANCY:
            return _link_template_fancy
        elif self == Links.PLAINTEXT:
            return _link_template_plain
        elif self == Links.INTERNET_SHORTCUT:
            return _link_template_internet_shortcut
        elif self == Links.IGNORE:
            return None
        raise ValueError("Missing switch case")

    def extension(self) -> Optional[str]:
        """
        Return the file extension (including the dot) for this link kind, or
        None for IGNORE.

        :raises ValueError: if a member has no branch here
        """
        if self == Links.FANCY:
            return ".html"
        elif self == Links.PLAINTEXT:
            return ".txt"
        elif self == Links.INTERNET_SHORTCUT:
            return ".url"
        elif self == Links.IGNORE:
            return None
        raise ValueError("Missing switch case")

    @staticmethod
    def from_string(string: str) -> "Links":
        """
        Parse a member from its string value.

        :raises ValueError: if ``string`` is not a valid value; the message
            lists the accepted values
        """
        try:
            return Links(string)
        except ValueError:
            # Derive the list from the members so the message can not drift
            # from the accepted values (it used to advertise 'html', which is
            # not accepted; the value is 'fancy').
            valid = ", ".join(repr(link.value) for link in Links)
            raise ValueError(f"must be one of {valid}") from None
							
								
								
									
										108
									
								
								PFERD/crawl/ilias/ilias_html_cleaner.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										108
									
								
								PFERD/crawl/ilias/ilias_html_cleaner.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,108 @@ | |||||||
|  | from typing import cast | ||||||
|  |  | ||||||
|  | from bs4 import BeautifulSoup, Comment, Tag | ||||||
|  |  | ||||||
# CSS that insert_base_markup() places into a <style> tag inside <head>.
# It styles ILIAS content classes (ilc_*) and the helper classes/tags produced
# by clean() (e.g. "accordion-head", <article>, <h3>).
_STYLE_TAG_CONTENT = """
    .ilc_text_block_Information {
      background-color: #f5f7fa;
    }
    div.ilc_text_block_Standard {
      margin-bottom: 10px;
      margin-top: 10px;
    }
    span.ilc_text_inline_Strong {
      font-weight: bold;
    }

    .row-flex {
      display: flex;
    }
    .row-flex-wrap {
      flex-wrap: wrap;
    }

    .accordion-head {
      background-color: #f5f7fa;
      padding: 0.5rem 0;
    }

    h3 {
      margin-top: 0.5rem;
      margin-bottom: 1rem;
    }

    br.visible-break {
      margin-bottom: 1rem;
    }

    article {
      margin: 0.5rem 0;
    }

    img {
        background-color: white;
    }

    body {
      padding: 1em;
      grid-template-columns: 1fr min(60rem, 90%) 1fr;
      line-height: 1.2;
    }
"""

# Elements carrying one of these classes are renamed to <article> by clean().
_ARTICLE_WORTHY_CLASSES = [
    "ilc_text_block_Information",
    "ilc_section_Attention",
    "ilc_section_Link",
]
|  |  | ||||||
|  |  | ||||||
def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Insert a <head> element as the first child of ``soup`` and fill it with a
    UTF-8 charset declaration, a link to the simple.css stylesheet and an
    inline <style> tag holding ``_STYLE_TAG_CONTENT``.

    The soup is modified in place and returned for convenience.
    """
    head_tag = soup.new_tag("head")
    soup.insert(0, head_tag)

    # Force UTF-8 encoding
    charset_meta = soup.new_tag("meta", charset="utf-8")

    # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
    stylesheet_link = soup.new_tag("link", rel="stylesheet", href="https://cdn.simplecss.org/simple.css")

    # Basic style tags for compat
    inline_style: Tag = soup.new_tag("style")
    inline_style.append(_STYLE_TAG_CONTENT)

    for child in (charset_meta, stylesheet_link, inline_style):
        head_tag.append(child)

    return soup
|  |  | ||||||
|  |  | ||||||
def clean(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Simplify ILIAS page markup in place and return the same soup.

    Steps, in order:
      * elements with a class from ``_ARTICLE_WORTHY_CLASSES`` become <article>
      * existing <h3> become <div>, then <h1> become <h3>
      * accordion captions become <h3> with the extra class "accordion-head"
      * standard paragraphs that are empty or contain only a comment are removed
      * <figure> elements around highlighted-media videos are removed
      * an <hr> is inserted at the start of every separator section
    """
    for block in cast(list[Tag], soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES)):
        block.name = "article"

    # Demote existing h3 first, so the h1 -> h3 rename below does not get
    # demoted again.
    for block in cast(list[Tag], soup.find_all("h3")):
        block.name = "div"

    for block in cast(list[Tag], soup.find_all("h1")):
        block.name = "h3"

    for block in cast(list[Tag], soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap")):
        block.name = "h3"
        block["class"] += ["accordion-head"]  # type: ignore

    for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"):
        children = list(dummy.children)
        # Remove placeholder paragraphs: no children at all, or exactly one
        # child that is an HTML comment. The instance itself must be checked
        # (not its type), and the empty case must not fall through to
        # children[0].
        if not children or (len(children) == 1 and isinstance(children[0], Comment)):
            dummy.decompose()

    # Delete video figures, as they can not be internalized anyway
    for video in soup.select(".ilc_media_cont_MediaContainerHighlighted .ilPageVideo"):
        if figure := video.find_parent("figure"):
            figure.decompose()

    for hrule_imposter in cast(list[Tag], soup.find_all(class_="ilc_section_Separator")):
        hrule_imposter.insert(0, soup.new_tag("hr"))

    return soup
							
								
								
									
										1061
									
								
								PFERD/crawl/ilias/ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1061
									
								
								PFERD/crawl/ilias/ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										1609
									
								
								PFERD/crawl/ilias/kit_ilias_html.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1609
									
								
								PFERD/crawl/ilias/kit_ilias_html.py
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										37
									
								
								PFERD/crawl/ilias/kit_ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								PFERD/crawl/ilias/kit_ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,37 @@ | |||||||
|  | from typing import Dict, Literal | ||||||
|  |  | ||||||
|  | from ...auth import Authenticator | ||||||
|  | from ...config import Config | ||||||
|  | from .ilias_web_crawler import IliasWebCrawler, IliasWebCrawlerSection | ||||||
|  | from .shibboleth_login import ShibbolethLogin | ||||||
|  |  | ||||||
# Base URL of the KIT ILIAS instance; returned by KitIliasWebCrawlerSection.base_url()
# and passed to ShibbolethLogin by KitIliasWebCrawler.
_ILIAS_URL = "https://ilias.studium.kit.edu"
|  |  | ||||||
|  |  | ||||||
class KitShibbolethBackgroundLoginSuccessful:
    """
    Empty marker type without any members.

    NOTE(review): not referenced in the visible part of this file; presumably
    signals that a Shibboleth login succeeded in the background - confirm.
    """
|  |  | ||||||
|  |  | ||||||
class KitIliasWebCrawlerSection(IliasWebCrawlerSection):
    """Crawler section with the base URL and login method fixed for KIT."""

    def base_url(self) -> str:
        """Return the KIT ILIAS base URL (``_ILIAS_URL``)."""
        url: str = _ILIAS_URL
        return url

    def login(self) -> Literal["shibboleth"]:
        """Return the login method, which is always ``"shibboleth"``."""
        method: Literal["shibboleth"] = "shibboleth"
        return method
|  |  | ||||||
|  |  | ||||||
class KitIliasWebCrawler(IliasWebCrawler):
    """ILIAS web crawler that logs in to the KIT instance via Shibboleth."""

    def __init__(
        self,
        name: str,
        section: KitIliasWebCrawlerSection,
        config: Config,
        authenticators: Dict[str, Authenticator],
    ):
        """
        Initialize the base crawler, then set up the Shibboleth login helper
        with the KIT ILIAS URL, the crawler's authenticator and the section's
        two-factor authenticator.
        """
        super().__init__(name, section, config, authenticators)

        # self._auth is expected to be set by the base class initializer.
        primary_auth = self._auth
        tfa_auth = section.tfa_auth(authenticators)
        self._shibboleth_login = ShibbolethLogin(_ILIAS_URL, primary_auth, tfa_auth)
							
								
								
									
										129
									
								
								PFERD/crawl/ilias/shibboleth_login.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										129
									
								
								PFERD/crawl/ilias/shibboleth_login.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,129 @@ | |||||||
|  | from typing import Any, Optional, cast | ||||||
|  |  | ||||||
|  | import aiohttp | ||||||
|  | import yarl | ||||||
|  | from bs4 import BeautifulSoup, Tag | ||||||
|  |  | ||||||
|  | from ...auth import Authenticator, TfaAuthenticator | ||||||
|  | from ...logging import log | ||||||
|  | from ...utils import soupify | ||||||
|  | from ..crawler import CrawlError | ||||||
|  |  | ||||||
|  |  | ||||||
class ShibbolethLogin:
    """
    Login via shibboleth system.
    """

    def __init__(
        self, ilias_url: str, authenticator: Authenticator, tfa_authenticator: Optional[Authenticator]
    ) -> None:
        """
        :param ilias_url: base URL of the ILIAS instance (without trailing slash)
        :param authenticator: provides username and password
        :param tfa_authenticator: provides the two-factor token; if None, a
            ``TfaAuthenticator`` is created on demand
        """
        self._ilias_url = ilias_url
        self._auth = authenticator
        self._tfa_auth = tfa_authenticator

    async def login(self, sess: aiohttp.ClientSession) -> None:
        """
        Performs the ILIAS Shibboleth authentication dance and saves the login
        cookies it receives.

        This function should only be called whenever it is detected that you're
        not logged in. The cookies obtained should be good for a few minutes,
        maybe even an hour or two.

        :raises CrawlError: if an expected login form is missing or the
            Shibboleth entitlements need to be reviewed in a browser
        """

        # Equivalent: Click on "Mit KIT-Account anmelden" button in
        # https://ilias.studium.kit.edu/login.php
        url = f"{self._ilias_url}/shib_login.php"
        async with sess.get(url) as response:
            shib_url = response.url
            if str(shib_url).startswith(self._ilias_url):
                log.explain(
                    "ILIAS recognized our shib token and logged us in in the background, returning"
                )
                return
            soup: BeautifulSoup = soupify(await response.read())

        # Attempt to login using credentials, if necessary
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
            form = self._find_post_form(soup)
            action = cast(str, form["action"])

            # Equivalent: Enter credentials in
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
            url = str(shib_url.origin()) + action
            username, password = await self._auth.credentials()
            data = {
                "_eventId_proceed": "",
                "j_username": username,
                "j_password": password,
                "fudis_web_authn_assertion_input": "",
            }
            if csrf_token_input := form.find("input", {"name": "csrf_token"}):
                data["csrf_token"] = csrf_token_input["value"]  # type: ignore
            soup = await _post(sess, url, data)

            if soup.find(id="attributeRelease"):
                raise CrawlError(
                    "ILIAS Shibboleth entitlements changed! "
                    "Please log in once in your browser and review them"
                )

            if self._tfa_required(soup):
                soup = await self._authenticate_tfa(sess, soup, shib_url)

            if not self._login_successful(soup):
                self._auth.invalidate_credentials()

        # Equivalent: Being redirected via JS automatically
        # (or clicking "Continue" if you have JS disabled)
        relay_state = cast(Tag, soup.find("input", {"name": "RelayState"}))
        saml_response = cast(Tag, soup.find("input", {"name": "SAMLResponse"}))
        url = cast(str, self._find_post_form(soup)["action"])
        data = {  # using the info obtained in the while loop above
            "RelayState": cast(str, relay_state["value"]),
            "SAMLResponse": cast(str, saml_response["value"]),
        }
        # Only the cookies set by this request matter; use the context manager
        # so the response (and its connection) is released again.
        async with sess.post(url, data=data):
            pass

    async def _authenticate_tfa(
        self, session: aiohttp.ClientSession, soup: BeautifulSoup, shib_url: yarl.URL
    ) -> BeautifulSoup:
        """
        Submit a two-factor token to the form found in ``soup``.

        :return: the parsed response page
        :raises CrawlError: if the page contains no POST form
        """
        if not self._tfa_auth:
            self._tfa_auth = TfaAuthenticator("ilias-anon-tfa")

        # Searching the form here so that this fails before asking for
        # the token rather than after asking.
        form = self._find_post_form(soup)
        action = cast(str, form["action"])

        tfa_token = await self._tfa_auth.password()

        # Equivalent: Enter token in
        # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
        url = str(shib_url.origin()) + action
        data = {
            "_eventId_proceed": "",
            "fudis_otp_input": tfa_token,
        }
        if csrf_token_input := form.find("input", {"name": "csrf_token"}):
            data["csrf_token"] = csrf_token_input["value"]  # type: ignore
        return await _post(session, url, data)

    @staticmethod
    def _find_post_form(soup: BeautifulSoup) -> Tag:
        """
        Return the first ``<form method="post">`` in ``soup``.

        :raises CrawlError: if there is no such form
        """
        form = soup.find("form", {"method": "post"})
        if not isinstance(form, Tag):
            raise CrawlError("Could not find the login form on the Shibboleth page")
        return form

    @staticmethod
    def _login_successful(soup: BeautifulSoup) -> bool:
        """Whether the page carries both the RelayState and SAMLResponse inputs."""
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        return relay_state is not None and saml_response is not None

    @staticmethod
    def _tfa_required(soup: BeautifulSoup) -> bool:
        """Whether the page contains the two-factor form (id "fudiscr-form")."""
        return soup.find(id="fudiscr-form") is not None
|  |  | ||||||
|  |  | ||||||
async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
    """POST ``data`` to ``url`` and return the response body parsed by ``soupify``."""
    async with session.post(url, data=data) as response:
        raw_body = await response.read()
        parsed = soupify(raw_body)
        return parsed
							
								
								
									
										188
									
								
								PFERD/crawl/kit_ipd_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										188
									
								
								PFERD/crawl/kit_ipd_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,188 @@ | |||||||
|  | import os | ||||||
|  | import re | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from datetime import datetime | ||||||
|  | from pathlib import PurePath | ||||||
|  | from typing import Any, Awaitable, Generator, Iterable, List, Optional, Pattern, Tuple, Union, cast | ||||||
|  | from urllib.parse import urljoin | ||||||
|  |  | ||||||
|  | from bs4 import BeautifulSoup, Tag | ||||||
|  |  | ||||||
|  | from ..config import Config | ||||||
|  | from ..logging import ProgressBar, log | ||||||
|  | from ..output_dir import FileSink | ||||||
|  | from ..utils import soupify | ||||||
|  | from .crawler import CrawlError | ||||||
|  | from .http_crawler import HttpCrawler, HttpCrawlerSection | ||||||
|  |  | ||||||
|  |  | ||||||
class KitIpdCrawlerSection(HttpCrawlerSection):
    """Accessors for the config options of the KIT IPD crawler."""

    def target(self) -> str:
        """
        Return the configured ``target`` URL.

        Reports a missing value if ``target`` is unset or empty, and an invalid
        value if it does not start with ``https://``.
        """
        target = self.s.get("target")
        if not target:
            self.missing_value("target")

        if not target.startswith("https://"):
            self.invalid_value("target", target, "Should be a URL")

        return target

    def link_regex(self) -> Pattern[str]:
        """
        Return the compiled ``link_regex`` option.

        Defaults to a pattern matching links that end in a pdf, zip, c, cpp or
        java file name. A pattern that does not compile is reported as an
        invalid config value instead of surfacing as a bare ``re.error``.
        """
        regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$")
        try:
            return re.compile(regex)
        except re.error as error:
            # invalid_value() is used above as if it never returns; the
            # re-raise only guards against it returning normally.
            self.invalid_value("link_regex", regex, f"Invalid regex: {error}")
            raise
|  |  | ||||||
|  |  | ||||||
@dataclass
class KitIpdFile:
    """A file entry consisting of a name and the URL (href) it was found under."""

    name: str
    url: str

    def explain(self) -> None:
        """Write this file's name and href to the explain log."""
        description = f"File {self.name!r} (href={self.url!r})"
        log.explain(description)
|  |  | ||||||
|  |  | ||||||
@dataclass
class KitIpdFolder:
    """A named folder holding files and nested folders."""

    name: str
    entries: List[Union[KitIpdFile, "KitIpdFolder"]]

    def explain(self) -> None:
        """Log this folder as a topic, then let every entry explain itself."""
        topic = f"Folder {self.name!r}"
        log.explain_topic(topic)
        for child in self.entries:
            child.explain()
|  |  | ||||||
|  |  | ||||||
|  | class KitIpdCrawler(HttpCrawler): | ||||||
|  |  | ||||||
|  |     def __init__( | ||||||
|  |             self, | ||||||
|  |             name: str, | ||||||
|  |             section: KitIpdCrawlerSection, | ||||||
|  |             config: Config, | ||||||
|  |     ): | ||||||
|  |         super().__init__(name, section, config) | ||||||
|  |         self._url = section.target() | ||||||
|  |         self._file_regex = section.link_regex() | ||||||
|  |  | ||||||
|  |     async def _run(self) -> None: | ||||||
|  |         maybe_cl = await self.crawl(PurePath(".")) | ||||||
|  |         if not maybe_cl: | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         tasks: List[Awaitable[None]] = [] | ||||||
|  |  | ||||||
|  |         async with maybe_cl: | ||||||
|  |             for item in await self._fetch_items(): | ||||||
|  |                 item.explain() | ||||||
|  |                 if isinstance(item, KitIpdFolder): | ||||||
|  |                     tasks.append(self._crawl_folder(PurePath("."), item)) | ||||||
|  |                 else: | ||||||
|  |                     log.explain_topic(f"Orphan file {item.name!r} (href={item.url!r})") | ||||||
|  |                     log.explain("Attributing it to root folder") | ||||||
|  |                     # do this here to at least be sequential and not parallel (rate limiting is hard, as the | ||||||
|  |                     # crawl abstraction does not hold for these requests) | ||||||
|  |                     etag, mtime = await self._request_resource_version(item.url) | ||||||
|  |                     tasks.append(self._download_file(PurePath("."), item, etag, mtime)) | ||||||
|  |  | ||||||
|  |         await self.gather(tasks) | ||||||
|  |  | ||||||
|  |     async def _crawl_folder(self, parent: PurePath, folder: KitIpdFolder) -> None: | ||||||
|  |         path = parent / folder.name | ||||||
|  |         if not await self.crawl(path): | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         tasks = [] | ||||||
|  |         for entry in folder.entries: | ||||||
|  |             if isinstance(entry, KitIpdFolder): | ||||||
|  |                 tasks.append(self._crawl_folder(path, entry)) | ||||||
|  |             else: | ||||||
|  |                 # do this here to at least be sequential and not parallel (rate limiting is hard, as the crawl | ||||||
|  |                 # abstraction does not hold for these requests) | ||||||
|  |                 etag, mtime = await self._request_resource_version(entry.url) | ||||||
|  |                 tasks.append(self._download_file(path, entry, etag, mtime)) | ||||||
|  |  | ||||||
|  |         await self.gather(tasks) | ||||||
|  |  | ||||||
|  |     async def _download_file( | ||||||
|  |         self, | ||||||
|  |         parent: PurePath, | ||||||
|  |         file: KitIpdFile, | ||||||
|  |         etag: Optional[str], | ||||||
|  |         mtime: Optional[datetime] | ||||||
|  |     ) -> None: | ||||||
|  |         element_path = parent / file.name | ||||||
|  |  | ||||||
|  |         prev_etag = self._get_previous_etag_from_report(element_path) | ||||||
|  |         etag_differs = None if prev_etag is None else prev_etag != etag | ||||||
|  |  | ||||||
|  |         maybe_dl = await self.download(element_path, etag_differs=etag_differs, mtime=mtime) | ||||||
|  |         if not maybe_dl: | ||||||
|  |             # keep storing the known file's etag | ||||||
|  |             if prev_etag: | ||||||
|  |                 self._add_etag_to_report(element_path, prev_etag) | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         async with maybe_dl as (bar, sink): | ||||||
|  |             await self._stream_from_url(file.url, element_path, sink, bar) | ||||||
|  |  | ||||||
|  |     async def _fetch_items(self) -> Iterable[Union[KitIpdFile, KitIpdFolder]]: | ||||||
|  |         page, url = await self.get_page() | ||||||
|  |         elements: List[Tag] = self._find_file_links(page) | ||||||
|  |  | ||||||
|  |         # do not add unnecessary nesting for a single <h1> heading | ||||||
|  |         drop_h1: bool = len(page.find_all(name="h1")) <= 1 | ||||||
|  |  | ||||||
|  |         folder_tree: KitIpdFolder = KitIpdFolder(".", []) | ||||||
|  |         for element in elements: | ||||||
|  |             parent = HttpCrawler.get_folder_structure_from_heading_hierarchy(element, drop_h1) | ||||||
|  |             file = self._extract_file(element, url) | ||||||
|  |  | ||||||
|  |             current_folder: KitIpdFolder = folder_tree | ||||||
|  |             for folder_name in parent.parts: | ||||||
|  |                 # helps the type checker to verify that current_folder is indeed a folder | ||||||
|  |                 def subfolders() -> Generator[KitIpdFolder, Any, None]: | ||||||
|  |                     return (entry for entry in current_folder.entries if isinstance(entry, KitIpdFolder)) | ||||||
|  |  | ||||||
|  |                 if not any(entry.name == folder_name for entry in subfolders()): | ||||||
|  |                     current_folder.entries.append(KitIpdFolder(folder_name, [])) | ||||||
|  |                 current_folder = next(entry for entry in subfolders() if entry.name == folder_name) | ||||||
|  |  | ||||||
|  |             current_folder.entries.append(file) | ||||||
|  |  | ||||||
|  |         return folder_tree.entries | ||||||
|  |  | ||||||
|  |     def _extract_file(self, link: Tag, url: str) -> KitIpdFile: | ||||||
|  |         url = self._abs_url_from_link(url, link) | ||||||
|  |         name = os.path.basename(url) | ||||||
|  |         return KitIpdFile(name, url) | ||||||
|  |  | ||||||
|  |     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> list[Tag]: | ||||||
|  |         return cast(list[Tag], tag.find_all(name="a", attrs={"href": self._file_regex})) | ||||||
|  |  | ||||||
|  |     def _abs_url_from_link(self, url: str, link_tag: Tag) -> str: | ||||||
|  |         return urljoin(url, cast(str, link_tag.get("href"))) | ||||||
|  |  | ||||||
|  |     async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None: | ||||||
|  |         async with self.session.get(url, allow_redirects=False) as resp: | ||||||
|  |             if resp.status == 403: | ||||||
|  |                 raise CrawlError("Received a 403. Are you within the KIT network/VPN?") | ||||||
|  |             if resp.content_length: | ||||||
|  |                 bar.set_total(resp.content_length) | ||||||
|  |  | ||||||
|  |             async for data in resp.content.iter_chunked(1024): | ||||||
|  |                 sink.file.write(data) | ||||||
|  |                 bar.advance(len(data)) | ||||||
|  |  | ||||||
|  |             sink.done() | ||||||
|  |  | ||||||
|  |             self._add_etag_to_report(path, resp.headers.get("ETag")) | ||||||
|  |  | ||||||
|  |     async def get_page(self) -> Tuple[BeautifulSoup, str]: | ||||||
|  |         async with self.session.get(self._url) as request: | ||||||
|  |             # The web page for Algorithmen für Routenplanung contains some | ||||||
|  |             # weird comments that beautifulsoup doesn't parse correctly. This | ||||||
|  |             # hack enables those pages to be crawled, and should hopefully not | ||||||
|  |             # cause issues on other pages. | ||||||
|  |             content = (await request.read()).decode("utf-8") | ||||||
|  |             content = re.sub(r"<!--.*?-->", "", content) | ||||||
|  |             return soupify(content.encode("utf-8")), str(request.url) | ||||||
							
								
								
									
										117
									
								
								PFERD/crawl/local_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										117
									
								
								PFERD/crawl/local_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,117 @@ | |||||||
|  | import asyncio | ||||||
|  | import datetime | ||||||
|  | import random | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import Optional | ||||||
|  |  | ||||||
|  | from ..config import Config | ||||||
|  | from .crawler import Crawler, CrawlerSection, anoncritical | ||||||
|  |  | ||||||
|  |  | ||||||
class LocalCrawlerSection(CrawlerSection):
    """Accessors for the options of a local crawler's config section."""

    def target(self) -> Path:
        """Return the ``target`` option as a user-expanded path; reports a missing value if unset."""
        raw = self.s.get("target")
        if raw is None:
            # NOTE(review): missing_value presumably raises - confirm in CrawlerSection
            self.missing_value("target")
        return Path(raw).expanduser()

    def crawl_delay(self) -> float:
        """Return the ``crawl_delay`` option (default ``0.0``); negative values are reported as invalid."""
        delay = self.s.getfloat("crawl_delay", fallback=0.0)
        if delay < 0:
            self.invalid_value("crawl_delay", delay, "Must not be negative")
        return delay

    def download_delay(self) -> float:
        """Return the ``download_delay`` option (default ``0.0``); negative values are reported as invalid."""
        delay = self.s.getfloat("download_delay", fallback=0.0)
        if delay < 0:
            self.invalid_value("download_delay", delay, "Must not be negative")
        return delay

    def download_speed(self) -> Optional[int]:
        """Return the ``download_speed`` option, or ``None`` if unset; values <= 0 are reported as invalid."""
        speed = self.s.getint("download_speed")
        if speed is None or speed > 0:
            return speed

        self.invalid_value("download_speed", speed, "Must be greater than 0")
        return speed
|  |  | ||||||
|  |  | ||||||
class LocalCrawler(Crawler):
    """
    Crawler that copies files from a local directory tree.

    Crawling and downloading can be slowed down artificially through the
    ``crawl_delay``, ``download_delay`` and ``download_speed`` options of the
    section.
    """

    def __init__(
            self,
            name: str,
            section: LocalCrawlerSection,
            config: Config,
    ):
        """Read the target path and the delay/speed options from ``section``."""
        super().__init__(name, section, config)

        self._target = config.default_section.working_dir() / section.target()
        self._crawl_delay = section.crawl_delay()
        self._download_delay = section.download_delay()
        self._download_speed = section.download_speed()

        if self._download_speed:
            # Aim for roughly ten blocks per second, but never let the block
            # size drop to 0: f.read(0) returns b"", which _crawl_file would
            # mistake for end-of-file and leave the copied file empty.
            self._block_size = max(1, self._download_speed // 10)
        else:
            self._block_size = 1024**2  # 1 MiB

    async def _run(self) -> None:
        """Crawl the configured target, starting at an empty relative path."""
        await self._crawl_path(self._target, PurePath())

    @anoncritical
    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
        """Dispatch ``path`` to the directory or file handler; other kinds of paths are ignored."""
        if path.is_dir():
            await self._crawl_dir(path, pure)
        elif path.is_file():
            await self._crawl_file(path, pure)

    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
        """
        Crawl the directory ``path``, known to the crawler as ``pure``.

        Does nothing if ``self.crawl`` declines the directory. Otherwise one
        ``_crawl_path`` coroutine per child is collected and all of them are
        awaited through ``self.gather``.
        """
        cl = await self.crawl(pure)
        if not cl:
            return

        tasks = []

        async with cl:
            # Artificial delay between half of and the full crawl delay
            await asyncio.sleep(random.uniform(
                0.5 * self._crawl_delay,
                self._crawl_delay,
            ))

            for child in path.iterdir():
                pure_child = cl.path / child.name
                tasks.append(self._crawl_path(child, pure_child))

        await self.gather(tasks)

    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        """
        Copy the file ``path`` into the sink that ``self.download`` provides for ``pure``.

        The file is copied in blocks of ``self._block_size`` bytes. If a
        download speed is configured, the coroutine sleeps after every block so
        that the average throughput approximates that speed.
        """
        stat = path.stat()
        mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
        dl = await self.download(pure, mtime=mtime)
        if not dl:
            return

        async with dl as (bar, sink):
            # Artificial delay between half of and the full download delay
            await asyncio.sleep(random.uniform(
                0.5 * self._download_delay,
                self._download_delay,
            ))

            bar.set_total(stat.st_size)

            with open(path, "rb") as f:
                while True:
                    data = f.read(self._block_size)
                    if not data:
                        break

                    sink.file.write(data)
                    bar.advance(len(data))

                    if self._download_speed:
                        # Time one block should take at the configured speed,
                        # jittered by +-20 %
                        delay = self._block_size / self._download_speed
                        delay = random.uniform(0.8 * delay, 1.2 * delay)
                        await asyncio.sleep(delay)

                sink.done()
							
								
								
									
										85
									
								
								PFERD/deduplicator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										85
									
								
								PFERD/deduplicator.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,85 @@ | |||||||
|  | from pathlib import PurePath | ||||||
|  | from typing import Iterator, Set | ||||||
|  |  | ||||||
|  | from .logging import log | ||||||
|  | from .utils import fmt_path | ||||||
|  |  | ||||||
|  |  | ||||||
def name_variants(path: PurePath) -> Iterator[PurePath]:
    """
    Endlessly yield numbered alternatives for ``path``.

    The counter starts at 1 and is inserted between stem and suffix. It is
    separated by a space if the stem already contains a space and by an
    underscore otherwise.
    """
    directory, stem, suffix = path.parent, path.stem, path.suffix

    if " " in stem:
        separator = " "
    else:
        separator = "_"

    counter = 0
    while True:
        counter += 1
        yield directory / f"{stem}{separator}{counter}{suffix}"
|  |  | ||||||
|  |  | ||||||
class Deduplicator:
    """
    Hands out unique output paths.

    Every path passed to ``mark`` is remembered together with its parent
    directories. A path that is already taken is replaced by the first unused
    numbered variant. If ``windows_paths`` is set, paths are additionally
    rewritten so they avoid the naming restrictions of windows.
    """

    FORBIDDEN_CHARS = '<>:"/\\|?*' + "".join([chr(i) for i in range(0, 32)])
    FORBIDDEN_NAMES = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
    }

    # Maps every forbidden character to "_" so a name is cleaned in one pass
    _FORBIDDEN_TABLE = str.maketrans(FORBIDDEN_CHARS, "_" * len(FORBIDDEN_CHARS))

    def __init__(self, windows_paths: bool) -> None:
        """Create a deduplicator; ``windows_paths`` enables the windows fixups."""
        self._windows_paths = windows_paths

        self._known: Set[PurePath] = set()

    def _add(self, path: PurePath) -> None:
        """Remember ``path`` and all of its parents as taken."""
        self._known.add(path)

        # The last parent is just "."
        for parent in list(path.parents)[:-1]:
            self._known.add(parent)

    def _fixup_element(self, name: str) -> str:
        """
        Return ``name`` (a single path element) altered to be valid on windows.

        Forbidden characters become underscores, reserved device names get an
        underscore appended to their base name and a trailing space or dot is
        followed by an underscore.
        """
        # For historical reasons, windows paths have some odd restrictions that
        # we're trying to avoid. See:
        # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file

        name = name.translate(self._FORBIDDEN_TABLE)

        # Reserved names are recognized regardless of case and stay reserved
        # when followed by any extension ("NUL.txt" and "NUL.tar.gz" are both
        # equivalent to "NUL"), so only the part before the first dot counts.
        base, dot, rest = name.partition(".")
        if base.upper() in self.FORBIDDEN_NAMES:
            name = f"{base}_{dot}{rest}"

        if name.endswith((" ", ".")):
            name += "_"

        return name

    def _fixup_for_windows(self, path: PurePath) -> PurePath:
        """Apply ``_fixup_element`` to every part of ``path`` and explain any change."""
        new_path = PurePath(*[self._fixup_element(elem) for elem in path.parts])
        if new_path != path:
            log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility")
        return new_path

    def fixup_path(self, path: PurePath) -> PurePath:
        """Fixes up the path for windows, if enabled. Returns the path unchanged otherwise."""
        if self._windows_paths:
            return self._fixup_for_windows(path)
        return path

    def mark(self, path: PurePath) -> PurePath:
        """
        Reserve ``path`` and return the path that was actually reserved.

        This is ``path`` itself (after the optional windows fixup) if it is
        still free, and the first free numbered variant otherwise.
        """
        if self._windows_paths:
            path = self._fixup_for_windows(path)

        if path not in self._known:
            self._add(path)
            return path

        log.explain(f"Path {fmt_path(path)} is already taken, finding a new name")

        for variant in name_variants(path):
            if variant in self._known:
                log.explain(f"Path {fmt_path(variant)} is taken as well")
                continue

            log.explain(f"Found unused path {fmt_path(variant)}")
            self._add(variant)
            return variant

        # The "name_variants" iterator returns infinitely many paths
        raise RuntimeError("Unreachable")
							
								
								
									
										169
									
								
								PFERD/diva.py
									
									
									
									
									
								
							
							
						
						
									
										169
									
								
								PFERD/diva.py
									
									
									
									
									
								
							| @@ -1,169 +0,0 @@ | |||||||
| """ |  | ||||||
| Utility functions and a scraper/downloader for the KIT DIVA portal. |  | ||||||
| """ |  | ||||||
| import logging |  | ||||||
| import re |  | ||||||
| from dataclasses import dataclass |  | ||||||
| from pathlib import Path |  | ||||||
| from typing import Any, Callable, List, Optional |  | ||||||
|  |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from .errors import FatalException |  | ||||||
| from .logging import PrettyLogger |  | ||||||
| from .organizer import Organizer |  | ||||||
| from .tmp_dir import TmpDir |  | ||||||
| from .transform import Transformable |  | ||||||
| from .utils import stream_to_path |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
@dataclass
class DivaDownloadInfo(Transformable):
    """
    Describes a single DIVA video that can be downloaded.
    """

    # Address the video is fetched from
    url: str
|  |  | ||||||
|  |  | ||||||
| DivaDownloadStrategy = Callable[[Organizer, DivaDownloadInfo], bool] |  | ||||||
|  |  | ||||||
|  |  | ||||||
def diva_download_new(organizer: Organizer, info: DivaDownloadInfo) -> bool:
    """
    Download strategy that accepts a file only if it does not exist locally yet.

    Files that already exist are reported as ignored.
    """
    if organizer.resolve(info.path).exists():
        PRETTY.ignored_file(info.path, "local file exists")
        return False
    return True
|  |  | ||||||
|  |  | ||||||
class DivaPlaylistCrawler:
    # pylint: disable=too-few-public-methods
    """
    A crawler for DIVA playlists.
    """

    _PLAYLIST_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/detail/"
    _COLLECTION_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json"

    def __init__(self, playlist_id: str):
        """Create a crawler for the playlist (collection) with the given id."""
        self._id = playlist_id

    @classmethod
    def fetch_id(cls, playlist_link: str) -> str:
        """
        Fetches the ID for a playlist, given the base link
        (e.g. https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271).

        Raises a FatalException, if the id can not be resolved
        """
        match = re.match(r".+#/details/(.+)", playlist_link)
        if match is None:
            raise FatalException(
                "DIVA: Invalid playlist link format, could not extract details."
            )
        base_name = match.group(1)

        response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json")

        if response.status_code != 200:
            raise FatalException(
                f"DIVA: Got non-200 status code ({response.status_code}) "
                f"when requesting {response.url!r}!"
            )

        body = response.json()

        if body["error"]:
            raise FatalException(f"DIVA: Server returned error {body['error']!r}.")

        return body["result"]["collection"]["id"]

    def crawl(self) -> List[DivaDownloadInfo]:
        """
        Crawls the playlist given in the constructor.

        Returns one download info per video that has a title, a collection
        title and an mp4 url; incomplete videos are skipped with a warning.
        Raises a FatalException if the server answers with a non-200 status or
        reports an error.
        """
        response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id})
        if response.status_code != 200:
            raise FatalException(f"Server returned status {response.status_code}.")

        body = response.json()

        if body["error"]:
            raise FatalException(f"Server returned error {body['error']!r}.")

        result = body["result"]

        # Only a single page of results is requested
        if result["resultCount"] > result["pageSize"]:
            PRETTY.warning("Did not receive all results, some will be missing")

        download_infos: List[DivaDownloadInfo] = []

        for video in result["resultList"]:
            title = video["title"]
            collection_title = self._follow_path(["collection", "title"], video)
            url = self._follow_path(
                ["resourceList", "derivateList", "mp4", "url"],
                video
            )

            if url and collection_title and title:
                path = Path(collection_title, title + ".mp4")
                download_infos.append(DivaDownloadInfo(path, url))
            else:
                PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}")

        return download_infos

    @staticmethod
    def _follow_path(path: List[str], obj: Any) -> Optional[Any]:
        """
        Follows a property path through an object, bailing at the first None.

        Returns None as well if a step of the path is missing.
        """
        current = obj
        for path_step in path:
            # A None in the middle of the path (e.g. a JSON null) must end the
            # walk instead of raising a TypeError on the membership test
            if current is None or path_step not in current:
                return None
            current = current[path_step]
        return current
|  |  | ||||||
|  |  | ||||||
class DivaDownloader:
    """
    Downloads DIVA videos into an organizer, filtered by a download strategy.
    """

    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy):
        """Store the collaborators and open a fresh requests session."""
        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._strategy = strategy
        self._session = requests.session()

    def download_all(self, infos: List[DivaDownloadInfo]) -> None:
        """
        Download the given files sequentially, in list order.
        """
        for download_info in infos:
            self.download(download_info)

    def download(self, info: DivaDownloadInfo) -> None:
        """
        Download one file, unless the strategy rejects it.

        A rejected file is only marked in the organizer. A response with a
        status other than 200 results in a warning.
        """
        wanted = self._strategy(self._organizer, info)
        if not wanted:
            self._organizer.mark(info.path)
            return

        with self._session.get(info.url, stream=True) as response:
            if response.status_code != 200:
                PRETTY.warning(f"Could not download file, got response {response.status_code}")
            else:
                # Stream into a temporary file first, then hand it to the organizer
                tmp_file = self._tmp_dir.new_path()
                stream_to_path(response, tmp_file, info.path.name)
                self._organizer.accept_file(tmp_file, info.path)
| @@ -1,75 +0,0 @@ | |||||||
| """ |  | ||||||
| Provides a summary that keeps track of new modified or deleted files. |  | ||||||
| """ |  | ||||||
| from pathlib import Path |  | ||||||
| from typing import List |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _mergeNoDuplicate(first: List[Path], second: List[Path]) -> List[Path]: |  | ||||||
|     tmp = list(set(first + second)) |  | ||||||
|     tmp.sort(key=lambda x: str(x.resolve())) |  | ||||||
|     return tmp |  | ||||||
|  |  | ||||||
|  |  | ||||||
class DownloadSummary:
    """
    Collects the files that were added, changed or removed during a run.
    """

    def __init__(self) -> None:
        self._new_files: List[Path] = []
        self._modified_files: List[Path] = []
        self._deleted_files: List[Path] = []

    @property
    def new_files(self) -> List[Path]:
        """
        A copy of the list of new files.
        """
        return list(self._new_files)

    @property
    def modified_files(self) -> List[Path]:
        """
        A copy of the list of modified files.
        """
        return list(self._modified_files)

    @property
    def deleted_files(self) -> List[Path]:
        """
        A copy of the list of deleted files.
        """
        return list(self._deleted_files)

    def merge(self, summary: 'DownloadSummary') -> None:
        """
        Folds the passed summary into this one. The passed summary is left untouched.
        """
        theirs_new = summary.new_files
        self._new_files = _mergeNoDuplicate(self._new_files, theirs_new)

        theirs_modified = summary.modified_files
        self._modified_files = _mergeNoDuplicate(self._modified_files, theirs_modified)

        theirs_deleted = summary.deleted_files
        self._deleted_files = _mergeNoDuplicate(self._deleted_files, theirs_deleted)

    def add_deleted_file(self, path: Path) -> None:
        """
        Records ``path`` as deleted.
        """
        self._deleted_files.append(path)

    def add_modified_file(self, path: Path) -> None:
        """
        Records ``path`` as changed.
        """
        self._modified_files.append(path)

    def add_new_file(self, path: Path) -> None:
        """
        Records ``path`` as new.
        """
        self._new_files.append(path)

    def has_updates(self) -> bool:
        """
        Tells whether at least one new, modified or deleted file was recorded.
        """
        return any((self._new_files, self._modified_files, self._deleted_files))
| @@ -1,72 +0,0 @@ | |||||||
| """ |  | ||||||
| General downloaders useful in many situations |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| from dataclasses import dataclass, field |  | ||||||
| from typing import Any, Dict, List, Optional |  | ||||||
|  |  | ||||||
| import requests |  | ||||||
| import requests.auth |  | ||||||
|  |  | ||||||
| from .organizer import Organizer |  | ||||||
| from .tmp_dir import TmpDir |  | ||||||
| from .transform import Transformable |  | ||||||
| from .utils import stream_to_path |  | ||||||
|  |  | ||||||
|  |  | ||||||
@dataclass
class HttpDownloadInfo(Transformable):
    """
    Describes one file that should be downloaded over HTTP.
    """

    # Address the file is requested from
    url: str
    # Passed as ``params`` to the GET request; empty by default
    parameters: Dict[str, Any] = field(default_factory=dict)
|  |  | ||||||
|  |  | ||||||
class HttpDownloader:
    """Downloads files over HTTP, optionally using HTTP basic auth."""

    def __init__(
            self,
            tmp_dir: TmpDir,
            organizer: Organizer,
            username: Optional[str],
            password: Optional[str],
    ):
        """Store the collaborators and credentials, then build the session."""
        self._organizer = organizer
        self._tmp_dir = tmp_dir
        self._username = username
        self._password = password
        self._session = self._build_session()

    def _build_session(self) -> requests.Session:
        """Create the session; basic auth is set only if both username and password are given."""
        session = requests.Session()
        have_credentials = self._username and self._password
        if have_credentials:
            session.auth = requests.auth.HTTPBasicAuth(
                self._username, self._password
            )
        return session

    def download_all(self, infos: List[HttpDownloadInfo]) -> None:
        """
        Download the given files sequentially, in list order.
        """

        for download_info in infos:
            self.download(download_info)

    def download(self, info: HttpDownloadInfo) -> None:
        """
        Download one file and hand it to the organizer.

        Raises an Exception if the response status is not 200.
        """

        with self._session.get(info.url, params=info.parameters, stream=True) as response:
            if response.status_code != 200:
                # TODO use proper exception
                raise Exception(f"Could not download file, got response {response.status_code}")

            # Stream into a temporary file first, then hand it to the organizer
            tmp_file = self._tmp_dir.new_path()
            stream_to_path(response, tmp_file, info.path.name)
            self._organizer.accept_file(tmp_file, info.path)
| @@ -1,39 +0,0 @@ | |||||||
| """ |  | ||||||
| An error logging decorator. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import logging |  | ||||||
| from typing import Any, Callable, TypeVar, cast |  | ||||||
|  |  | ||||||
| from rich.console import Console |  | ||||||
|  |  | ||||||
| from .logging import PrettyLogger |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
class FatalException(Exception):
    """
    Signals a fatal error from which no recovery is possible.
    """
|  |  | ||||||
|  |  | ||||||
TFun = TypeVar('TFun', bound=Callable[..., Any])


def swallow_and_print_errors(function: TFun) -> TFun:
    """
    Decorates a function, swallows all errors, logs them and returns none if one occurred.

    A FatalException is logged with its message only; any other exception is
    printed with its traceback. The wrapper keeps the name and docstring of
    the decorated function.
    """
    # Imported locally so this block does not rely on a new file-level import
    import functools

    @functools.wraps(function)
    def inner(*args: Any, **kwargs: Any) -> Any:
        # pylint: disable=broad-except
        try:
            return function(*args, **kwargs)
        except FatalException as error:
            PRETTY.error(str(error))
            return None
        except Exception:
            # print_exception() renders the exception currently being handled
            Console().print_exception()
            return None
    return cast(TFun, inner)
| @@ -1,10 +0,0 @@ | |||||||
| """ |  | ||||||
| Synchronizing files from ILIAS instances (https://www.ilias.de/). |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator |  | ||||||
| from .crawler import (IliasCrawler, IliasCrawlerEntry, IliasDirectoryFilter, |  | ||||||
|                       IliasElementType) |  | ||||||
| from .downloader import (IliasDownloader, IliasDownloadInfo, |  | ||||||
|                          IliasDownloadStrategy, download_everything, |  | ||||||
|                          download_modified_or_new) |  | ||||||
| @@ -1,131 +0,0 @@ | |||||||
| """ |  | ||||||
| Authenticators that can obtain proper ILIAS session cookies. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import abc |  | ||||||
| import logging |  | ||||||
| from typing import Optional |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from ..authenticators import TfaAuthenticator, UserPassAuthenticator |  | ||||||
| from ..utils import soupify |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
|  |  | ||||||
class IliasAuthenticator(abc.ABC):
    # pylint: disable=too-few-public-methods

    """
    Base class for authenticators that log an already existing requests
    session into an ILIAS account.
    """

    @abc.abstractmethod
    def authenticate(self, sess: requests.Session) -> None:
        """
        Log the given requests session into the ILIAS account of this authenticator.
        """
|  |  | ||||||
|  |  | ||||||
class KitShibbolethAuthenticator(IliasAuthenticator):
    """
    Logs a session into ILIAS by going through KIT's Shibboleth identity
    provider.
    """
    # pylint: disable=too-few-public-methods

    def __init__(self, username: Optional[str] = None, password: Optional[str] = None) -> None:
        self._auth = UserPassAuthenticator("KIT ILIAS Shibboleth", username, password)
        self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth")

    def authenticate(self, sess: requests.Session) -> None:
        """
        Run the ILIAS Shibboleth login flow on the given session, which then
        holds the login cookies it receives.

        Call this only once you have noticed that the session is not logged
        in. The cookies obtained should be good for a few minutes, maybe even
        an hour or two.
        """

        # Same as pressing the "Mit KIT-Account anmelden" button on
        # https://ilias.studium.kit.edu/login.php
        LOGGER.debug("Begin authentication process with ILIAS")
        login_response = sess.post(
            "https://ilias.studium.kit.edu/Shibboleth.sso/Login",
            data={
                "sendLogin": "1",
                "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
                "target": "/shib_login.php",
                "home_organization_selection": "Mit KIT-Account anmelden",
            }
        )
        page = soupify(login_response)

        # Keep submitting credentials for as long as the page does not carry
        # the SAML fields that mark a successful login
        while not self._login_successful(page):
            # Look the form up first, so that a missing form makes this fail
            # before the credentials are requested and not afterwards.
            login_form = page.find("form", {"class": "full content", "method": "post"})
            form_action = login_form["action"]

            # Same as entering the credentials on
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
            LOGGER.debug("Attempt to log in to Shibboleth using credentials")
            credentials = {
                "_eventId_proceed": "",
                "j_username": self._auth.username,
                "j_password": self._auth.password,
            }
            page = soupify(sess.post("https://idp.scc.kit.edu" + form_action, data=credentials))

            if self._tfa_required(page):
                page = self._authenticate_tfa(sess, page)

            if not self._login_successful(page):
                print("Incorrect credentials.")
                self._auth.invalidate_credentials()

        # Same as being redirected by JS automatically
        # (or pressing "Continue" with JS disabled)
        LOGGER.debug("Redirect back to ILIAS with login information")
        relay_state_input = page.find("input", {"name": "RelayState"})
        saml_response_input = page.find("input", {"name": "SAMLResponse"})
        # Both values stem from the last page fetched in the loop above
        sess.post(
            "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST",
            data={
                "RelayState": relay_state_input["value"],
                "SAMLResponse": saml_response_input["value"],
            }
        )

    def _authenticate_tfa(
            self,
            session: requests.Session,
            soup: bs4.BeautifulSoup
    ) -> bs4.BeautifulSoup:
        """
        Submit a TFA token to the form found in soup and return the parsed
        response page.
        """
        # Look the form up first, so that a missing form makes this fail
        # before the token is requested and not afterwards.
        token_form = soup.find("form", {"method": "post"})
        form_action = token_form["action"]

        # Same as entering the token on
        # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
        LOGGER.debug("Attempt to log in to Shibboleth with TFA token")
        payload = {
            "_eventId_proceed": "",
            "j_tokenNumber": self._tfa_auth.get_token()
        }
        response = session.post("https://idp.scc.kit.edu" + form_action, data=payload)
        return soupify(response)

    @staticmethod
    def _login_successful(soup: bs4.BeautifulSoup) -> bool:
        """
        A page counts as a successful login when it contains both the
        RelayState and the SAMLResponse input.
        """
        relay_state_input = soup.find("input", {"name": "RelayState"})
        saml_response_input = soup.find("input", {"name": "SAMLResponse"})
        return not (relay_state_input is None or saml_response_input is None)

    @staticmethod
    def _tfa_required(soup: bs4.BeautifulSoup) -> bool:
        """
        A token is asked for when the page has an element with the id
        "j_tokenNumber".
        """
        token_field = soup.find(id="j_tokenNumber")
        return token_field is not None
| @@ -1,656 +0,0 @@ | |||||||
| """ |  | ||||||
| Contains an ILIAS crawler alongside helper functions. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import datetime |  | ||||||
| import json |  | ||||||
| import logging |  | ||||||
| import re |  | ||||||
| from enum import Enum |  | ||||||
| from pathlib import Path |  | ||||||
| from typing import Any, Callable, Dict, List, Optional, Union |  | ||||||
| from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit, |  | ||||||
|                           urlunsplit) |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from ..errors import FatalException |  | ||||||
| from ..logging import PrettyLogger |  | ||||||
| from ..utils import soupify |  | ||||||
| from .authenticators import IliasAuthenticator |  | ||||||
| from .date_demangler import demangle_date |  | ||||||
| from .downloader import IliasDownloadInfo |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _sanitize_path_name(name: str) -> str: |  | ||||||
|     return name.replace("/", "-") |  | ||||||
|  |  | ||||||
|  |  | ||||||
class IliasElementType(Enum):
    """
    The kinds of ILIAS elements the crawler tells apart.
    """
    REGULAR_FOLDER = "REGULAR_FOLDER"
    VIDEO_FOLDER = "VIDEO_FOLDER"
    EXERCISE_FOLDER = "EXERCISE_FOLDER"
    REGULAR_FILE = "REGULAR_FILE"
    VIDEO_FILE = "VIDEO_FILE"
    FORUM = "FORUM"
    EXTERNAL_LINK = "EXTERNAL_LINK"

    def is_folder(self) -> bool:
        """
        Tell whether this type is one of the folder kinds, i.e. whether its
        member name contains "FOLDER".
        """
        member_name = str(self.name)
        return member_name.find("FOLDER") >= 0
|  |  | ||||||
|  |  | ||||||
| IliasDirectoryFilter = Callable[[Path, IliasElementType], bool] |  | ||||||
|  |  | ||||||
|  |  | ||||||
class IliasCrawlerEntry:
    """
    A single element found while crawling ILIAS. The crawler uses these
    internally to catalogue elements and to decide what to crawl next.
    """
    # pylint: disable=too-few-public-methods

    def __init__(
            self,
            path: Path,
            url: Union[str, Callable[[], Optional[str]]],
            entry_type: IliasElementType,
            modification_date: Optional[datetime.datetime]
    ):
        self.path = path
        self.entry_type = entry_type
        self.modification_date = modification_date

        # The url is always stored as a callable; a plain string is wrapped
        # in a function returning it.
        if not isinstance(url, str):
            self.url: Callable[[], Optional[str]] = url
        else:
            fixed_url = url
            self.url = lambda: fixed_url

    def to_download_info(self) -> Optional[IliasDownloadInfo]:
        """
        Build an IliasDownloadInfo from this entry.

        Only regular files and video files can be converted; for every other
        entry type the result is None.
        """
        file_types = (IliasElementType.REGULAR_FILE, IliasElementType.VIDEO_FILE)
        if self.entry_type not in file_types:
            return None
        return IliasDownloadInfo(self.path, self.url, self.modification_date)
|  |  | ||||||
|  |  | ||||||
| class IliasCrawler: |  | ||||||
|     # pylint: disable=too-few-public-methods |  | ||||||
|  |  | ||||||
|     """ |  | ||||||
|     A crawler for ILIAS. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     # pylint: disable=too-many-arguments |  | ||||||
|     def __init__( |  | ||||||
|             self, |  | ||||||
|             base_url: str, |  | ||||||
|             session: requests.Session, |  | ||||||
|             authenticator: IliasAuthenticator, |  | ||||||
|             dir_filter: IliasDirectoryFilter |  | ||||||
|     ): |  | ||||||
|         """ |  | ||||||
|         Create a new ILIAS crawler. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self._base_url = base_url |  | ||||||
|         self._session = session |  | ||||||
|         self._authenticator = authenticator |  | ||||||
|         self.dir_filter = dir_filter |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _url_set_query_param(url: str, param: str, value: str) -> str: |  | ||||||
|         """ |  | ||||||
|         Set a query parameter in an url, overwriting existing ones with the same name. |  | ||||||
|         """ |  | ||||||
|         scheme, netloc, path, query, fragment = urlsplit(url) |  | ||||||
|         query_parameters = parse_qs(query) |  | ||||||
|         query_parameters[param] = [value] |  | ||||||
|         new_query_string = urlencode(query_parameters, doseq=True) |  | ||||||
|  |  | ||||||
|         return urlunsplit((scheme, netloc, path, new_query_string, fragment)) |  | ||||||
|  |  | ||||||
|     def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]: |  | ||||||
|         """ |  | ||||||
|         Crawls a given url *and all reachable elements in it*. |  | ||||||
|  |  | ||||||
|         Args: |  | ||||||
|             url {str} -- the *full* url to crawl |  | ||||||
|         """ |  | ||||||
|         start_entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), url) |  | ||||||
|         return self._iterate_entries_to_download_infos(start_entries) |  | ||||||
|  |  | ||||||
|     def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]: |  | ||||||
|         """ |  | ||||||
|         Starts the crawl process for a course, yielding a list of elements to (potentially) |  | ||||||
|         download. |  | ||||||
|  |  | ||||||
|         Arguments: |  | ||||||
|             course_id {str} -- the course id |  | ||||||
|  |  | ||||||
|         Raises: |  | ||||||
|             FatalException: if an unrecoverable error occurs or the course id is not valid |  | ||||||
|         """ |  | ||||||
|         # Start crawling at the given course |  | ||||||
|         root_url = self._url_set_query_param( |  | ||||||
|             self._base_url + "/goto.php", "target", f"crs_{course_id}" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         if not self._is_course_id_valid(root_url, course_id): |  | ||||||
|             raise FatalException( |  | ||||||
|                 "Invalid course id? I didn't find anything looking like a course!" |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         # And treat it as a folder |  | ||||||
|         entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), root_url) |  | ||||||
|         return self._iterate_entries_to_download_infos(entries) |  | ||||||
|  |  | ||||||
|     def _is_course_id_valid(self, root_url: str, course_id: str) -> bool: |  | ||||||
|         response: requests.Response = self._session.get(root_url) |  | ||||||
|         # We were redirected ==> Non-existant ID |  | ||||||
|         if course_id not in response.url: |  | ||||||
|             return False |  | ||||||
|  |  | ||||||
|         link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link") |  | ||||||
|         if not link_element: |  | ||||||
|             return False |  | ||||||
|         # It wasn't a course but a category list, forum, etc. |  | ||||||
|         return "crs_" in link_element.get("value") |  | ||||||
|  |  | ||||||
|     def find_course_name(self, course_id: str) -> Optional[str]: |  | ||||||
|         """ |  | ||||||
|         Returns the name of a given course. None if it is not a valid course |  | ||||||
|         or it could not be found. |  | ||||||
|         """ |  | ||||||
|         course_url = self._url_set_query_param( |  | ||||||
|             self._base_url + "/goto.php", "target", f"crs_{course_id}" |  | ||||||
|         ) |  | ||||||
|         return self.find_element_name(course_url) |  | ||||||
|  |  | ||||||
|     def find_element_name(self, url: str) -> Optional[str]: |  | ||||||
|         """ |  | ||||||
|         Returns the name of the element at the given URL, if it can find one. |  | ||||||
|         """ |  | ||||||
|         focus_element: bs4.Tag = self._get_page(url, {}).find(id="il_mhead_t_focus") |  | ||||||
|         if not focus_element: |  | ||||||
|             return None |  | ||||||
|         return focus_element.text |  | ||||||
|  |  | ||||||
|     def crawl_personal_desktop(self) -> List[IliasDownloadInfo]: |  | ||||||
|         """ |  | ||||||
|         Crawls the ILIAS personal desktop (and every subelements that can be reached from there). |  | ||||||
|  |  | ||||||
|         Raises: |  | ||||||
|             FatalException: if an unrecoverable error occurs |  | ||||||
|         """ |  | ||||||
|         entries: List[IliasCrawlerEntry] = self._crawl_folder( |  | ||||||
|             Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI" |  | ||||||
|         ) |  | ||||||
|         return self._iterate_entries_to_download_infos(entries) |  | ||||||
|  |  | ||||||
|     def _iterate_entries_to_download_infos( |  | ||||||
|             self, |  | ||||||
|             entries: List[IliasCrawlerEntry] |  | ||||||
|     ) -> List[IliasDownloadInfo]: |  | ||||||
|         result: List[IliasDownloadInfo] = [] |  | ||||||
|         entries_to_process: List[IliasCrawlerEntry] = entries.copy() |  | ||||||
|         while len(entries_to_process) > 0: |  | ||||||
|             entry = entries_to_process.pop() |  | ||||||
|  |  | ||||||
|             if entry.entry_type == IliasElementType.EXTERNAL_LINK: |  | ||||||
|                 PRETTY.not_searching(entry.path, "external link") |  | ||||||
|                 continue |  | ||||||
|             if entry.entry_type == IliasElementType.FORUM: |  | ||||||
|                 PRETTY.not_searching(entry.path, "forum") |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             if entry.entry_type.is_folder() and not self.dir_filter(entry.path, entry.entry_type): |  | ||||||
|                 PRETTY.not_searching(entry.path, "user filter") |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             download_info = entry.to_download_info() |  | ||||||
|             if download_info is not None: |  | ||||||
|                 result.append(download_info) |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             url = entry.url() |  | ||||||
|  |  | ||||||
|             if url is None: |  | ||||||
|                 PRETTY.warning(f"Could not find url for {str(entry.path)!r}, skipping it") |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             PRETTY.searching(entry.path) |  | ||||||
|  |  | ||||||
|             if entry.entry_type == IliasElementType.EXERCISE_FOLDER: |  | ||||||
|                 entries_to_process += self._crawl_exercises(entry.path, url) |  | ||||||
|                 continue |  | ||||||
|             if entry.entry_type == IliasElementType.REGULAR_FOLDER: |  | ||||||
|                 entries_to_process += self._crawl_folder(entry.path, url) |  | ||||||
|                 continue |  | ||||||
|             if entry.entry_type == IliasElementType.VIDEO_FOLDER: |  | ||||||
|                 entries_to_process += self._crawl_video_directory(entry.path, url) |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|         return result |  | ||||||
|  |  | ||||||
|     def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawl all files in a folder-like element. |  | ||||||
|         """ |  | ||||||
|         soup = self._get_page(url, {}) |  | ||||||
|  |  | ||||||
|         if soup.find(id="headerimage"): |  | ||||||
|             element: bs4.Tag = soup.find(id="headerimage") |  | ||||||
|             if "opencast" in element.attrs["src"].lower(): |  | ||||||
|                 PRETTY.warning(f"Switched to crawling a video at {folder_path}") |  | ||||||
|                 if not self.dir_filter(folder_path, IliasElementType.VIDEO_FOLDER): |  | ||||||
|                     PRETTY.not_searching(folder_path, "user filter") |  | ||||||
|                     return [] |  | ||||||
|                 return self._crawl_video_directory(folder_path, url) |  | ||||||
|  |  | ||||||
|         result: List[IliasCrawlerEntry] = [] |  | ||||||
|  |  | ||||||
|         # Fetch all links and throw them to the general interpreter |  | ||||||
|         links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle") |  | ||||||
|         for link in links: |  | ||||||
|             abs_url = self._abs_url_from_link(link) |  | ||||||
|             element_path = Path(folder_path, _sanitize_path_name(link.getText().strip())) |  | ||||||
|             element_type = self._find_type_from_link(element_path, link, abs_url) |  | ||||||
|  |  | ||||||
|             if element_type == IliasElementType.REGULAR_FILE: |  | ||||||
|                 result += self._crawl_file(folder_path, link, abs_url) |  | ||||||
|             elif element_type is not None: |  | ||||||
|                 result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)] |  | ||||||
|             else: |  | ||||||
|                 PRETTY.warning(f"Found element without a type at {str(element_path)!r}") |  | ||||||
|  |  | ||||||
|         return result |  | ||||||
|  |  | ||||||
|     def _abs_url_from_link(self, link_tag: bs4.Tag) -> str: |  | ||||||
|         """ |  | ||||||
|         Create an absolute url from an <a> tag. |  | ||||||
|         """ |  | ||||||
|         return urljoin(self._base_url, link_tag.get("href")) |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _find_type_from_link( |  | ||||||
|             path: Path, |  | ||||||
|             link_element: bs4.Tag, |  | ||||||
|             url: str |  | ||||||
|     ) -> Optional[IliasElementType]: |  | ||||||
|         """ |  | ||||||
|         Decides which sub crawler to use for a given top level element. |  | ||||||
|         """ |  | ||||||
|         parsed_url = urlparse(url) |  | ||||||
|         LOGGER.debug("Parsed url: %r", parsed_url) |  | ||||||
|  |  | ||||||
|         # file URLs contain "target=file" |  | ||||||
|         if "target=file_" in parsed_url.query: |  | ||||||
|             return IliasElementType.REGULAR_FILE |  | ||||||
|  |  | ||||||
|         # Skip forums |  | ||||||
|         if "cmd=showThreads" in parsed_url.query: |  | ||||||
|             return IliasElementType.FORUM |  | ||||||
|  |  | ||||||
|         # Everything with a ref_id can *probably* be opened to reveal nested things |  | ||||||
|         # video groups, directories, exercises, etc |  | ||||||
|         if "ref_id=" in parsed_url.query: |  | ||||||
|             return IliasCrawler._find_type_from_folder_like(link_element, url) |  | ||||||
|  |  | ||||||
|         PRETTY.warning( |  | ||||||
|             "Got unknown element type in switch. I am not sure what horror I found on the" |  | ||||||
|             f" ILIAS page. The element was at {str(path)!r} and it is {link_element!r})" |  | ||||||
|         ) |  | ||||||
|         return None |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _find_type_from_folder_like(link_element: bs4.Tag, url: str) -> Optional[IliasElementType]: |  | ||||||
|         """ |  | ||||||
|         Try crawling something that looks like a folder. |  | ||||||
|         """ |  | ||||||
|         # pylint: disable=too-many-return-statements |  | ||||||
|  |  | ||||||
|         # We look for the outer div of our inner link, to find information around it |  | ||||||
|         # (mostly the icon) |  | ||||||
|         for parent in link_element.parents: |  | ||||||
|             if "ilContainerListItemOuter" in parent["class"]: |  | ||||||
|                 found_parent = parent |  | ||||||
|                 break |  | ||||||
|  |  | ||||||
|         if found_parent is None: |  | ||||||
|             PRETTY.warning(f"Could not find element icon for {url!r}") |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|         # Find the small descriptive icon to figure out the type |  | ||||||
|         img_tag: Optional[bs4.Tag] = found_parent.select_one("img.ilListItemIcon") |  | ||||||
|  |  | ||||||
|         if img_tag is None: |  | ||||||
|             PRETTY.warning(f"Could not find image tag for {url!r}") |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|         if "opencast" in str(img_tag["alt"]).lower(): |  | ||||||
|             return IliasElementType.VIDEO_FOLDER |  | ||||||
|  |  | ||||||
|         if str(img_tag["src"]).endswith("icon_exc.svg"): |  | ||||||
|             return IliasElementType.EXERCISE_FOLDER |  | ||||||
|  |  | ||||||
|         if str(img_tag["src"]).endswith("icon_webr.svg"): |  | ||||||
|             return IliasElementType.EXTERNAL_LINK |  | ||||||
|  |  | ||||||
|         if str(img_tag["src"]).endswith("frm.svg"): |  | ||||||
|             return IliasElementType.FORUM |  | ||||||
|  |  | ||||||
|         return IliasElementType.REGULAR_FOLDER |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _crawl_file(path: Path, link_element: bs4.Tag, url: str) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawls a file. |  | ||||||
|         """ |  | ||||||
|         # Files have a list of properties (type, modification date, size, etc.) |  | ||||||
|         # In a series of divs. |  | ||||||
|         # Find the parent containing all those divs, so we can filter our what we need |  | ||||||
|         properties_parent: bs4.Tag = link_element.findParent( |  | ||||||
|             "div", {"class": lambda x: "il_ContainerListItem" in x} |  | ||||||
|         ).select_one(".il_ItemProperties") |  | ||||||
|         # The first one is always the filetype |  | ||||||
|         file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() |  | ||||||
|  |  | ||||||
|         # The rest does not have a stable order. Grab the whole text and reg-ex the date |  | ||||||
|         # out of it |  | ||||||
|         all_properties_text = properties_parent.getText().strip() |  | ||||||
|         modification_date_match = re.search( |  | ||||||
|             r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)", |  | ||||||
|             all_properties_text |  | ||||||
|         ) |  | ||||||
|         if modification_date_match is None: |  | ||||||
|             modification_date = None |  | ||||||
|             PRETTY.warning(f"Could not extract start date from {all_properties_text!r}") |  | ||||||
|         else: |  | ||||||
|             modification_date_str = modification_date_match.group(1) |  | ||||||
|             modification_date = demangle_date(modification_date_str) |  | ||||||
|  |  | ||||||
|         # Grab the name from the link text |  | ||||||
|         name = _sanitize_path_name(link_element.getText()) |  | ||||||
|         full_path = Path(path, name + "." + file_type) |  | ||||||
|  |  | ||||||
|         return [ |  | ||||||
|             IliasCrawlerEntry(full_path, url, IliasElementType.REGULAR_FILE, modification_date) |  | ||||||
|         ] |  | ||||||
|  |  | ||||||
|     def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawl the video overview site. |  | ||||||
|         """ |  | ||||||
|         initial_soup = self._get_page(url, {}) |  | ||||||
|  |  | ||||||
|         # The page is actually emtpy but contains a much needed token in the link below. |  | ||||||
|         # That token can be used to fetch the *actual* video listing |  | ||||||
|         content_link: bs4.Tag = initial_soup.select_one("#tab_series a") |  | ||||||
|         # Fetch the actual video listing. The given parameters return all videos (max 800) |  | ||||||
|         # in a standalone html page |  | ||||||
|         video_list_soup = self._get_page( |  | ||||||
|             self._abs_url_from_link(content_link), |  | ||||||
|             {"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         # If we find a page selected, we probably need to respect pagination |  | ||||||
|         if self._is_paginated_video_page(video_list_soup): |  | ||||||
|             second_stage_url = self._abs_url_from_link(content_link) |  | ||||||
|  |  | ||||||
|             return self._crawl_paginated_video_directory( |  | ||||||
|                 video_dir_path, video_list_soup, second_stage_url |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         return self._crawl_video_directory_second_stage(video_dir_path, video_list_soup) |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _is_paginated_video_page(soup: bs4.BeautifulSoup) -> bool: |  | ||||||
|         return soup.find(id=re.compile(r"tab_page_sel.+")) is not None |  | ||||||
|  |  | ||||||
|     def _crawl_paginated_video_directory( |  | ||||||
|             self, |  | ||||||
|             video_dir_path: Path, |  | ||||||
|             paged_video_list_soup: bs4.BeautifulSoup, |  | ||||||
|             second_stage_url: str |  | ||||||
|     ) -> List[IliasCrawlerEntry]: |  | ||||||
|         LOGGER.info("Found paginated video page, trying 800 elements") |  | ||||||
|  |  | ||||||
|         # Try to find the table id. This can be used to build the query parameter indicating |  | ||||||
|         # you want 800 elements |  | ||||||
|  |  | ||||||
|         table_element: bs4.Tag = paged_video_list_soup.find( |  | ||||||
|             name="table", id=re.compile(r"tbl_xoct_.+") |  | ||||||
|         ) |  | ||||||
|         if table_element is None: |  | ||||||
|             PRETTY.warning( |  | ||||||
|                 "Could not increase elements per page (table not found)." |  | ||||||
|                 " Some might not be crawled!" |  | ||||||
|             ) |  | ||||||
|             return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup) |  | ||||||
|  |  | ||||||
|         match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) |  | ||||||
|         if match is None: |  | ||||||
|             PRETTY.warning( |  | ||||||
|                 "Could not increase elements per page (table id not found)." |  | ||||||
|                 " Some might not be crawled!" |  | ||||||
|             ) |  | ||||||
|             return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup) |  | ||||||
|         table_id = match.group(1) |  | ||||||
|  |  | ||||||
|         extended_video_page = self._get_page( |  | ||||||
|             second_stage_url, |  | ||||||
|             {f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         if self._is_paginated_video_page(extended_video_page): |  | ||||||
|             PRETTY.warning( |  | ||||||
|                 "800 elements do not seem to be enough (or I failed to fetch that many)." |  | ||||||
|                 " I will miss elements." |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         return self._crawl_video_directory_second_stage(video_dir_path, extended_video_page) |  | ||||||
|  |  | ||||||
|     def _crawl_video_directory_second_stage( |  | ||||||
|             self, |  | ||||||
|             video_dir_path: Path, |  | ||||||
|             video_list_soup: bs4.BeautifulSoup |  | ||||||
|     ) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawls the "second stage" video page. This page contains the actual video urls. |  | ||||||
|         """ |  | ||||||
|         direct_download_links: List[bs4.Tag] = video_list_soup.findAll( |  | ||||||
|             name="a", text=re.compile(r"\s*Download\s*") |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         # Video start links are marked with an "Abspielen" link |  | ||||||
|         video_links: List[bs4.Tag] = video_list_soup.findAll( |  | ||||||
|             name="a", text=re.compile(r"\s*Abspielen\s*") |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         results: List[IliasCrawlerEntry] = [] |  | ||||||
|  |  | ||||||
|         # We can download everything directly! |  | ||||||
|         # FIXME: Sadly the download button is currently broken, so never do that |  | ||||||
|         if False and len(direct_download_links) == len(video_links): |  | ||||||
|             for link in direct_download_links: |  | ||||||
|                 results += self._crawl_single_video(video_dir_path, link, True) |  | ||||||
|         else: |  | ||||||
|             for link in video_links: |  | ||||||
|                 results += self._crawl_single_video(video_dir_path, link, False) |  | ||||||
|  |  | ||||||
|         return results |  | ||||||
|  |  | ||||||
|     def _crawl_single_video( |  | ||||||
|             self, |  | ||||||
|             parent_path: Path, |  | ||||||
|             link: bs4.Tag, |  | ||||||
|             direct_download: bool |  | ||||||
|     ) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawl a single video based on its "Abspielen" link from the video listing. |  | ||||||
|         """ |  | ||||||
|         # The link is part of a table with multiple columns, describing metadata. |  | ||||||
|         # 6th child (1 indexed) is the modification time string |  | ||||||
|         modification_string = link.parent.parent.parent.select_one( |  | ||||||
|             "td.std:nth-child(6)" |  | ||||||
|         ).getText().strip() |  | ||||||
|         modification_time = datetime.datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") |  | ||||||
|  |  | ||||||
|         title = link.parent.parent.parent.select_one( |  | ||||||
|             "td.std:nth-child(3)" |  | ||||||
|         ).getText().strip() |  | ||||||
|         title += ".mp4" |  | ||||||
|  |  | ||||||
|         video_path: Path = Path(parent_path, _sanitize_path_name(title)) |  | ||||||
|  |  | ||||||
|         video_url = self._abs_url_from_link(link) |  | ||||||
|  |  | ||||||
|         # The video had a direct download button we can use instead |  | ||||||
|         if direct_download: |  | ||||||
|             LOGGER.debug("Using direct download for video %r", str(video_path)) |  | ||||||
|             return [IliasCrawlerEntry( |  | ||||||
|                 video_path, video_url, IliasElementType.VIDEO_FILE, modification_time |  | ||||||
|             )] |  | ||||||
|  |  | ||||||
|         return [IliasCrawlerEntry( |  | ||||||
|             video_path, |  | ||||||
|             self._crawl_video_url_from_play_link(video_url), |  | ||||||
|             IliasElementType.VIDEO_FILE, |  | ||||||
|             modification_time |  | ||||||
|         )] |  | ||||||
|  |  | ||||||
|     def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Optional[str]]: |  | ||||||
|         def inner() -> Optional[str]: |  | ||||||
|             # Fetch the actual video page. This is a small wrapper page initializing a javscript |  | ||||||
|             # player. Sadly we can not execute that JS. The actual video stream url is nowhere |  | ||||||
|             # on the page, but defined in a JS object inside a script tag, passed to the player |  | ||||||
|             # library. |  | ||||||
|             # We do the impossible and RegEx the stream JSON object out of the page's HTML source |  | ||||||
|             video_page_soup = soupify(self._session.get(play_url)) |  | ||||||
|             regex: re.Pattern = re.compile( |  | ||||||
|                 r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE |  | ||||||
|             ) |  | ||||||
|             json_match = regex.search(str(video_page_soup)) |  | ||||||
|  |  | ||||||
|             if json_match is None: |  | ||||||
|                 PRETTY.warning(f"Could not find json stream info for {play_url!r}") |  | ||||||
|                 return None |  | ||||||
|             json_str = json_match.group(1) |  | ||||||
|  |  | ||||||
|             # parse it |  | ||||||
|             json_object = json.loads(json_str) |  | ||||||
|             # and fetch the video url! |  | ||||||
|             video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"] |  | ||||||
|             return video_url |  | ||||||
|         return inner |  | ||||||
|  |  | ||||||
|     def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawl files offered for download in exercises. |  | ||||||
|         """ |  | ||||||
|         soup = self._get_page(url, {}) |  | ||||||
|  |  | ||||||
|         results: List[IliasCrawlerEntry] = [] |  | ||||||
|  |  | ||||||
|         # Each assignment is in an accordion container |  | ||||||
|         assignment_containers: List[bs4.Tag] = soup.select(".il_VAccordionInnerContainer") |  | ||||||
|  |  | ||||||
|         for container in assignment_containers: |  | ||||||
|             # Fetch the container name out of the header to use it in the path |  | ||||||
|             container_name = container.select_one(".ilAssignmentHeader").getText().strip() |  | ||||||
|             # Find all download links in the container (this will contain all the files) |  | ||||||
|             files: List[bs4.Tag] = container.findAll( |  | ||||||
|                 name="a", |  | ||||||
|                 # download links contain the given command class |  | ||||||
|                 attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, |  | ||||||
|                 text="Download" |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|             LOGGER.debug("Found exercise container %r", container_name) |  | ||||||
|  |  | ||||||
|             # Grab each file as you now have the link |  | ||||||
|             for file_link in files: |  | ||||||
|                 # Two divs, side by side. Left is the name, right is the link ==> get left |  | ||||||
|                 # sibling |  | ||||||
|                 file_name = file_link.parent.findPrevious(name="div").getText().strip() |  | ||||||
|                 file_name = _sanitize_path_name(file_name) |  | ||||||
|                 url = self._abs_url_from_link(file_link) |  | ||||||
|  |  | ||||||
|                 LOGGER.debug("Found file %r at %r", file_name, url) |  | ||||||
|  |  | ||||||
|                 results.append(IliasCrawlerEntry( |  | ||||||
|                     Path(element_path, container_name, file_name), |  | ||||||
|                     url, |  | ||||||
|                     IliasElementType.REGULAR_FILE, |  | ||||||
|                     None  # We do not have any timestamp |  | ||||||
|                 )) |  | ||||||
|  |  | ||||||
|         return results |  | ||||||
|  |  | ||||||
|     def _get_page(self, url: str, params: Dict[str, Any], |  | ||||||
|                   retry_count: int = 0) -> bs4.BeautifulSoup: |  | ||||||
|         """ |  | ||||||
|         Fetches a page from ILIAS, authenticating when needed. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         if retry_count >= 4: |  | ||||||
|             raise FatalException("Could not get a proper page after 4 tries. " |  | ||||||
|                                  "Maybe your URL is wrong, authentication fails continuously, " |  | ||||||
|                                  "your ILIAS connection is spotty or ILIAS is not well.") |  | ||||||
|  |  | ||||||
|         LOGGER.debug("Fetching %r", url) |  | ||||||
|  |  | ||||||
|         response = self._session.get(url, params=params) |  | ||||||
|         content_type = response.headers["content-type"] |  | ||||||
|  |  | ||||||
|         if not content_type.startswith("text/html"): |  | ||||||
|             raise FatalException( |  | ||||||
|                 f"Invalid content type {content_type} when crawling ilias page" |  | ||||||
|                 " {url!r} with {params!r}" |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         soup = soupify(response) |  | ||||||
|  |  | ||||||
|         if self._is_logged_in(soup): |  | ||||||
|             return soup |  | ||||||
|  |  | ||||||
|         LOGGER.info("Not authenticated, changing that...") |  | ||||||
|  |  | ||||||
|         self._authenticator.authenticate(self._session) |  | ||||||
|  |  | ||||||
|         return self._get_page(url, params, retry_count + 1) |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _is_logged_in(soup: bs4.BeautifulSoup) -> bool: |  | ||||||
|         # Normal ILIAS pages |  | ||||||
|         userlog = soup.find("li", {"id": "userlog"}) |  | ||||||
|         if userlog is not None: |  | ||||||
|             LOGGER.debug("Auth: Found #userlog") |  | ||||||
|             return True |  | ||||||
|         # Video listing embeds do not have complete ILIAS html. Try to match them by |  | ||||||
|         # their video listing table |  | ||||||
|         video_table = soup.find( |  | ||||||
|             recursive=True, |  | ||||||
|             name="table", |  | ||||||
|             attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} |  | ||||||
|         ) |  | ||||||
|         if video_table is not None: |  | ||||||
|             LOGGER.debug("Auth: Found #tbl_xoct.+") |  | ||||||
|             return True |  | ||||||
|         # The individual video player wrapper page has nothing of the above. |  | ||||||
|         # Match it by its playerContainer. |  | ||||||
|         if soup.select_one("#playerContainer") is not None: |  | ||||||
|             LOGGER.debug("Auth: Found #playerContainer") |  | ||||||
|             return True |  | ||||||
|         return False |  | ||||||
| @@ -1,51 +0,0 @@ | |||||||
| """ |  | ||||||
| Helper methods to demangle an ILIAS date. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import datetime |  | ||||||
| import locale |  | ||||||
| import logging |  | ||||||
| import re |  | ||||||
| from typing import Optional |  | ||||||
|  |  | ||||||
| from ..logging import PrettyLogger |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
def demangle_date(date: str) -> Optional[datetime.datetime]:
    """
    Demangle a given date in one of the following formats:
    "Gestern, HH:MM"
    "Heute, HH:MM"
    "Morgen, HH:MM"
    "dd. mon yyyy, HH:MM"

    The relative day names (German or English, in any letter case) are replaced
    by the matching calendar date before the string is parsed with the format
    "%d. %b %Y, %H:%M". Parsing is attempted with the de_DE.UTF-8 locale; the
    previous locale is restored afterwards.

    Returns the parsed datetime, or None (after a warning) if the string could
    not be parsed.
    """
    day_format = "%d. %b %Y"

    saved = locale.setlocale(locale.LC_ALL)
    try:
        try:
            locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
        except locale.Error:
            PRETTY.warning(
                "Could not set language to german. Assuming you use english everywhere."
            )

        # Collapse all whitespace runs so the format string below can match.
        date = re.sub(r"\s+", " ", date)

        # The flags must be passed by keyword: the fourth positional parameter
        # of re.sub is the replacement count, not the flags.
        date = re.sub(
            "Gestern|Yesterday", _yesterday().strftime(day_format), date, flags=re.I
        )
        date = re.sub(
            "Heute|Today", datetime.date.today().strftime(day_format), date, flags=re.I
        )
        date = re.sub(
            "Morgen|Tomorrow", _tomorrow().strftime(day_format), date, flags=re.I
        )
        return datetime.datetime.strptime(date, "%d. %b %Y, %H:%M")
    except ValueError:
        PRETTY.warning(f"Could not parse date {date!r}")
        return None
    finally:
        locale.setlocale(locale.LC_ALL, saved)
|  |  | ||||||
|  |  | ||||||
| def _yesterday() -> datetime.date: |  | ||||||
|     return datetime.date.today() - datetime.timedelta(days=1) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _tomorrow() -> datetime.date: |  | ||||||
|     return datetime.date.today() + datetime.timedelta(days=1) |  | ||||||
| @@ -1,162 +0,0 @@ | |||||||
| """Contains a downloader for ILIAS.""" |  | ||||||
|  |  | ||||||
| import datetime |  | ||||||
| import logging |  | ||||||
| import math |  | ||||||
| import os |  | ||||||
| from pathlib import Path, PurePath |  | ||||||
| from typing import Callable, List, Optional, Union |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from ..logging import PrettyLogger |  | ||||||
| from ..organizer import Organizer |  | ||||||
| from ..tmp_dir import TmpDir |  | ||||||
| from ..transform import Transformable |  | ||||||
| from ..utils import soupify, stream_to_path |  | ||||||
| from .authenticators import IliasAuthenticator |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
class ContentTypeException(Exception):
    """Raised when an ILIAS element has a content type that can not be handled."""
|  |  | ||||||
|  |  | ||||||
class IliasDownloadInfo(Transformable):
    """
    Describes one file that is to be downloaded from ILIAS.

    The url is always stored as a zero-argument callable returning the url (or
    None), so that looking it up can be deferred until the download starts.
    """

    def __init__(
            self,
            path: PurePath,
            url: Union[str, Callable[[], Optional[str]]],
            modifcation_date: Optional[datetime.datetime]
    ):
        super().__init__(path)
        if not isinstance(url, str):
            # Already a resolver, keep it untouched.
            self.url: Callable[[], Optional[str]] = url
        else:
            # Wrap the plain string so both cases are used the same way.
            fixed_url = url
            self.url = lambda: fixed_url
        self.modification_date = modifcation_date
|  |  | ||||||
|  |  | ||||||
| IliasDownloadStrategy = Callable[[Organizer, IliasDownloadInfo], bool] |  | ||||||
|  |  | ||||||
|  |  | ||||||
def download_everything(organizer: Organizer, info: IliasDownloadInfo) -> bool:
    # pylint: disable=unused-argument
    """
    Download strategy that accepts every file, ignoring both arguments.
    """
    accept = True
    return accept
|  |  | ||||||
|  |  | ||||||
def download_modified_or_new(organizer: Organizer, info: IliasDownloadInfo) -> bool:
    """
    Download strategy accepting files that are new or newer than the local copy.

    A file is accepted when it does not exist locally yet, when no modification
    date is known for it, or when its modification date is later than the
    modification time of the local file. Otherwise it is reported as ignored.
    """
    local_file = organizer.resolve(info.path)

    if local_file.exists() and info.modification_date is not None:
        local_mtime_seconds = local_file.stat().st_mtime

        # Only an equal or older remote date makes us skip the file.
        if info.modification_date.timestamp() <= local_mtime_seconds:
            PRETTY.ignored_file(info.path, "local file has newer or equal modification time")
            return False

    return True
|  |  | ||||||
|  |  | ||||||
class IliasDownloader:
    # pylint: disable=too-many-arguments
    """Downloads the files described by IliasDownloadInfo objects from ILIAS."""

    def __init__(
            self,
            tmp_dir: TmpDir,
            organizer: Organizer,
            session: requests.Session,
            authenticator: IliasAuthenticator,
            strategy: IliasDownloadStrategy,
            timeout: int = 5
    ):
        """
        Set up a downloader working on the given session.

        The strategy decides for every file whether it is downloaded at all.

        The timeout applies to the download request only, as bwcloud uses IPv6
        and requests has a problem with that: https://github.com/psf/requests/issues/5522
        """

        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._session = session
        self._authenticator = authenticator
        self._strategy = strategy
        self._timeout = timeout

    def download_all(self, infos: List[IliasDownloadInfo]) -> None:
        """
        Download the given files sequentially, in list order.
        """

        for single_info in infos:
            self.download(single_info)

    def download(self, info: IliasDownloadInfo) -> None:
        """
        Download one file from ILIAS and hand it over to the organizer.

        Files rejected by the strategy are only marked in the organizer. As long
        as a download attempt fails, the session is re-authenticated and the
        attempt repeated; there is no upper bound on the number of attempts.
        """

        LOGGER.debug("Downloading %r", info)

        wanted = self._strategy(self._organizer, info)
        if not wanted:
            self._organizer.mark(info.path)
            return

        tmp_file = self._tmp_dir.new_path()

        while True:
            if self._try_download(info, tmp_file):
                break
            LOGGER.info("Retrying download: %r", info)
            self._authenticator.authenticate(self._session)

        dst_path = self._organizer.accept_file(tmp_file, info.path)
        if dst_path and info.modification_date:
            # Mirror the remote modification time (rounded up to full seconds)
            # as both access and modification time of the local file.
            mtime_seconds = math.ceil(info.modification_date.timestamp())
            os.utime(dst_path, times=(mtime_seconds, mtime_seconds))

    def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool:
        """
        Attempt a single download of info into target.

        Returns False when an HTML page without content disposition came back
        that does not look logged in, True otherwise. Raises a
        ContentTypeException if such a page does look logged in.
        """
        url = info.url()
        if url is None:
            # NOTE(review): True is returned although nothing was written to
            # target, and download() then still passes target to accept_file.
            PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
            return True

        with self._session.get(url, stream=True, timeout=self._timeout) as response:
            content_type = response.headers["content-type"]
            is_web_page = (
                content_type.startswith("text/html")
                and "content-disposition" not in response.headers
            )

            if not is_web_page:
                # Yay, we got the file :)
                stream_to_path(response, target, info.path.name)
                return True

            if self._is_logged_in(soupify(response)):
                raise ContentTypeException("Attempting to download a web page, not a file")

            return False

    @staticmethod
    def _is_logged_in(soup: bs4.BeautifulSoup) -> bool:
        """Report whether the page contains the #userlog list item."""
        return soup.find("li", {"id": "userlog"}) is not None
							
								
								
									
										151
									
								
								PFERD/ipd.py
									
									
									
									
									
								
							
							
						
						
									
										151
									
								
								PFERD/ipd.py
									
									
									
									
									
								
							| @@ -1,151 +0,0 @@ | |||||||
| """ |  | ||||||
| Utility functions and a scraper/downloader for the IPD pages. |  | ||||||
| """ |  | ||||||
| import datetime |  | ||||||
| import logging |  | ||||||
| import math |  | ||||||
| import os |  | ||||||
| from dataclasses import dataclass |  | ||||||
| from pathlib import Path |  | ||||||
| from typing import Callable, List, Optional |  | ||||||
| from urllib.parse import urljoin |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from PFERD.errors import FatalException |  | ||||||
| from PFERD.utils import soupify |  | ||||||
|  |  | ||||||
| from .logging import PrettyLogger |  | ||||||
| from .organizer import Organizer |  | ||||||
| from .tmp_dir import TmpDir |  | ||||||
| from .transform import Transformable |  | ||||||
| from .utils import stream_to_path |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
@dataclass
class IpdDownloadInfo(Transformable):
    """
    Describes a single downloadable entry found on an IPD page.
    """
    # Absolute url the entry is downloaded from.
    url: str
    # Date the page states for the entry, None when none could be read.
    modification_date: Optional[datetime.datetime]
|  |  | ||||||
|  |  | ||||||
| IpdDownloadStrategy = Callable[[Organizer, IpdDownloadInfo], bool] |  | ||||||
|  |  | ||||||
|  |  | ||||||
def ipd_download_new_or_modified(organizer: Organizer, info: IpdDownloadInfo) -> bool:
    """
    Download strategy accepting files that are new or newer than the local copy.

    An existing file is only accepted again when a modification date is known
    and it is later than the local file's modification time; every rejection
    is reported as an ignored file.
    """
    local_file = organizer.resolve(info.path)
    if not local_file.exists():
        return True

    remote_date = info.modification_date
    if not remote_date:
        reason = "could not find modification time, file exists"
    elif local_file.stat().st_mtime < remote_date.timestamp():
        # Download if the info is newer
        return True
    else:
        reason = "local file has newer or equal modification time"

    PRETTY.ignored_file(info.path, reason)
    return False
|  |  | ||||||
|  |  | ||||||
class IpdCrawler:
    # pylint: disable=too-few-public-methods
    """
    A crawler for IPD pages.
    """

    def __init__(self, base_url: str):
        """
        Remember the url of the page to crawl; relative links are resolved against it.
        """
        self._base_url = base_url

    def _abs_url_from_link(self, link_tag: bs4.Tag) -> str:
        """
        Create an absolute url from an <a> tag.
        """
        return urljoin(self._base_url, link_tag.get("href"))

    @staticmethod
    def _modification_date_from_link(link: bs4.Tag) -> Optional[datetime.datetime]:
        """
        Read the date belonging to a link from the first cell of its table row.

        Returns None when the link is not inside a table row, when the row has
        no <td> cell, or when the cell text is not a "dd.mm.yyyy" date.
        """
        enclosing_row = link.findParent(name="tr")
        if not enclosing_row:
            return None

        first_cell = enclosing_row.find(name="td")
        if first_cell is None:
            # E.g. a row consisting of header cells only.
            return None

        try:
            return datetime.datetime.strptime(first_cell.text, "%d.%m.%Y")
        except ValueError:
            return None

    def crawl(self) -> List[IpdDownloadInfo]:
        """
        Crawl the page given in the constructor.

        Returns one download info for every link whose href ends in "pdf". The
        file name is the last path segment of the href.
        """
        page = soupify(requests.get(self._base_url))

        items: List[IpdDownloadInfo] = []

        for link in page.findAll(name="a", attrs={"href": lambda x: x and x.endswith("pdf")}):
            href: str = link.attrs.get("href")
            name = href.split("/")[-1]

            items.append(IpdDownloadInfo(
                Path(name),
                url=self._abs_url_from_link(link),
                modification_date=self._modification_date_from_link(link)
            ))

        return items
|  |  | ||||||
|  |  | ||||||
class IpdDownloader:
    """
    Downloads the files described by IpdDownloadInfo objects.
    """

    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: IpdDownloadStrategy):
        """
        Set up a downloader with a fresh requests session of its own.
        """
        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._strategy = strategy
        self._session = requests.session()

    def download_all(self, infos: List[IpdDownloadInfo]) -> None:
        """
        Download the given files sequentially, in list order.
        """
        for single_info in infos:
            self.download(single_info)

    def download(self, info: IpdDownloadInfo) -> None:
        """
        Download one file and hand it over to the organizer.

        Files rejected by the strategy are only marked in the organizer. A 403
        response raises a FatalException, any other non-200 response merely
        produces a warning.
        """
        if not self._strategy(self._organizer, info):
            self._organizer.mark(info.path)
            return

        with self._session.get(info.url, stream=True) as response:
            status = response.status_code

            if status == 403:
                raise FatalException("Received 403. Are you not using the KIT VPN?")

            if status != 200:
                PRETTY.warning(f"Could not download file, got response {response.status_code}")
                return

            tmp_file = self._tmp_dir.new_path()
            stream_to_path(response, tmp_file, info.path.name)
            dst_path = self._organizer.accept_file(tmp_file, info.path)

            if dst_path and info.modification_date:
                # Use the stated date (rounded up to full seconds) as both
                # access and modification time of the stored file.
                mtime_seconds = math.ceil(info.modification_date.timestamp())
                os.utime(dst_path, times=(mtime_seconds, mtime_seconds))
							
								
								
									
										97
									
								
								PFERD/limiter.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								PFERD/limiter.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,97 @@ | |||||||
|  | import asyncio | ||||||
|  | import time | ||||||
|  | from contextlib import asynccontextmanager | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from typing import AsyncIterator, Optional | ||||||
|  |  | ||||||
|  |  | ||||||
@dataclass
class Slot:
    """Bookkeeping for one of the task slots handed out by a Limiter."""
    # True while a task occupies the slot.
    active: bool = False
    # time.time() value of the last release, None while the slot was never used.
    last_left: Optional[float] = None


class Limiter:
    """
    Limits how many crawl and download tasks run at the same time.

    Every task occupies one of task_limit slots. Downloads additionally count
    against download_limit. A slot is not reused before task_delay seconds have
    passed since it was last released.
    """

    def __init__(
            self,
            task_limit: int,
            download_limit: int,
            task_delay: float
    ):
        """
        Raises a ValueError if a limit is below 1, if download_limit exceeds
        task_limit or if task_delay is negative.
        """
        if task_limit <= 0:
            raise ValueError("task limit must be at least 1")
        if download_limit <= 0:
            raise ValueError("download limit must be at least 1")
        if download_limit > task_limit:
            raise ValueError("download limit can't be greater than task limit")
        if task_delay < 0:
            raise ValueError("Task delay must not be negative")

        self._slots = [Slot() for _ in range(task_limit)]
        self._downloads = download_limit
        self._delay = task_delay

        # Guards the slots and the download counter; waiters are woken on release.
        self._condition = asyncio.Condition()

    def _acquire_slot(self) -> Optional[Slot]:
        """Mark the first inactive slot as active and return it, or None if all are busy."""
        for slot in self._slots:
            if not slot.active:
                slot.active = True
                return slot

        return None

    async def _wait_for_slot_delay(self, slot: Slot) -> None:
        """Sleep until the task delay since the slot's last release has passed."""
        if slot.last_left is not None:
            delay = slot.last_left + self._delay - time.time()
            if delay > 0:
                await asyncio.sleep(delay)

    def _release_slot(self, slot: Slot) -> None:
        """Record the release time of the slot and mark it inactive."""
        slot.last_left = time.time()
        slot.active = False

    @asynccontextmanager
    async def limit_crawl(self) -> AsyncIterator[None]:
        """
        Context manager that occupies a task slot for the duration of its body.

        Waits until a slot is free and the slot's delay has passed.
        """
        slot: Slot
        async with self._condition:
            while True:
                if found_slot := self._acquire_slot():
                    slot = found_slot
                    break
                await self._condition.wait()

        try:
            # The delay is awaited inside the try block: if the task is
            # cancelled while sleeping, the slot must still be released.
            await self._wait_for_slot_delay(slot)
            yield
        finally:
            async with self._condition:
                self._release_slot(slot)
                self._condition.notify_all()

    @asynccontextmanager
    async def limit_download(self) -> AsyncIterator[None]:
        """
        Context manager that occupies a task slot and one unit of the download
        limit for the duration of its body.

        Waits until both are available and the slot's delay has passed.
        """
        slot: Slot
        async with self._condition:
            while True:
                if self._downloads <= 0:
                    await self._condition.wait()
                    continue

                if found_slot := self._acquire_slot():
                    slot = found_slot
                    self._downloads -= 1
                    break

                await self._condition.wait()

        try:
            # See limit_crawl: a cancellation during the delay must give back
            # both the slot and the download unit.
            await self._wait_for_slot_delay(slot)
            yield
        finally:
            async with self._condition:
                self._release_slot(slot)
                self._downloads += 1
                self._condition.notify_all()
| @@ -1,41 +0,0 @@ | |||||||
| """ |  | ||||||
| Contains a Location class for objects with an inherent path. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| from pathlib import Path, PurePath |  | ||||||
|  |  | ||||||
|  |  | ||||||
class ResolveException(Exception):
    """Signals a failure while resolving a file."""
    # TODO take care of this when doing exception handling
|  |  | ||||||
|  |  | ||||||
class Location:
    """
    An object bound to a directory path, against which other paths are resolved.
    """

    def __init__(self, path: Path):
        # Stored in resolved form so later containment checks compare like with like.
        self._path = path.resolve()

    @property
    def path(self) -> Path:
        """
        The resolved path this object is located at.
        """

        return self._path

    def resolve(self, target: PurePath) -> Path:
        """
        Resolve target relative to this location's path.

        Returns the resolved path if this location's path is one of its parents.
        Raises a [ResolveException] otherwise.
        """
        candidate = (self.path / target).resolve()

        # TODO Make this less inefficient
        if self.path in candidate.parents:
            return candidate

        raise ResolveException(f"Path {target} is not inside directory {self.path}")
							
								
								
									
										377
									
								
								PFERD/logging.py
									
									
									
									
									
								
							
							
						
						
									
										377
									
								
								PFERD/logging.py
									
									
									
									
									
								
							| @@ -1,187 +1,290 @@ | |||||||
| """ | import asyncio | ||||||
| Contains a few logger utility functions and implementations. | import sys | ||||||
| """ | import traceback | ||||||
|  | from contextlib import AbstractContextManager, asynccontextmanager, contextmanager | ||||||
|  | from typing import AsyncIterator, Iterator, List, Optional | ||||||
|  |  | ||||||
| import logging | from rich.console import Console, Group | ||||||
| from pathlib import Path | from rich.live import Live | ||||||
| from typing import List, Optional | from rich.markup import escape | ||||||
|  |  | ||||||
| from rich import print as rich_print |  | ||||||
| from rich._log_render import LogRender |  | ||||||
| from rich.console import Console |  | ||||||
| from rich.panel import Panel | from rich.panel import Panel | ||||||
| from rich.style import Style | from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID, TextColumn, TimeRemainingColumn, | ||||||
| from rich.text import Text |                            TransferSpeedColumn) | ||||||
| from rich.theme import Theme | from rich.table import Column | ||||||
|  |  | ||||||
| from .download_summary import DownloadSummary |  | ||||||
| from .utils import PathLike, to_path |  | ||||||
|  |  | ||||||
| STYLE = "{" |  | ||||||
| FORMAT = "[{levelname:<7}] {message}" |  | ||||||
| DATE_FORMAT = "%F %T" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def enable_logging(name: str = "PFERD", level: int = logging.INFO) -> None: | class ProgressBar: | ||||||
|     """ |     def __init__(self, progress: Progress, taskid: TaskID): | ||||||
|     Enable and configure logging via the logging module. |         self._progress = progress | ||||||
|     """ |         self._taskid = taskid | ||||||
|  |  | ||||||
|     logger = logging.getLogger(name) |     def advance(self, amount: float = 1) -> None: | ||||||
|     logger.setLevel(level) |         self._progress.advance(self._taskid, advance=amount) | ||||||
|     logger.addHandler(RichLoggingHandler(level=level)) |  | ||||||
|  |  | ||||||
|     # This should be logged by our own handler, and not the root logger's |     def set_total(self, total: float) -> None: | ||||||
|     # default handler, so we don't pass it on to the root logger. |         self._progress.update(self._taskid, total=total) | ||||||
|     logger.propagate = False |         self._progress.start_task(self._taskid) | ||||||
|  |  | ||||||
|  |  | ||||||
| class RichLoggingHandler(logging.Handler): | class Log: | ||||||
|     """ |     STATUS_WIDTH = 11 | ||||||
|     A logging handler that uses rich for highlighting |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, level: int) -> None: |     def __init__(self) -> None: | ||||||
|         super().__init__(level=level) |         self.console = Console(highlight=False) | ||||||
|         self.console = Console(theme=Theme({ |  | ||||||
|             "logging.level.warning": Style(color="yellow") |  | ||||||
|         })) |  | ||||||
|         self._log_render = LogRender(show_level=True, show_time=False, show_path=False) |  | ||||||
|  |  | ||||||
|     def emit(self, record: logging.LogRecord) -> None: |         self._crawl_progress = Progress( | ||||||
|         """ |             TextColumn("{task.description}", table_column=Column(ratio=1)), | ||||||
|         Invoked by logging. |             BarColumn(), | ||||||
|         """ |             TimeRemainingColumn(), | ||||||
|         log_style = f"logging.level.{record.levelname.lower()}" |             expand=True, | ||||||
|         message = self.format(record) |  | ||||||
|  |  | ||||||
|         level = Text() |  | ||||||
|         level.append(record.levelname, log_style) |  | ||||||
|         message_text = Text.from_markup(message) |  | ||||||
|  |  | ||||||
|         self.console.print( |  | ||||||
|             self._log_render( |  | ||||||
|                 self.console, |  | ||||||
|                 [message_text], |  | ||||||
|                 level=level, |  | ||||||
|         ) |         ) | ||||||
|  |         self._download_progress = Progress( | ||||||
|  |             TextColumn("{task.description}", table_column=Column(ratio=1)), | ||||||
|  |             TransferSpeedColumn(), | ||||||
|  |             DownloadColumn(), | ||||||
|  |             BarColumn(), | ||||||
|  |             TimeRemainingColumn(), | ||||||
|  |             expand=True, | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |         self._live = Live(console=self.console, transient=True) | ||||||
|  |         self._update_live() | ||||||
|  |  | ||||||
| class PrettyLogger: |         self._showing_progress = False | ||||||
|  |         self._progress_suspended = False | ||||||
|  |         self._lock = asyncio.Lock() | ||||||
|  |         self._lines: List[str] = [] | ||||||
|  |  | ||||||
|  |         # Whether different parts of the output are enabled or disabled | ||||||
|  |         self.output_explain = False | ||||||
|  |         self.output_status = True | ||||||
|  |         self.output_not_deleted = True | ||||||
|  |         self.output_report = True | ||||||
|  |  | ||||||
|  |     def _update_live(self) -> None: | ||||||
|  |         elements = [] | ||||||
|  |         if self._crawl_progress.task_ids: | ||||||
|  |             elements.append(self._crawl_progress) | ||||||
|  |         if self._download_progress.task_ids: | ||||||
|  |             elements.append(self._download_progress) | ||||||
|  |  | ||||||
|  |         group = Group(*elements) | ||||||
|  |         self._live.update(group) | ||||||
|  |  | ||||||
|  |     @contextmanager | ||||||
|  |     def show_progress(self) -> Iterator[None]: | ||||||
|  |         if self._showing_progress: | ||||||
|  |             raise RuntimeError("Calling 'show_progress' while already showing progress") | ||||||
|  |  | ||||||
|  |         self._showing_progress = True | ||||||
|  |         try: | ||||||
|  |             with self._live: | ||||||
|  |                 yield | ||||||
|  |         finally: | ||||||
|  |             self._showing_progress = False | ||||||
|  |  | ||||||
|  |     @asynccontextmanager | ||||||
|  |     async def exclusive_output(self) -> AsyncIterator[None]: | ||||||
|  |         if not self._showing_progress: | ||||||
|  |             raise RuntimeError("Calling 'exclusive_output' while not showing progress") | ||||||
|  |  | ||||||
|  |         async with self._lock: | ||||||
|  |             self._progress_suspended = True | ||||||
|  |             self._live.stop() | ||||||
|  |             try: | ||||||
|  |                 yield | ||||||
|  |             finally: | ||||||
|  |                 self._live.start() | ||||||
|  |                 self._progress_suspended = False | ||||||
|  |                 for line in self._lines: | ||||||
|  |                     self.print(line) | ||||||
|  |                 self._lines = [] | ||||||
|  |  | ||||||
|  |     def unlock(self) -> None: | ||||||
|         """ |         """ | ||||||
|     A logger that prints some specially formatted log messages in color. |         Get rid of an exclusive output state. | ||||||
|  |  | ||||||
|  |         This function is meant to let PFERD print log messages after the event | ||||||
|  |         loop was forcibly stopped and if it will not be started up again. After | ||||||
|  |         this is called, it is not safe to use any functions except the logging | ||||||
|  |         functions (print, warn, ...). | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|     def __init__(self, logger: logging.Logger) -> None: |         self._progress_suspended = False | ||||||
|         self.logger = logger |         for line in self._lines: | ||||||
|  |             self.print(line) | ||||||
|  |  | ||||||
|     @staticmethod |     def print(self, text: str) -> None: | ||||||
|     def _format_path(path: PathLike) -> str: |  | ||||||
|         return repr(str(to_path(path))) |  | ||||||
|  |  | ||||||
|     def error(self, message: str) -> None: |  | ||||||
|         """ |         """ | ||||||
|         Print an error message indicating some operation fatally failed. |         Print a normal message. Allows markup. | ||||||
|         """ |  | ||||||
|         self.logger.error( |  | ||||||
|             f"[bold red]{message}[/bold red]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def warning(self, message: str) -> None: |  | ||||||
|         """ |  | ||||||
|         Print a warning message indicating some operation failed, but the error can be recovered |  | ||||||
|         or ignored. |  | ||||||
|         """ |  | ||||||
|         self.logger.warning( |  | ||||||
|             f"[bold yellow]{message}[/bold yellow]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def modified_file(self, path: PathLike) -> None: |  | ||||||
|         """ |  | ||||||
|         An existing file has changed. |  | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         self.logger.info( |         if self._progress_suspended: | ||||||
|             f"[bold magenta]Modified {self._format_path(path)}.[/bold magenta]" |             self._lines.append(text) | ||||||
|         ) |         else: | ||||||
|  |             self.console.print(text) | ||||||
|  |  | ||||||
|     def new_file(self, path: PathLike) -> None: |     # TODO Print errors (and warnings?) to stderr | ||||||
|  |  | ||||||
|  |     def warn(self, text: str) -> None: | ||||||
|         """ |         """ | ||||||
|         A new file has been downloaded. |         Print a warning message. Allows no markup. | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         self.logger.info( |         self.print(f"[bold bright_red]Warning[/] {escape(text)}") | ||||||
|             f"[bold green]Created {self._format_path(path)}.[/bold green]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def deleted_file(self, path: PathLike) -> None: |     def warn_contd(self, text: str) -> None: | ||||||
|         """ |         """ | ||||||
|         A file has been deleted. |         Print further lines of a warning message. Allows no markup. | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         self.logger.info( |         self.print(f"{escape(text)}") | ||||||
|             f"[bold red]Deleted {self._format_path(path)}.[/bold red]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def ignored_file(self, path: PathLike, reason: str) -> None: |     def error(self, text: str) -> None: | ||||||
|         """ |         """ | ||||||
|         File was not downloaded or modified. |         Print an error message. Allows no markup. | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         self.logger.info( |         self.print(f"[bold bright_red]Error[/] [red]{escape(text)}") | ||||||
|             f"[dim]Ignored {self._format_path(path)} " |  | ||||||
|             f"([/dim]{reason}[dim]).[/dim]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def searching(self, path: PathLike) -> None: |     def error_contd(self, text: str) -> None: | ||||||
|         """ |         """ | ||||||
|         A crawler searches a particular object. |         Print further lines of an error message. Allows no markup. | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         self.logger.info(f"Searching {self._format_path(path)}") |         self.print(f"[red]{escape(text)}") | ||||||
|  |  | ||||||
|     def not_searching(self, path: PathLike, reason: str) -> None: |     def unexpected_exception(self) -> None: | ||||||
|         """ |         """ | ||||||
|         A crawler does not search a particular object. |         Call this in an "except" clause to log an unexpected exception. | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         self.logger.info( |         t, v, tb = sys.exc_info() | ||||||
|             f"[dim]Not searching {self._format_path(path)} " |         if t is None or v is None or tb is None: | ||||||
|             f"([/dim]{reason}[dim]).[/dim]" |             # We're not currently handling an exception, so somebody probably | ||||||
|         ) |             # called this function where they shouldn't. | ||||||
|  |             self.error("Something unexpected happened") | ||||||
|  |             self.error_contd("") | ||||||
|  |             for line in traceback.format_stack(): | ||||||
|  |                 self.error_contd(line[:-1])  # Without the newline | ||||||
|  |             self.error_contd("") | ||||||
|  |         else: | ||||||
|  |             self.error("An unexpected exception occurred") | ||||||
|  |             self.error_contd("") | ||||||
|  |             self.error_contd(traceback.format_exc()) | ||||||
|  |  | ||||||
|     def summary(self, download_summary: DownloadSummary) -> None: |         # Our print function doesn't take types other than strings, but the | ||||||
|  |         # underlying rich.print function does. This call is a special case | ||||||
|  |         # anyways, and we're calling it internally, so this should be fine. | ||||||
|  |         self.print(Panel.fit(""" | ||||||
|  | Please copy your program output and send it to the PFERD maintainers, either | ||||||
|  | directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new | ||||||
|  |         """.strip()))  # type: ignore | ||||||
|  |  | ||||||
|  |     def explain_topic(self, text: str) -> None: | ||||||
|         """ |         """ | ||||||
|         Prints a download summary. |         Print a top-level explain text. Allows no markup. | ||||||
|         """ |         """ | ||||||
|         self.logger.info("") |  | ||||||
|         self.logger.info("[bold cyan]Download Summary[/bold cyan]") |  | ||||||
|         if not download_summary.has_updates(): |  | ||||||
|             self.logger.info("[bold dim]Nothing changed![/bold dim]") |  | ||||||
|             return |  | ||||||
|  |  | ||||||
|         for new_file in download_summary.new_files: |         if self.output_explain: | ||||||
|             self.new_file(new_file) |             self.print(f"[yellow]{escape(text)}") | ||||||
|         for modified_file in download_summary.modified_files: |  | ||||||
|             self.modified_file(modified_file) |  | ||||||
|         for deleted_files in download_summary.deleted_files: |  | ||||||
|             self.deleted_file(deleted_files) |  | ||||||
|  |  | ||||||
|     def starting_synchronizer( |     def explain(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print an indented explain text. Allows no markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_explain: | ||||||
|  |             self.print(f"  {escape(text)}") | ||||||
|  |  | ||||||
|  |     def status(self, style: str, action: str, text: str, suffix: str = "") -> None: | ||||||
|  |         """ | ||||||
|  |         Print a status update while crawling. Allows markup in the "style" | ||||||
|  |         argument which will be applied to the "action" string. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_status: | ||||||
|  |             action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||||
|  |             self.print(f"{style}{action}[/] {escape(text)} {suffix}") | ||||||
|  |  | ||||||
|  |     def not_deleted(self, style: str, action: str, text: str, suffix: str = "") -> None: | ||||||
|  |         """ | ||||||
|  |         Print a message for a local only file that wasn't | ||||||
|  |         deleted while crawling. Allows markup in the "style" | ||||||
|  |         argument which will be applied to the "action" string. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_status and self.output_not_deleted: | ||||||
|  |             action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||||
|  |             self.print(f"{style}{action}[/] {escape(text)} {suffix}") | ||||||
|  |  | ||||||
|  |     def report(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print a report after crawling. Allows markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_report: | ||||||
|  |             self.print(text) | ||||||
|  |  | ||||||
|  |     def report_not_deleted(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print a report for a local only file that wasn't deleted after crawling. Allows markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_report and self.output_not_deleted: | ||||||
|  |             self.print(text) | ||||||
|  |  | ||||||
|  |     @contextmanager | ||||||
|  |     def _bar( | ||||||
|             self, |             self, | ||||||
|             target_directory: PathLike, |             progress: Progress, | ||||||
|             synchronizer_name: str, |             description: str, | ||||||
|             subject: Optional[str] = None, |             total: Optional[float], | ||||||
|     ) -> None: |     ) -> Iterator[ProgressBar]: | ||||||
|  |         if total is None: | ||||||
|  |             # Indeterminate progress bar | ||||||
|  |             taskid = progress.add_task(description, start=False) | ||||||
|  |         else: | ||||||
|  |             taskid = progress.add_task(description, total=total) | ||||||
|  |         self._update_live() | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             yield ProgressBar(progress, taskid) | ||||||
|  |         finally: | ||||||
|  |             progress.remove_task(taskid) | ||||||
|  |             self._update_live() | ||||||
|  |  | ||||||
|  |     def crawl_bar( | ||||||
|  |             self, | ||||||
|  |             style: str, | ||||||
|  |             action: str, | ||||||
|  |             text: str, | ||||||
|  |             total: Optional[float] = None, | ||||||
|  |     ) -> AbstractContextManager[ProgressBar]: | ||||||
|         """ |         """ | ||||||
|         A special message marking that a synchronizer has been started. |         Allows markup in the "style" argument which will be applied to the | ||||||
|  |         "action" string. | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         subject_str = f"{subject} " if subject else "" |         action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||||
|         self.logger.info("") |         description = f"{style}{action}[/] {text}" | ||||||
|         self.logger.info(( |         return self._bar(self._crawl_progress, description, total) | ||||||
|             f"[bold cyan]Synchronizing " |  | ||||||
|             f"{subject_str}to {self._format_path(target_directory)} " |     def download_bar( | ||||||
|             f"using the {synchronizer_name} synchronizer.[/bold cyan]" |             self, | ||||||
|         )) |             style: str, | ||||||
|  |             action: str, | ||||||
|  |             text: str, | ||||||
|  |             total: Optional[float] = None, | ||||||
|  |     ) -> AbstractContextManager[ProgressBar]: | ||||||
|  |         """ | ||||||
|  |         Allows markup in the "style" argument which will be applied to the | ||||||
|  |         "action" string. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||||
|  |         description = f"{style}{action}[/] {text}" | ||||||
|  |         return self._bar(self._download_progress, description, total) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | log = Log() | ||||||
|   | |||||||
| @@ -1,149 +0,0 @@ | |||||||
| """A simple helper for managing downloaded files. |  | ||||||
|  |  | ||||||
| A organizer is bound to a single directory. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import filecmp |  | ||||||
| import logging |  | ||||||
| import os |  | ||||||
| import shutil |  | ||||||
| from pathlib import Path, PurePath |  | ||||||
| from typing import List, Optional, Set |  | ||||||
|  |  | ||||||
| from .download_summary import DownloadSummary |  | ||||||
| from .location import Location |  | ||||||
| from .logging import PrettyLogger |  | ||||||
| from .utils import prompt_yes_no |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FileAcceptException(Exception): |  | ||||||
|     """An exception while accepting a file.""" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class Organizer(Location): |  | ||||||
|     """A helper for managing downloaded files.""" |  | ||||||
|  |  | ||||||
|     def __init__(self, path: Path): |  | ||||||
|         """Create a new organizer for a given path.""" |  | ||||||
|         super().__init__(path) |  | ||||||
|         self._known_files: Set[Path] = set() |  | ||||||
|  |  | ||||||
|         # Keep the root dir |  | ||||||
|         self._known_files.add(path.resolve()) |  | ||||||
|  |  | ||||||
|         self.download_summary = DownloadSummary() |  | ||||||
|  |  | ||||||
|     def accept_file(self, src: Path, dst: PurePath) -> Optional[Path]: |  | ||||||
|         """ |  | ||||||
|         Move a file to this organizer and mark it. |  | ||||||
|  |  | ||||||
|         Returns the path the file was moved to, to allow the caller to adjust the metadata. |  | ||||||
|         As you might still need to adjust the metadata when the file was identical |  | ||||||
|         (e.g. update the timestamp), the path is also returned in this case. |  | ||||||
|         In all other cases (ignored, not overwritten, etc.) this method returns None. |  | ||||||
|         """ |  | ||||||
|         # Windows limits the path length to 260 for *some* historical reason |  | ||||||
|         # If you want longer paths, you will have to add the "\\?\" prefix in front of |  | ||||||
|         # your path... |  | ||||||
|         # See: |  | ||||||
|         # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation |  | ||||||
|         if os.name == 'nt': |  | ||||||
|             src_absolute = Path("\\\\?\\" + str(src.resolve())) |  | ||||||
|             dst_absolute = Path("\\\\?\\" + str(self.resolve(dst))) |  | ||||||
|         else: |  | ||||||
|             src_absolute = src.resolve() |  | ||||||
|             dst_absolute = self.resolve(dst) |  | ||||||
|  |  | ||||||
|         if not src_absolute.exists(): |  | ||||||
|             raise FileAcceptException("Source file does not exist") |  | ||||||
|  |  | ||||||
|         if not src_absolute.is_file(): |  | ||||||
|             raise FileAcceptException("Source is a directory") |  | ||||||
|  |  | ||||||
|         LOGGER.debug("Copying %s to %s", src_absolute, dst_absolute) |  | ||||||
|  |  | ||||||
|         if self._is_marked(dst): |  | ||||||
|             PRETTY.warning(f"File {str(dst_absolute)!r} was already written!") |  | ||||||
|             if not prompt_yes_no(f"Overwrite file?", default=False): |  | ||||||
|                 PRETTY.ignored_file(dst_absolute, "file was written previously") |  | ||||||
|                 return None |  | ||||||
|  |  | ||||||
|         # Destination file is directory |  | ||||||
|         if dst_absolute.exists() and dst_absolute.is_dir(): |  | ||||||
|             if prompt_yes_no(f"Overwrite folder {dst_absolute} with file?", default=False): |  | ||||||
|                 shutil.rmtree(dst_absolute) |  | ||||||
|             else: |  | ||||||
|                 PRETTY.warning(f"Could not add file {str(dst_absolute)!r}") |  | ||||||
|                 return None |  | ||||||
|  |  | ||||||
|         # Destination file exists |  | ||||||
|         if dst_absolute.exists() and dst_absolute.is_file(): |  | ||||||
|             if filecmp.cmp(str(src_absolute), str(dst_absolute), shallow=False): |  | ||||||
|                 # Bail out, nothing more to do |  | ||||||
|                 PRETTY.ignored_file(dst_absolute, "same file contents") |  | ||||||
|                 self.mark(dst) |  | ||||||
|                 return dst_absolute |  | ||||||
|  |  | ||||||
|             self.download_summary.add_modified_file(dst_absolute) |  | ||||||
|             PRETTY.modified_file(dst_absolute) |  | ||||||
|         else: |  | ||||||
|             self.download_summary.add_new_file(dst_absolute) |  | ||||||
|             PRETTY.new_file(dst_absolute) |  | ||||||
|  |  | ||||||
|         # Create parent dir if needed |  | ||||||
|         dst_parent_dir: Path = dst_absolute.parent |  | ||||||
|         dst_parent_dir.mkdir(exist_ok=True, parents=True) |  | ||||||
|  |  | ||||||
|         # Move file |  | ||||||
|         shutil.move(str(src_absolute), str(dst_absolute)) |  | ||||||
|  |  | ||||||
|         self.mark(dst) |  | ||||||
|  |  | ||||||
|         return dst_absolute |  | ||||||
|  |  | ||||||
|     def mark(self, path: PurePath) -> None: |  | ||||||
|         """Mark a file as used so it will not get cleaned up.""" |  | ||||||
|         absolute_path = self.resolve(path) |  | ||||||
|         self._known_files.add(absolute_path) |  | ||||||
|         LOGGER.debug("Tracked %s", absolute_path) |  | ||||||
|  |  | ||||||
|     def _is_marked(self, path: PurePath) -> bool: |  | ||||||
|         """ |  | ||||||
|         Checks whether a file is marked. |  | ||||||
|         """ |  | ||||||
|         absolute_path = self.resolve(path) |  | ||||||
|         return absolute_path in self._known_files |  | ||||||
|  |  | ||||||
|     def cleanup(self) -> None: |  | ||||||
|         """Remove all untracked files in the organizer's dir.""" |  | ||||||
|         LOGGER.debug("Deleting all untracked files...") |  | ||||||
|  |  | ||||||
|         self._cleanup(self.path) |  | ||||||
|  |  | ||||||
|     def _cleanup(self, start_dir: Path) -> None: |  | ||||||
|         if not start_dir.exists(): |  | ||||||
|             return |  | ||||||
|         paths: List[Path] = list(start_dir.iterdir()) |  | ||||||
|  |  | ||||||
|         # Recursively clean paths |  | ||||||
|         for path in paths: |  | ||||||
|             if path.is_dir(): |  | ||||||
|                 self._cleanup(path) |  | ||||||
|             else: |  | ||||||
|                 if path.resolve() not in self._known_files: |  | ||||||
|                     self._delete_file_if_confirmed(path) |  | ||||||
|  |  | ||||||
|         # Delete dir if it was empty and untracked |  | ||||||
|         dir_empty = len(list(start_dir.iterdir())) == 0 |  | ||||||
|         if start_dir.resolve() not in self._known_files and dir_empty: |  | ||||||
|             start_dir.rmdir() |  | ||||||
|  |  | ||||||
|     def _delete_file_if_confirmed(self, path: Path) -> None: |  | ||||||
|         prompt = f"Do you want to delete {path}" |  | ||||||
|  |  | ||||||
|         if prompt_yes_no(prompt, False): |  | ||||||
|             self.download_summary.add_deleted_file(path) |  | ||||||
|             path.unlink() |  | ||||||
							
								
								
									
										545
									
								
								PFERD/output_dir.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										545
									
								
								PFERD/output_dir.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,545 @@ | |||||||
|  | import filecmp | ||||||
|  | import json | ||||||
|  | import os | ||||||
|  | import random | ||||||
|  | import shutil | ||||||
|  | import string | ||||||
|  | from contextlib import contextmanager | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from datetime import datetime | ||||||
|  | from enum import Enum | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import BinaryIO, Iterator, Optional, Tuple | ||||||
|  |  | ||||||
|  | from .logging import log | ||||||
|  | from .report import Report, ReportLoadError | ||||||
|  | from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no | ||||||
|  |  | ||||||
|  | SUFFIX_CHARS = string.ascii_lowercase + string.digits | ||||||
|  | SUFFIX_LENGTH = 6 | ||||||
|  | TRIES = 5 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class OutputDirError(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class Redownload(Enum): | ||||||
|  |     NEVER = "never" | ||||||
|  |     NEVER_SMART = "never-smart" | ||||||
|  |     ALWAYS = "always" | ||||||
|  |     ALWAYS_SMART = "always-smart" | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     def from_string(string: str) -> "Redownload": | ||||||
|  |         try: | ||||||
|  |             return Redownload(string) | ||||||
|  |         except ValueError: | ||||||
|  |             raise ValueError("must be one of 'never', 'never-smart'," | ||||||
|  |                              " 'always', 'always-smart'") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class OnConflict(Enum): | ||||||
|  |     PROMPT = "prompt" | ||||||
|  |     LOCAL_FIRST = "local-first" | ||||||
|  |     REMOTE_FIRST = "remote-first" | ||||||
|  |     NO_DELETE = "no-delete" | ||||||
|  |     NO_DELETE_PROMPT_OVERWRITE = "no-delete-prompt-overwrite" | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     def from_string(string: str) -> "OnConflict": | ||||||
|  |         try: | ||||||
|  |             return OnConflict(string) | ||||||
|  |         except ValueError: | ||||||
|  |             raise ValueError("must be one of 'prompt', 'local-first'," | ||||||
|  |                              " 'remote-first', 'no-delete', 'no-delete-prompt-overwrite'") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @dataclass | ||||||
|  | class Heuristics: | ||||||
|  |     etag_differs: Optional[bool] | ||||||
|  |     mtime: Optional[datetime] | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class FileSink: | ||||||
|  |     def __init__(self, file: BinaryIO): | ||||||
|  |         self._file = file | ||||||
|  |         self._done = False | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def file(self) -> BinaryIO: | ||||||
|  |         return self._file | ||||||
|  |  | ||||||
|  |     def done(self) -> None: | ||||||
|  |         self._done = True | ||||||
|  |  | ||||||
|  |     def is_done(self) -> bool: | ||||||
|  |         return self._done | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @dataclass | ||||||
|  | class DownloadInfo: | ||||||
|  |     remote_path: PurePath | ||||||
|  |     path: PurePath | ||||||
|  |     local_path: Path | ||||||
|  |     tmp_path: Path | ||||||
|  |     heuristics: Heuristics | ||||||
|  |     on_conflict: OnConflict | ||||||
|  |     success: bool = False | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class FileSinkToken(ReusableAsyncContextManager[FileSink]): | ||||||
|  |     # Whenever this class is entered, it creates a new temporary file and | ||||||
|  |     # returns a corresponding FileSink. | ||||||
|  |     # | ||||||
|  |     # When it is exited again, the file is closed and information about the | ||||||
|  |     # download handed back to the OutputDirectory. | ||||||
|  |  | ||||||
|  |     def __init__( | ||||||
|  |             self, | ||||||
|  |             output_dir: "OutputDirectory", | ||||||
|  |             remote_path: PurePath, | ||||||
|  |             path: PurePath, | ||||||
|  |             local_path: Path, | ||||||
|  |             heuristics: Heuristics, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |     ): | ||||||
|  |         super().__init__() | ||||||
|  |  | ||||||
|  |         self._output_dir = output_dir | ||||||
|  |         self._remote_path = remote_path | ||||||
|  |         self._path = path | ||||||
|  |         self._local_path = local_path | ||||||
|  |         self._heuristics = heuristics | ||||||
|  |         self._on_conflict = on_conflict | ||||||
|  |  | ||||||
|  |     async def _on_aenter(self) -> FileSink: | ||||||
|  |         tmp_path, file = await self._output_dir._create_tmp_file(self._local_path) | ||||||
|  |         sink = FileSink(file) | ||||||
|  |  | ||||||
|  |         async def after_download() -> None: | ||||||
|  |             await self._output_dir._after_download(DownloadInfo( | ||||||
|  |                 self._remote_path, | ||||||
|  |                 self._path, | ||||||
|  |                 self._local_path, | ||||||
|  |                 tmp_path, | ||||||
|  |                 self._heuristics, | ||||||
|  |                 self._on_conflict, | ||||||
|  |                 sink.is_done(), | ||||||
|  |             )) | ||||||
|  |  | ||||||
|  |         self._stack.push_async_callback(after_download) | ||||||
|  |         self._stack.enter_context(file) | ||||||
|  |  | ||||||
|  |         return sink | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class OutputDirectory: | ||||||
|  |     REPORT_FILE = PurePath(".report") | ||||||
|  |  | ||||||
|  |     def __init__( | ||||||
|  |             self, | ||||||
|  |             root: Path, | ||||||
|  |             redownload: Redownload, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |     ): | ||||||
|  |         if os.name == "nt": | ||||||
|  |             # Windows limits the path length to 260 for some historical reason. | ||||||
|  |             # If you want longer paths, you will have to add the "\\?\" prefix | ||||||
|  |             # in front of your path. See: | ||||||
|  |             # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation | ||||||
|  |             self._root = Path("\\\\?\\" + str(root.absolute())) | ||||||
|  |         else: | ||||||
|  |             self._root = root | ||||||
|  |  | ||||||
|  |         self._redownload = redownload | ||||||
|  |         self._on_conflict = on_conflict | ||||||
|  |  | ||||||
|  |         self._report_path = self.resolve(self.REPORT_FILE) | ||||||
|  |         self._report = Report() | ||||||
|  |         self._prev_report: Optional[Report] = None | ||||||
|  |  | ||||||
|  |         self.register_reserved(self.REPORT_FILE) | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def report(self) -> Report: | ||||||
|  |         return self._report | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def prev_report(self) -> Optional[Report]: | ||||||
|  |         return self._prev_report | ||||||
|  |  | ||||||
|  |     def prepare(self) -> None: | ||||||
|  |         log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}") | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             self._root.mkdir(parents=True, exist_ok=True) | ||||||
|  |         except OSError: | ||||||
|  |             raise OutputDirError("Failed to create base directory") | ||||||
|  |  | ||||||
|  |     def register_reserved(self, path: PurePath) -> None: | ||||||
|  |         self._report.mark_reserved(path) | ||||||
|  |  | ||||||
|  |     def resolve(self, path: PurePath) -> Path: | ||||||
|  |         """ | ||||||
|  |         May throw an OutputDirError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if ".." in path.parts: | ||||||
|  |             raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}") | ||||||
|  |         if "." in path.parts: | ||||||
|  |             raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}") | ||||||
|  |  | ||||||
|  |         return self._root / path | ||||||
|  |  | ||||||
|  |     def _should_download( | ||||||
|  |             self, | ||||||
|  |             local_path: Path, | ||||||
|  |             heuristics: Heuristics, | ||||||
|  |             redownload: Redownload, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |     ) -> bool: | ||||||
|  |         if not local_path.exists(): | ||||||
|  |             log.explain("No corresponding file present locally") | ||||||
|  |             return True | ||||||
|  |  | ||||||
|  |         if on_conflict == OnConflict.LOCAL_FIRST: | ||||||
|  |             # Whatever is here, it will never be overwritten, so we don't need | ||||||
|  |             # to download the file. | ||||||
|  |             log.explain("Conflict resolution is 'local-first' and path exists") | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |         if not local_path.is_file(): | ||||||
|  |             # We know that there is *something* here that's not a file. | ||||||
|  |             log.explain("Non-file (probably a directory) present locally") | ||||||
|  |  | ||||||
|  |             # If on_conflict is LOCAL_FIRST or NO_DELETE, we know that it would | ||||||
|  |             # never be overwritten. It also doesn't have any relevant stats to | ||||||
|  |             # update. This means that we don't have to download the file | ||||||
|  |             # because we'd just always throw it away again. | ||||||
|  |             if on_conflict in {OnConflict.LOCAL_FIRST, OnConflict.NO_DELETE}: | ||||||
|  |                 log.explain(f"Conflict resolution is {on_conflict.value!r}") | ||||||
|  |                 return False | ||||||
|  |  | ||||||
|  |             return True | ||||||
|  |  | ||||||
|  |         log.explain(f"Redownload policy is {redownload.value}") | ||||||
|  |  | ||||||
|  |         if redownload == Redownload.NEVER: | ||||||
|  |             return False | ||||||
|  |         elif redownload == Redownload.ALWAYS: | ||||||
|  |             return True | ||||||
|  |  | ||||||
|  |         stat = local_path.stat() | ||||||
|  |  | ||||||
|  |         remote_newer = None | ||||||
|  |  | ||||||
|  |         # ETag should be a more reliable indicator than mtime, so we check it first | ||||||
|  |         if heuristics.etag_differs is not None: | ||||||
|  |             remote_newer = heuristics.etag_differs | ||||||
|  |             if remote_newer: | ||||||
|  |                 log.explain("Remote file's entity tag differs") | ||||||
|  |             else: | ||||||
|  |                 log.explain("Remote file's entity tag is the same") | ||||||
|  |  | ||||||
|  |         # Python on Windows crashes when faced with timestamps around the unix epoch | ||||||
|  |         if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): | ||||||
|  |             mtime = heuristics.mtime | ||||||
|  |             remote_newer = mtime.timestamp() > stat.st_mtime | ||||||
|  |             if remote_newer: | ||||||
|  |                 log.explain("Remote file seems to be newer") | ||||||
|  |             else: | ||||||
|  |                 log.explain("Remote file doesn't seem to be newer") | ||||||
|  |  | ||||||
|  |         if redownload == Redownload.NEVER_SMART: | ||||||
|  |             if remote_newer is None: | ||||||
|  |                 return False | ||||||
|  |             else: | ||||||
|  |                 return remote_newer | ||||||
|  |         elif redownload == Redownload.ALWAYS_SMART: | ||||||
|  |             if remote_newer is None: | ||||||
|  |                 return True | ||||||
|  |             else: | ||||||
|  |                 return remote_newer | ||||||
|  |  | ||||||
|  |         # This should never be reached | ||||||
|  |         raise ValueError(f"{redownload!r} is not a valid redownload policy") | ||||||
|  |  | ||||||
|  |     # The following conflict resolution functions all return False if the local | ||||||
|  |     # file(s) should be kept and True if they should be replaced by the remote | ||||||
|  |     # files. | ||||||
|  |  | ||||||
|  |     async def _conflict_lfrf( | ||||||
|  |             self, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |             path: PurePath, | ||||||
|  |     ) -> bool: | ||||||
|  |         if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: | ||||||
|  |             async with log.exclusive_output(): | ||||||
|  |                 prompt = f"Replace {fmt_path(path)} with remote file?" | ||||||
|  |                 return await prompt_yes_no(prompt, default=False) | ||||||
|  |         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||||
|  |             return False | ||||||
|  |         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||||
|  |             return True | ||||||
|  |         elif on_conflict == OnConflict.NO_DELETE: | ||||||
|  |             return True | ||||||
|  |  | ||||||
|  |         # This should never be reached | ||||||
|  |         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||||
|  |  | ||||||
|  |     async def _conflict_ldrf( | ||||||
|  |             self, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |             path: PurePath, | ||||||
|  |     ) -> bool: | ||||||
|  |         if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: | ||||||
|  |             async with log.exclusive_output(): | ||||||
|  |                 prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" | ||||||
|  |                 return await prompt_yes_no(prompt, default=False) | ||||||
|  |         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||||
|  |             return False | ||||||
|  |         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||||
|  |             return True | ||||||
|  |         elif on_conflict == OnConflict.NO_DELETE: | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |         # This should never be reached | ||||||
|  |         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||||
|  |  | ||||||
|  |     async def _conflict_lfrd( | ||||||
|  |             self, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |             path: PurePath, | ||||||
|  |             parent: PurePath, | ||||||
|  |     ) -> bool: | ||||||
|  |         if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: | ||||||
|  |             async with log.exclusive_output(): | ||||||
|  |                 prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" | ||||||
|  |                 return await prompt_yes_no(prompt, default=False) | ||||||
|  |         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||||
|  |             return False | ||||||
|  |         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||||
|  |             return True | ||||||
|  |         elif on_conflict == OnConflict.NO_DELETE: | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |         # This should never be reached | ||||||
|  |         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||||
|  |  | ||||||
|  |     async def _conflict_delete_lf( | ||||||
|  |             self, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |             path: PurePath, | ||||||
|  |     ) -> bool: | ||||||
|  |         if on_conflict == OnConflict.PROMPT: | ||||||
|  |             async with log.exclusive_output(): | ||||||
|  |                 prompt = f"Delete {fmt_path(path)}?" | ||||||
|  |                 return await prompt_yes_no(prompt, default=False) | ||||||
|  |         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||||
|  |             return False | ||||||
|  |         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||||
|  |             return True | ||||||
|  |         elif on_conflict in {OnConflict.NO_DELETE, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |         # This should never be reached | ||||||
|  |         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||||
|  |  | ||||||
|  |     def _tmp_path(self, base: Path, suffix_length: int) -> Path: | ||||||
|  |         prefix = "" if base.name.startswith(".") else "." | ||||||
|  |         suffix = "".join(random.choices(SUFFIX_CHARS, k=suffix_length)) | ||||||
|  |         name = f"{prefix}{base.name}.tmp.{suffix}" | ||||||
|  |         return base.parent / name | ||||||
|  |  | ||||||
|  |     async def _create_tmp_file( | ||||||
|  |             self, | ||||||
|  |             local_path: Path, | ||||||
|  |     ) -> Tuple[Path, BinaryIO]: | ||||||
|  |         """ | ||||||
|  |         May raise an OutputDirError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         # Create tmp file | ||||||
|  |         for attempt in range(TRIES): | ||||||
|  |             suffix_length = SUFFIX_LENGTH + 2 * attempt | ||||||
|  |             tmp_path = self._tmp_path(local_path, suffix_length) | ||||||
|  |             try: | ||||||
|  |                 return tmp_path, open(tmp_path, "xb") | ||||||
|  |             except FileExistsError: | ||||||
|  |                 pass  # Try again | ||||||
|  |  | ||||||
|  |         raise OutputDirError("Failed to create temporary file") | ||||||
|  |  | ||||||
|  |     def should_try_download( | ||||||
|  |         self, | ||||||
|  |         path: PurePath, | ||||||
|  |         *, | ||||||
|  |         etag_differs: Optional[bool] = None, | ||||||
|  |         mtime: Optional[datetime] = None, | ||||||
|  |         redownload: Optional[Redownload] = None, | ||||||
|  |         on_conflict: Optional[OnConflict] = None, | ||||||
|  |     ) -> bool: | ||||||
|  |         heuristics = Heuristics(etag_differs, mtime) | ||||||
|  |         redownload = self._redownload if redownload is None else redownload | ||||||
|  |         on_conflict = self._on_conflict if on_conflict is None else on_conflict | ||||||
|  |         local_path = self.resolve(path) | ||||||
|  |  | ||||||
|  |         return self._should_download(local_path, heuristics, redownload, on_conflict) | ||||||
|  |  | ||||||
|  |     async def download( | ||||||
|  |             self, | ||||||
|  |             remote_path: PurePath, | ||||||
|  |             path: PurePath, | ||||||
|  |             *, | ||||||
|  |             etag_differs: Optional[bool] = None, | ||||||
|  |             mtime: Optional[datetime] = None, | ||||||
|  |             redownload: Optional[Redownload] = None, | ||||||
|  |             on_conflict: Optional[OnConflict] = None, | ||||||
|  |     ) -> Optional[FileSinkToken]: | ||||||
|  |         """ | ||||||
|  |         May throw an OutputDirError, a MarkDuplicateError or a | ||||||
|  |         MarkConflictError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         heuristics = Heuristics(etag_differs, mtime) | ||||||
|  |         redownload = self._redownload if redownload is None else redownload | ||||||
|  |         on_conflict = self._on_conflict if on_conflict is None else on_conflict | ||||||
|  |         local_path = self.resolve(path) | ||||||
|  |  | ||||||
|  |         self._report.mark(path) | ||||||
|  |  | ||||||
|  |         if not self._should_download(local_path, heuristics, redownload, on_conflict): | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         # Detect and solve local-dir-remote-file conflict | ||||||
|  |         if local_path.is_dir(): | ||||||
|  |             log.explain("Conflict: There's a directory in place of the local file") | ||||||
|  |             if await self._conflict_ldrf(on_conflict, path): | ||||||
|  |                 log.explain("Result: Delete the obstructing directory") | ||||||
|  |                 shutil.rmtree(local_path) | ||||||
|  |             else: | ||||||
|  |                 log.explain("Result: Keep the obstructing directory") | ||||||
|  |                 return None | ||||||
|  |  | ||||||
|  |         # Detect and solve local-file-remote-dir conflict | ||||||
|  |         for parent in path.parents: | ||||||
|  |             local_parent = self.resolve(parent) | ||||||
|  |             if local_parent.exists() and not local_parent.is_dir(): | ||||||
|  |                 log.explain("Conflict: One of the local file's parents is a file") | ||||||
|  |                 if await self._conflict_lfrd(on_conflict, path, parent): | ||||||
|  |                     log.explain("Result: Delete the obstructing file") | ||||||
|  |                     local_parent.unlink() | ||||||
|  |                     break | ||||||
|  |                 else: | ||||||
|  |                     log.explain("Result: Keep the obstructing file") | ||||||
|  |                     return None | ||||||
|  |  | ||||||
|  |         # Ensure parent directory exists | ||||||
|  |         local_path.parent.mkdir(parents=True, exist_ok=True) | ||||||
|  |  | ||||||
|  |         return FileSinkToken(self, remote_path, path, local_path, heuristics, on_conflict) | ||||||
|  |  | ||||||
|  |     def _update_metadata(self, info: DownloadInfo) -> None: | ||||||
|  |         if mtime := info.heuristics.mtime: | ||||||
|  |             mtimestamp = mtime.timestamp() | ||||||
|  |             os.utime(info.local_path, times=(mtimestamp, mtimestamp)) | ||||||
|  |  | ||||||
|  |     @contextmanager | ||||||
|  |     def _ensure_deleted(self, path: Path) -> Iterator[None]: | ||||||
|  |         try: | ||||||
|  |             yield | ||||||
|  |         finally: | ||||||
|  |             path.unlink(missing_ok=True) | ||||||
|  |  | ||||||
|  |     async def _after_download(self, info: DownloadInfo) -> None: | ||||||
|  |         with self._ensure_deleted(info.tmp_path): | ||||||
|  |             log.status("[bold cyan]", "Downloaded", fmt_path(info.remote_path)) | ||||||
|  |             log.explain_topic(f"Processing downloaded file for {fmt_path(info.path)}") | ||||||
|  |  | ||||||
|  |             changed = False | ||||||
|  |  | ||||||
|  |             if not info.success: | ||||||
|  |                 log.explain("Download unsuccessful, aborting") | ||||||
|  |                 return | ||||||
|  |  | ||||||
|  |             # Solve conflicts arising from existing local file | ||||||
|  |             if info.local_path.exists(): | ||||||
|  |                 changed = True | ||||||
|  |  | ||||||
|  |                 if filecmp.cmp(info.local_path, info.tmp_path): | ||||||
|  |                     log.explain("Contents identical with existing file") | ||||||
|  |                     log.explain("Updating metadata of existing file") | ||||||
|  |                     self._update_metadata(info) | ||||||
|  |                     return | ||||||
|  |  | ||||||
|  |                 log.explain("Conflict: The local and remote versions differ") | ||||||
|  |                 if await self._conflict_lfrf(info.on_conflict, info.path): | ||||||
|  |                     log.explain("Result: Replacing local with remote version") | ||||||
|  |                 else: | ||||||
|  |                     log.explain("Result: Keeping local version") | ||||||
|  |                     return | ||||||
|  |  | ||||||
|  |             info.tmp_path.replace(info.local_path) | ||||||
|  |             log.explain("Updating file metadata") | ||||||
|  |             self._update_metadata(info) | ||||||
|  |  | ||||||
|  |             if changed: | ||||||
|  |                 log.status("[bold bright_yellow]", "Changed", fmt_path(info.path)) | ||||||
|  |                 self._report.change_file(info.path) | ||||||
|  |             else: | ||||||
|  |                 log.status("[bold bright_green]", "Added", fmt_path(info.path)) | ||||||
|  |                 self._report.add_file(info.path) | ||||||
|  |  | ||||||
|  |     async def cleanup(self) -> None: | ||||||
|  |         await self._cleanup_dir(self._root, PurePath(), delete_self=False) | ||||||
|  |  | ||||||
|  |     async def _cleanup(self, path: Path, pure: PurePath) -> None: | ||||||
|  |         if path.is_dir(): | ||||||
|  |             await self._cleanup_dir(path, pure) | ||||||
|  |         elif path.is_file(): | ||||||
|  |             await self._cleanup_file(path, pure) | ||||||
|  |  | ||||||
|  |     async def _cleanup_dir(self, path: Path, pure: PurePath, delete_self: bool = True) -> None: | ||||||
|  |         for child in sorted(path.iterdir()): | ||||||
|  |             pure_child = pure / child.name | ||||||
|  |             await self._cleanup(child, pure_child) | ||||||
|  |  | ||||||
|  |         if delete_self: | ||||||
|  |             try: | ||||||
|  |                 path.rmdir() | ||||||
|  |             except OSError: | ||||||
|  |                 pass | ||||||
|  |  | ||||||
|  |     async def _cleanup_file(self, path: Path, pure: PurePath) -> None: | ||||||
|  |         if self._report.is_marked(pure): | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         if await self._conflict_delete_lf(self._on_conflict, pure): | ||||||
|  |             try: | ||||||
|  |                 path.unlink() | ||||||
|  |                 log.status("[bold bright_magenta]", "Deleted", fmt_path(pure)) | ||||||
|  |                 self._report.delete_file(pure) | ||||||
|  |             except OSError: | ||||||
|  |                 pass | ||||||
|  |         else: | ||||||
|  |             log.not_deleted("[bold bright_magenta]", "Not deleted", fmt_path(pure)) | ||||||
|  |             self._report.not_delete_file(pure) | ||||||
|  |  | ||||||
|  |     def load_prev_report(self) -> None: | ||||||
|  |         log.explain_topic(f"Loading previous report from {fmt_real_path(self._report_path)}") | ||||||
|  |         try: | ||||||
|  |             self._prev_report = Report.load(self._report_path) | ||||||
|  |             log.explain("Loaded report successfully") | ||||||
|  |         except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e: | ||||||
|  |             log.explain("Failed to load report") | ||||||
|  |             log.explain(str(e)) | ||||||
|  |  | ||||||
|  |     def store_report(self) -> None: | ||||||
|  |         log.explain_topic(f"Storing report to {fmt_real_path(self._report_path)}") | ||||||
|  |         try: | ||||||
|  |             self._report.store(self._report_path) | ||||||
|  |             log.explain("Stored report successfully") | ||||||
|  |         except OSError as e: | ||||||
|  |             log.warn(f"Failed to save report to {fmt_real_path(self._report_path)}") | ||||||
|  |             log.warn_contd(str(e)) | ||||||
							
								
								
									
										544
									
								
								PFERD/pferd.py
									
									
									
									
									
								
							
							
						
						
									
										544
									
								
								PFERD/pferd.py
									
									
									
									
									
								
							| @@ -1,413 +1,199 @@ | |||||||
| """ | from pathlib import Path, PurePath | ||||||
| Convenience functions for using PFERD. | from typing import Dict, List, Optional | ||||||
| """ |  | ||||||
|  |  | ||||||
| import logging | from rich.markup import escape | ||||||
| from pathlib import Path |  | ||||||
| from typing import Callable, List, Optional, Union |  | ||||||
|  |  | ||||||
| from .cookie_jar import CookieJar | from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection | ||||||
| from .diva import (DivaDownloader, DivaDownloadStrategy, DivaPlaylistCrawler, | from .config import Config, ConfigOptionError | ||||||
|                    diva_download_new) | from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler | ||||||
| from .download_summary import DownloadSummary | from .logging import log | ||||||
| from .errors import FatalException, swallow_and_print_errors | from .utils import fmt_path | ||||||
| from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter, |  | ||||||
|                     IliasDownloader, IliasDownloadInfo, IliasDownloadStrategy, |  | ||||||
|                     KitShibbolethAuthenticator, download_modified_or_new) |  | ||||||
| from .ipd import (IpdCrawler, IpdDownloader, IpdDownloadInfo, |  | ||||||
|                   IpdDownloadStrategy, ipd_download_new_or_modified) |  | ||||||
| from .location import Location |  | ||||||
| from .logging import PrettyLogger, enable_logging |  | ||||||
| from .organizer import Organizer |  | ||||||
| from .tmp_dir import TmpDir |  | ||||||
| from .transform import TF, Transform, apply_transform |  | ||||||
| from .utils import PathLike, to_path |  | ||||||
|  |  | ||||||
| # TODO save known-good cookies as soon as possible |  | ||||||
|  |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) | class PferdLoadError(Exception): | ||||||
| PRETTY = PrettyLogger(LOGGER) |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
| class Pferd(Location): | class Pferd: | ||||||
|     # pylint: disable=too-many-arguments |     def __init__(self, config: Config, cli_crawlers: Optional[List[str]], cli_skips: Optional[List[str]]): | ||||||
|         """ |         """ | ||||||
|     The main entrypoint in your Pferd usage: This class combines a number of |         May throw PferdLoadError. | ||||||
|     useful shortcuts for running synchronizers in a single interface. |  | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|     def __init__( |         self._config = config | ||||||
|             self, |         self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers, cli_skips) | ||||||
|             base_dir: Path, |  | ||||||
|             tmp_dir: Path = Path(".tmp"), |  | ||||||
|             test_run: bool = False |  | ||||||
|     ): |  | ||||||
|         super().__init__(Path(base_dir)) |  | ||||||
|  |  | ||||||
|         self._download_summary = DownloadSummary() |         self._authenticators: Dict[str, Authenticator] = {} | ||||||
|         self._tmp_dir = TmpDir(self.resolve(tmp_dir)) |         self._crawlers: Dict[str, Crawler] = {} | ||||||
|         self._test_run = test_run |  | ||||||
|  |  | ||||||
|     @staticmethod |     def _find_config_crawlers(self, config: Config) -> List[str]: | ||||||
|     def enable_logging() -> None: |         crawl_sections = [] | ||||||
|         """ |  | ||||||
|         Enable and configure logging via the logging module. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         enable_logging() |         for name, section in config.crawl_sections(): | ||||||
|  |             if CrawlerSection(section).skip(): | ||||||
|     @staticmethod |                 log.explain(f"Skipping {name!r}") | ||||||
|     def _print_transformables(transformables: List[TF]) -> None: |  | ||||||
|         LOGGER.info("") |  | ||||||
|         LOGGER.info("Results of the test run:") |  | ||||||
|         for transformable in transformables: |  | ||||||
|             LOGGER.info(transformable.path) |  | ||||||
|  |  | ||||||
|     def _ilias( |  | ||||||
|             self, |  | ||||||
|             target: PathLike, |  | ||||||
|             base_url: str, |  | ||||||
|             crawl_function: Callable[[IliasCrawler], List[IliasDownloadInfo]], |  | ||||||
|             authenticator: IliasAuthenticator, |  | ||||||
|             cookies: Optional[PathLike], |  | ||||||
|             dir_filter: IliasDirectoryFilter, |  | ||||||
|             transform: Transform, |  | ||||||
|             download_strategy: IliasDownloadStrategy, |  | ||||||
|             timeout: int, |  | ||||||
|             clean: bool = True, |  | ||||||
|     ) -> Organizer: |  | ||||||
|         # pylint: disable=too-many-locals |  | ||||||
|         cookie_jar = CookieJar(to_path(cookies) if cookies else None) |  | ||||||
|         session = cookie_jar.create_session() |  | ||||||
|         tmp_dir = self._tmp_dir.new_subdir() |  | ||||||
|         organizer = Organizer(self.resolve(to_path(target))) |  | ||||||
|  |  | ||||||
|         crawler = IliasCrawler(base_url, session, authenticator, dir_filter) |  | ||||||
|         downloader = IliasDownloader(tmp_dir, organizer, session, |  | ||||||
|                                      authenticator, download_strategy, timeout) |  | ||||||
|  |  | ||||||
|         cookie_jar.load_cookies() |  | ||||||
|         info = crawl_function(crawler) |  | ||||||
|         cookie_jar.save_cookies() |  | ||||||
|  |  | ||||||
|         transformed = apply_transform(transform, info) |  | ||||||
|         if self._test_run: |  | ||||||
|             self._print_transformables(transformed) |  | ||||||
|             return organizer |  | ||||||
|  |  | ||||||
|         downloader.download_all(transformed) |  | ||||||
|         cookie_jar.save_cookies() |  | ||||||
|  |  | ||||||
|         if clean: |  | ||||||
|             organizer.cleanup() |  | ||||||
|  |  | ||||||
|         return organizer |  | ||||||
|  |  | ||||||
|     @swallow_and_print_errors |  | ||||||
|     def ilias_kit( |  | ||||||
|             self, |  | ||||||
|             target: PathLike, |  | ||||||
|             course_id: str, |  | ||||||
|             dir_filter: IliasDirectoryFilter = lambda x, y: True, |  | ||||||
|             transform: Transform = lambda x: x, |  | ||||||
|             cookies: Optional[PathLike] = None, |  | ||||||
|             username: Optional[str] = None, |  | ||||||
|             password: Optional[str] = None, |  | ||||||
|             download_strategy: IliasDownloadStrategy = download_modified_or_new, |  | ||||||
|             clean: bool = True, |  | ||||||
|             timeout: int = 5, |  | ||||||
|     ) -> Organizer: |  | ||||||
|         """ |  | ||||||
|         Synchronizes a folder with the ILIAS instance of the KIT. |  | ||||||
|  |  | ||||||
|         Arguments: |  | ||||||
|             target {Path} -- the target path to write the data to |  | ||||||
|             course_id {str} -- the id of the main course page (found in the URL after ref_id |  | ||||||
|                 when opening the course homepage) |  | ||||||
|  |  | ||||||
|         Keyword Arguments: |  | ||||||
|             dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the |  | ||||||
|                 crawler level, these directories and all of their content is skipped. |  | ||||||
|                 (default: {lambdax:True}) |  | ||||||
|             transform {Transform} -- A transformation function for the output paths. Return None |  | ||||||
|                 to ignore a file. (default: {lambdax:x}) |  | ||||||
|             cookies {Optional[Path]} -- The path to store and load cookies from. |  | ||||||
|                 (default: {None}) |  | ||||||
|             username {Optional[str]} -- The SCC username. If none is given, it will prompt |  | ||||||
|                 the user. (default: {None}) |  | ||||||
|             password {Optional[str]} -- The SCC password. If none is given, it will prompt |  | ||||||
|                 the user. (default: {None}) |  | ||||||
|             download_strategy {DownloadStrategy} -- A function to determine which files need to |  | ||||||
|                 be downloaded. Can save bandwidth and reduce the number of requests. |  | ||||||
|                 (default: {download_modified_or_new}) |  | ||||||
|             clean {bool} -- Whether to clean up when the method finishes. |  | ||||||
|             timeout {int} -- The download timeout for opencast videos. Sadly needed due to a |  | ||||||
|                 requests bug. |  | ||||||
|         """ |  | ||||||
|         # This authenticator only works with the KIT ilias instance. |  | ||||||
|         authenticator = KitShibbolethAuthenticator(username=username, password=password) |  | ||||||
|         PRETTY.starting_synchronizer(target, "ILIAS", course_id) |  | ||||||
|  |  | ||||||
|         organizer = self._ilias( |  | ||||||
|             target=target, |  | ||||||
|             base_url="https://ilias.studium.kit.edu/", |  | ||||||
|             crawl_function=lambda crawler: crawler.crawl_course(course_id), |  | ||||||
|             authenticator=authenticator, |  | ||||||
|             cookies=cookies, |  | ||||||
|             dir_filter=dir_filter, |  | ||||||
|             transform=transform, |  | ||||||
|             download_strategy=download_strategy, |  | ||||||
|             clean=clean, |  | ||||||
|             timeout=timeout |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         self._download_summary.merge(organizer.download_summary) |  | ||||||
|  |  | ||||||
|         return organizer |  | ||||||
|  |  | ||||||
|     def print_summary(self) -> None: |  | ||||||
|         """ |  | ||||||
|         Prints the accumulated download summary. |  | ||||||
|         """ |  | ||||||
|         PRETTY.summary(self._download_summary) |  | ||||||
|  |  | ||||||
|     @swallow_and_print_errors |  | ||||||
|     def ilias_kit_personal_desktop( |  | ||||||
|             self, |  | ||||||
|             target: PathLike, |  | ||||||
|             dir_filter: IliasDirectoryFilter = lambda x, y: True, |  | ||||||
|             transform: Transform = lambda x: x, |  | ||||||
|             cookies: Optional[PathLike] = None, |  | ||||||
|             username: Optional[str] = None, |  | ||||||
|             password: Optional[str] = None, |  | ||||||
|             download_strategy: IliasDownloadStrategy = download_modified_or_new, |  | ||||||
|             clean: bool = True, |  | ||||||
|             timeout: int = 5, |  | ||||||
|     ) -> Organizer: |  | ||||||
|         """ |  | ||||||
|         Synchronizes a folder with the ILIAS instance of the KIT. This method will crawl the ILIAS |  | ||||||
|         "personal desktop" instead of a single course. |  | ||||||
|  |  | ||||||
|         Arguments: |  | ||||||
|             target {Path} -- the target path to write the data to |  | ||||||
|  |  | ||||||
|         Keyword Arguments: |  | ||||||
|             dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the |  | ||||||
|                 crawler level, these directories and all of their content is skipped. |  | ||||||
|                 (default: {lambdax:True}) |  | ||||||
|             transform {Transform} -- A transformation function for the output paths. Return None |  | ||||||
|                 to ignore a file. (default: {lambdax:x}) |  | ||||||
|             cookies {Optional[Path]} -- The path to store and load cookies from. |  | ||||||
|                 (default: {None}) |  | ||||||
|             username {Optional[str]} -- The SCC username. If none is given, it will prompt |  | ||||||
|                 the user. (default: {None}) |  | ||||||
|             password {Optional[str]} -- The SCC password. If none is given, it will prompt |  | ||||||
|                 the user. (default: {None}) |  | ||||||
|             download_strategy {DownloadStrategy} -- A function to determine which files need to |  | ||||||
|                 be downloaded. Can save bandwidth and reduce the number of requests. |  | ||||||
|                 (default: {download_modified_or_new}) |  | ||||||
|             clean {bool} -- Whether to clean up when the method finishes. |  | ||||||
|             timeout {int} -- The download timeout for opencast videos. Sadly needed due to a |  | ||||||
|                 requests bug. |  | ||||||
|         """ |  | ||||||
|         # This authenticator only works with the KIT ilias instance. |  | ||||||
|         authenticator = KitShibbolethAuthenticator(username=username, password=password) |  | ||||||
|         PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop") |  | ||||||
|  |  | ||||||
|         organizer = self._ilias( |  | ||||||
|             target=target, |  | ||||||
|             base_url="https://ilias.studium.kit.edu/", |  | ||||||
|             crawl_function=lambda crawler: crawler.crawl_personal_desktop(), |  | ||||||
|             authenticator=authenticator, |  | ||||||
|             cookies=cookies, |  | ||||||
|             dir_filter=dir_filter, |  | ||||||
|             transform=transform, |  | ||||||
|             download_strategy=download_strategy, |  | ||||||
|             clean=clean, |  | ||||||
|             timeout=timeout |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         self._download_summary.merge(organizer.download_summary) |  | ||||||
|  |  | ||||||
|         return organizer |  | ||||||
|  |  | ||||||
|     @swallow_and_print_errors |  | ||||||
|     def ilias_kit_folder( |  | ||||||
|             self, |  | ||||||
|             target: PathLike, |  | ||||||
|             full_url: str, |  | ||||||
|             dir_filter: IliasDirectoryFilter = lambda x, y: True, |  | ||||||
|             transform: Transform = lambda x: x, |  | ||||||
|             cookies: Optional[PathLike] = None, |  | ||||||
|             username: Optional[str] = None, |  | ||||||
|             password: Optional[str] = None, |  | ||||||
|             download_strategy: IliasDownloadStrategy = download_modified_or_new, |  | ||||||
|             clean: bool = True, |  | ||||||
|             timeout: int = 5, |  | ||||||
|     ) -> Organizer: |  | ||||||
|         """ |  | ||||||
|         Synchronizes a folder with a given folder on the ILIAS instance of the KIT. |  | ||||||
|  |  | ||||||
|         Arguments: |  | ||||||
|             target {Path}  -- the target path to write the data to |  | ||||||
|             full_url {str} -- the full url of the folder/videos/course to crawl |  | ||||||
|  |  | ||||||
|         Keyword Arguments: |  | ||||||
|             dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the |  | ||||||
|                 crawler level, these directories and all of their content is skipped. |  | ||||||
|                 (default: {lambdax:True}) |  | ||||||
|             transform {Transform} -- A transformation function for the output paths. Return None |  | ||||||
|                 to ignore a file. (default: {lambdax:x}) |  | ||||||
|             cookies {Optional[Path]} -- The path to store and load cookies from. |  | ||||||
|                 (default: {None}) |  | ||||||
|             username {Optional[str]} -- The SCC username. If none is given, it will prompt |  | ||||||
|                 the user. (default: {None}) |  | ||||||
|             password {Optional[str]} -- The SCC password. If none is given, it will prompt |  | ||||||
|                 the user. (default: {None}) |  | ||||||
|             download_strategy {DownloadStrategy} -- A function to determine which files need to |  | ||||||
|                 be downloaded. Can save bandwidth and reduce the number of requests. |  | ||||||
|                 (default: {download_modified_or_new}) |  | ||||||
|             clean {bool} -- Whether to clean up when the method finishes. |  | ||||||
|             timeout {int} -- The download timeout for opencast videos. Sadly needed due to a |  | ||||||
|                 requests bug. |  | ||||||
|         """ |  | ||||||
|         # This authenticator only works with the KIT ilias instance. |  | ||||||
|         authenticator = KitShibbolethAuthenticator(username=username, password=password) |  | ||||||
|         PRETTY.starting_synchronizer(target, "ILIAS", "An ILIAS element by url") |  | ||||||
|  |  | ||||||
|         if not full_url.startswith("https://ilias.studium.kit.edu"): |  | ||||||
|             raise FatalException("Not a valid KIT ILIAS URL") |  | ||||||
|  |  | ||||||
|         organizer = self._ilias( |  | ||||||
|             target=target, |  | ||||||
|             base_url="https://ilias.studium.kit.edu/", |  | ||||||
|             crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url), |  | ||||||
|             authenticator=authenticator, |  | ||||||
|             cookies=cookies, |  | ||||||
|             dir_filter=dir_filter, |  | ||||||
|             transform=transform, |  | ||||||
|             download_strategy=download_strategy, |  | ||||||
|             clean=clean, |  | ||||||
|             timeout=timeout |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         self._download_summary.merge(organizer.download_summary) |  | ||||||
|  |  | ||||||
|         return organizer |  | ||||||
|  |  | ||||||
|     @swallow_and_print_errors |  | ||||||
|     def ipd_kit( |  | ||||||
|             self, |  | ||||||
|             target: Union[PathLike, Organizer], |  | ||||||
|             url: str, |  | ||||||
|             transform: Transform = lambda x: x, |  | ||||||
|             download_strategy: IpdDownloadStrategy = ipd_download_new_or_modified, |  | ||||||
|             clean: bool = True |  | ||||||
|     ) -> Organizer: |  | ||||||
|         """ |  | ||||||
|         Synchronizes a folder with a DIVA playlist. |  | ||||||
|  |  | ||||||
|         Arguments: |  | ||||||
|             target {Union[PathLike, Organizer]} -- The organizer / target folder to use. |  | ||||||
|             url {str} -- the url to the page |  | ||||||
|  |  | ||||||
|         Keyword Arguments: |  | ||||||
|             transform {Transform} -- A transformation function for the output paths. Return None |  | ||||||
|                 to ignore a file. (default: {lambdax:x}) |  | ||||||
|             download_strategy {DivaDownloadStrategy} -- A function to determine which files need to |  | ||||||
|                 be downloaded. Can save bandwidth and reduce the number of requests. |  | ||||||
|                 (default: {diva_download_new}) |  | ||||||
|             clean {bool} -- Whether to clean up when the method finishes. |  | ||||||
|         """ |  | ||||||
|         tmp_dir = self._tmp_dir.new_subdir() |  | ||||||
|  |  | ||||||
|         if target is None: |  | ||||||
|             PRETTY.starting_synchronizer("None", "IPD", url) |  | ||||||
|             raise FatalException("Got 'None' as target directory, aborting") |  | ||||||
|  |  | ||||||
|         if isinstance(target, Organizer): |  | ||||||
|             organizer = target |  | ||||||
|             else: |             else: | ||||||
|             organizer = Organizer(self.resolve(to_path(target))) |                 crawl_sections.append(name) | ||||||
|  |  | ||||||
|         PRETTY.starting_synchronizer(organizer.path, "IPD", url) |         return crawl_sections | ||||||
|  |  | ||||||
|         elements: List[IpdDownloadInfo] = IpdCrawler(url).crawl() |     def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]: | ||||||
|         transformed = apply_transform(transform, elements) |         if len(cli_crawlers) != len(set(cli_crawlers)): | ||||||
|  |             raise PferdLoadError("Some crawlers were selected multiple times") | ||||||
|  |  | ||||||
|         if self._test_run: |         crawl_sections = [name for name, _ in config.crawl_sections()] | ||||||
|             self._print_transformables(transformed) |  | ||||||
|             return organizer |  | ||||||
|  |  | ||||||
|         downloader = IpdDownloader(tmp_dir=tmp_dir, organizer=organizer, strategy=download_strategy) |         crawlers_to_run = []  # With crawl: prefix | ||||||
|         downloader.download_all(transformed) |         unknown_names = []  # Without crawl: prefix | ||||||
|  |  | ||||||
|         if clean: |         for name in cli_crawlers: | ||||||
|             organizer.cleanup() |             section_name = f"crawl:{name}" | ||||||
|  |             if section_name in crawl_sections: | ||||||
|  |                 log.explain(f"Crawler section named {section_name!r} exists") | ||||||
|  |                 crawlers_to_run.append(section_name) | ||||||
|  |             else: | ||||||
|  |                 log.explain(f"There's no crawler section named {section_name!r}") | ||||||
|  |                 unknown_names.append(name) | ||||||
|  |  | ||||||
|         self._download_summary.merge(organizer.download_summary) |         if unknown_names: | ||||||
|  |             if len(unknown_names) == 1: | ||||||
|  |                 [name] = unknown_names | ||||||
|  |                 raise PferdLoadError(f"There is no crawler named {name!r}") | ||||||
|  |             else: | ||||||
|  |                 names_str = ", ".join(repr(name) for name in unknown_names) | ||||||
|  |                 raise PferdLoadError(f"There are no crawlers named {names_str}") | ||||||
|  |  | ||||||
|         return organizer |         return crawlers_to_run | ||||||
|  |  | ||||||
|     @swallow_and_print_errors |     def _find_crawlers_to_run( | ||||||
|     def diva_kit( |  | ||||||
|             self, |             self, | ||||||
|             target: Union[PathLike, Organizer], |             config: Config, | ||||||
|             playlist_location: str, |             cli_crawlers: Optional[List[str]], | ||||||
|             transform: Transform = lambda x: x, |             cli_skips: Optional[List[str]], | ||||||
|             download_strategy: DivaDownloadStrategy = diva_download_new, |     ) -> List[str]: | ||||||
|             clean: bool = True |         log.explain_topic("Deciding which crawlers to run") | ||||||
|     ) -> Organizer: |  | ||||||
|         """ |  | ||||||
|         Synchronizes a folder with a DIVA playlist. |  | ||||||
|  |  | ||||||
|         Arguments: |         crawlers: List[str] | ||||||
|             organizer {Organizer} -- The organizer to use. |         if cli_crawlers is None: | ||||||
|             playlist_location {str} -- the playlist id or the playlist URL |             log.explain("No crawlers specified on CLI") | ||||||
|               in the format 'https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271' |             log.explain("Running crawlers specified in config") | ||||||
|  |             crawlers = self._find_config_crawlers(config) | ||||||
|         Keyword Arguments: |  | ||||||
|             transform {Transform} -- A transformation function for the output paths. Return None |  | ||||||
|                 to ignore a file. (default: {lambdax:x}) |  | ||||||
|             download_strategy {DivaDownloadStrategy} -- A function to determine which files need to |  | ||||||
|                 be downloaded. Can save bandwidth and reduce the number of requests. |  | ||||||
|                 (default: {diva_download_new}) |  | ||||||
|             clean {bool} -- Whether to clean up when the method finishes. |  | ||||||
|         """ |  | ||||||
|         tmp_dir = self._tmp_dir.new_subdir() |  | ||||||
|  |  | ||||||
|         if playlist_location.startswith("http"): |  | ||||||
|             playlist_id = DivaPlaylistCrawler.fetch_id(playlist_link=playlist_location) |  | ||||||
|         else: |         else: | ||||||
|             playlist_id = playlist_location |             log.explain("Crawlers specified on CLI") | ||||||
|  |             crawlers = self._find_cli_crawlers(config, cli_crawlers) | ||||||
|  |  | ||||||
|         if target is None: |         skips = {f"crawl:{name}" for name in cli_skips} if cli_skips else set() | ||||||
|             PRETTY.starting_synchronizer("None", "DIVA", playlist_id) |         for crawler in crawlers: | ||||||
|             raise FatalException("Got 'None' as target directory, aborting") |             if crawler in skips: | ||||||
|  |                 log.explain(f"Skipping crawler {crawler!r}") | ||||||
|  |         crawlers = [crawler for crawler in crawlers if crawler not in skips] | ||||||
|  |  | ||||||
|         if isinstance(target, Organizer): |         return crawlers | ||||||
|             organizer = target |  | ||||||
|         else: |  | ||||||
|             organizer = Organizer(self.resolve(to_path(target))) |  | ||||||
|  |  | ||||||
|         PRETTY.starting_synchronizer(organizer.path, "DIVA", playlist_id) |     def _load_authenticators(self) -> None: | ||||||
|  |         for name, section in self._config.auth_sections(): | ||||||
|  |             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") | ||||||
|  |  | ||||||
|         crawler = DivaPlaylistCrawler(playlist_id) |             auth_type = AuthSection(section).type() | ||||||
|         downloader = DivaDownloader(tmp_dir, organizer, download_strategy) |             authenticator_constructor = AUTHENTICATORS.get(auth_type) | ||||||
|  |             if authenticator_constructor is None: | ||||||
|  |                 raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}") | ||||||
|  |  | ||||||
|         info = crawler.crawl() |             authenticator = authenticator_constructor(name, section, self._config) | ||||||
|  |             self._authenticators[name] = authenticator | ||||||
|  |  | ||||||
|         transformed = apply_transform(transform, info) |     def _load_crawlers(self) -> None: | ||||||
|         if self._test_run: |         # Cookie sharing | ||||||
|             self._print_transformables(transformed) |         kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {} | ||||||
|             return organizer |  | ||||||
|  |  | ||||||
|         downloader.download_all(transformed) |         for name, section in self._config.crawl_sections(): | ||||||
|  |             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") | ||||||
|  |  | ||||||
|         if clean: |             crawl_type = CrawlerSection(section).type() | ||||||
|             organizer.cleanup() |             crawler_constructor = CRAWLERS.get(crawl_type) | ||||||
|  |             if crawler_constructor is None: | ||||||
|  |                 raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}") | ||||||
|  |  | ||||||
|         self._download_summary.merge(organizer.download_summary) |             crawler = crawler_constructor(name, section, self._config, self._authenticators) | ||||||
|  |             self._crawlers[name] = crawler | ||||||
|  |  | ||||||
|         return organizer |             if self._config.default_section.share_cookies(): | ||||||
|  |                 if isinstance(crawler, KitIliasWebCrawler): | ||||||
|  |                     crawler.share_cookies(kit_ilias_web_paths) | ||||||
|  |  | ||||||
|  |     def debug_transforms(self) -> None: | ||||||
|  |         for name in self._crawlers_to_run: | ||||||
|  |             crawler = self._crawlers[name] | ||||||
|  |             log.print("") | ||||||
|  |             log.print(f"[bold bright_cyan]Debugging transforms[/] for {escape(name)}") | ||||||
|  |             crawler.debug_transforms() | ||||||
|  |  | ||||||
|  |     async def run(self, debug_transforms: bool) -> None: | ||||||
|  |         """ | ||||||
|  |         May throw ConfigOptionError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         # These two functions must run inside the same event loop as the | ||||||
|  |         # crawlers, so that any new objects (like Conditions or Futures) can | ||||||
|  |         # obtain the correct event loop. | ||||||
|  |         self._load_authenticators() | ||||||
|  |         self._load_crawlers() | ||||||
|  |  | ||||||
|  |         if debug_transforms: | ||||||
|  |             log.output_explain = True | ||||||
|  |             log.output_report = False | ||||||
|  |             self.debug_transforms() | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         log.print("") | ||||||
|  |  | ||||||
|  |         for name in self._crawlers_to_run: | ||||||
|  |             crawler = self._crawlers[name] | ||||||
|  |  | ||||||
|  |             log.print(f"[bold bright_cyan]Running[/] {escape(name)}") | ||||||
|  |  | ||||||
|  |             try: | ||||||
|  |                 await crawler.run() | ||||||
|  |             except (CrawlError, AuthError) as e: | ||||||
|  |                 log.error(str(e)) | ||||||
|  |             except Exception: | ||||||
|  |                 log.unexpected_exception() | ||||||
|  |  | ||||||
|  |     def print_report(self) -> None: | ||||||
|  |         for name in self._crawlers_to_run: | ||||||
|  |             crawler = self._crawlers.get(name) | ||||||
|  |             if crawler is None: | ||||||
|  |                 continue  # Crawler failed to load | ||||||
|  |  | ||||||
|  |             log.report("") | ||||||
|  |             log.report(f"[bold bright_cyan]Report[/] for {escape(name)}") | ||||||
|  |  | ||||||
|  |             def fmt_path_link(relative_path: PurePath) -> str: | ||||||
|  |                 # We need to URL-encode the path because it might contain spaces or special characters | ||||||
|  |                 link = crawler.output_dir.resolve(relative_path).absolute().as_uri() | ||||||
|  |                 return f"[link={link}]{fmt_path(relative_path)}[/link]" | ||||||
|  |  | ||||||
|  |             something_changed = False | ||||||
|  |             for path in sorted(crawler.report.added_files): | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report(f"  [bold bright_green]Added[/] {fmt_path_link(path)}") | ||||||
|  |             for path in sorted(crawler.report.changed_files): | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report(f"  [bold bright_yellow]Changed[/] {fmt_path_link(path)}") | ||||||
|  |             for path in sorted(crawler.report.deleted_files): | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report(f"  [bold bright_magenta]Deleted[/] {fmt_path(path)}") | ||||||
|  |             for path in sorted(crawler.report.not_deleted_files): | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report_not_deleted(f"  [bold bright_magenta]Not deleted[/] {fmt_path_link(path)}") | ||||||
|  |  | ||||||
|  |             for warning in crawler.report.encountered_warnings: | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report(f"  [bold bright_red]Warning[/] {warning}") | ||||||
|  |  | ||||||
|  |             for error in crawler.report.encountered_errors: | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report(f"  [bold bright_red]Error[/] {error}") | ||||||
|  |  | ||||||
|  |             if not something_changed: | ||||||
|  |                 log.report("  Nothing changed") | ||||||
|   | |||||||
| @@ -1,111 +0,0 @@ | |||||||
| """ |  | ||||||
| A small progress bar implementation. |  | ||||||
| """ |  | ||||||
| import sys |  | ||||||
| from dataclasses import dataclass |  | ||||||
| from types import TracebackType |  | ||||||
| from typing import Optional, Type |  | ||||||
|  |  | ||||||
| import requests |  | ||||||
| from rich.console import Console |  | ||||||
| from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID, |  | ||||||
|                            TextColumn, TimeRemainingColumn, |  | ||||||
|                            TransferSpeedColumn) |  | ||||||
|  |  | ||||||
# The one progress display of this module. ProgressContextManager starts it,
# adds and removes its tasks, and stops it again. Output goes to stdout.
_progress: Progress = Progress(
    # Columns, from left to right
    TextColumn("[bold blue]{task.fields[name]}", justify="right"),
    BarColumn(bar_width=None),
    "[progress.percentage]{task.percentage:>3.1f}%",
    "•",
    DownloadColumn(),
    "•",
    TransferSpeedColumn(),
    "•",
    TimeRemainingColumn(),
    # Display options
    transient=True,
    console=Console(file=sys.stdout),
)
|  |  | ||||||
|  |  | ||||||
def size_from_headers(response: requests.Response) -> Optional[int]:
    """
    Return the size of the download based on the response headers.

    Arguments:
        response {requests.Response} -- the response

    Returns:
        Optional[int] -- the size taken from the "Content-Length" header, or None if
            the header is missing or does not hold a non-negative integer
    """
    raw_length = response.headers.get("Content-Length")
    if raw_length is None:
        return None

    # A malformed header must not abort the caller: the size is optional anyway.
    try:
        size = int(raw_length)
    except ValueError:
        return None

    # A negative length is not a usable size either.
    return size if size >= 0 else None
|  |  | ||||||
|  |  | ||||||
@dataclass
class ProgressSettings:
    """
    Options describing the progress bar of a single download.
    """
    # Text shown for the task; also passed as the task's "name" field
    name: str
    # Passed as the task's total
    max_size: int
|  |  | ||||||
def progress_for(settings: Optional[ProgressSettings]) -> 'ProgressContextManager':
    """
    Create a context manager that displays progress for the given settings.

    Arguments:
        settings {Optional[ProgressSettings]} -- the settings handed to the manager

    Returns:
        ProgressContextManager -- the progress manager
    """
    manager = ProgressContextManager(settings)
    return manager
|  |  | ||||||
|  |  | ||||||
class ProgressContextManager:
    """
    Context manager that shows one task in the module's progress display while
    it is entered. Without settings, entering, advancing and exiting do nothing.
    """

    def __init__(self, settings: Optional[ProgressSettings]):
        self._settings = settings
        # Stays None until a task was added in __enter__
        self._task_id: Optional[TaskID] = None

    def __enter__(self) -> 'ProgressContextManager':
        """Starts the progress display and adds a task, if settings were given."""
        if self._settings:
            _progress.start()
            self._task_id = _progress.add_task(
                self._settings.name,
                total=self._settings.max_size,
                name=self._settings.name
            )

        return self

    # pylint: disable=useless-return
    def __exit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc_value: Optional[BaseException],
            traceback: Optional[TracebackType],
    ) -> Optional[bool]:
        """Removes the task again. Always returns None."""
        if self._task_id is not None:
            _progress.remove_task(self._task_id)

            # The manager that removes the last task cleans up the display
            if len(_progress.task_ids) == 0:
                _progress.stop()
                _progress.refresh()

        return None

    def advance(self, amount: float) -> None:
        """
        Advances the progress bar by the given amount, if a task was added.
        """
        if self._task_id is None:
            return

        _progress.advance(self._task_id, amount)
							
								
								
									
										229
									
								
								PFERD/report.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										229
									
								
								PFERD/report.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,229 @@ | |||||||
|  | import json | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import Any, Dict, List, Optional, Set | ||||||
|  |  | ||||||
|  |  | ||||||
class ReportLoadError(Exception):
    """
    The data of a stored report does not have the expected format.
    """
|  |  | ||||||
|  |  | ||||||
class MarkDuplicateError(Exception):
    """
    A file was marked although its path had already been marked before.

    The offending path is available as the "path" attribute.
    """

    def __init__(self, path: PurePath):
        message = f"A previous file already used path {path}"
        super().__init__(message)
        self.path = path
|  |  | ||||||
|  |  | ||||||
class MarkConflictError(Exception):
    """
    Marking the path would have caused a conflict.

    There are two ways for a conflict to arise: The new file has the same path
    as the parent directory of a known file, or one of the new file's parent
    directories has the same path as a known file. Both cases would need a
    file and a directory to share one path, which is usually not possible.

    The new path is available as "path", the path it clashes with as
    "collides_with".
    """

    def __init__(self, path: PurePath, collides_with: PurePath):
        message = f"File at {path} collides with previous file at {collides_with}"
        super().__init__(message)
        self.path = path
        self.collides_with = collides_with
|  |  | ||||||
|  |  | ||||||
|  | class Report: | ||||||
|  |     """ | ||||||
|  |     A report of a synchronization. Includes all files found by the crawler, as | ||||||
|  |     well as the set of changes made to local files. | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     def __init__(self) -> None: | ||||||
|  |         # Paths found by the crawler, untransformed | ||||||
|  |         self.found_paths: Set[PurePath] = set() | ||||||
|  |  | ||||||
|  |         # Files reserved for metadata files (e. g. the report file or cookies) | ||||||
|  |         # that can't be overwritten by user transforms and won't be cleaned up | ||||||
|  |         # at the end. | ||||||
|  |         self.reserved_files: Set[PurePath] = set() | ||||||
|  |  | ||||||
|  |         # Files found by the crawler, transformed. Only includes files that | ||||||
|  |         # were downloaded (or a download was attempted) | ||||||
|  |         self.known_files: Set[PurePath] = set() | ||||||
|  |  | ||||||
|  |         self.added_files: Set[PurePath] = set() | ||||||
|  |         self.changed_files: Set[PurePath] = set() | ||||||
|  |         self.deleted_files: Set[PurePath] = set() | ||||||
|  |         # Files that should have been deleted by the cleanup but weren't | ||||||
|  |         self.not_deleted_files: Set[PurePath] = set() | ||||||
|  |  | ||||||
|  |         # Custom crawler-specific data | ||||||
|  |         self.custom: Dict[str, Any] = dict() | ||||||
|  |  | ||||||
|  |         # Encountered errors and warnings | ||||||
|  |         self.encountered_warnings: List[str] = [] | ||||||
|  |         self.encountered_errors: List[str] = [] | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]: | ||||||
|  |         result: Any = data.get(key, []) | ||||||
|  |  | ||||||
|  |         if not isinstance(result, list): | ||||||
|  |             raise ReportLoadError(f"Incorrect format: {key!r} is not a list") | ||||||
|  |  | ||||||
|  |         for elem in result: | ||||||
|  |             if not isinstance(elem, str): | ||||||
|  |                 raise ReportLoadError(f"Incorrect format: {key!r} must contain only strings") | ||||||
|  |  | ||||||
|  |         return result | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     def _get_str_dictionary(data: Dict[str, Any], key: str) -> Dict[str, Any]: | ||||||
|  |         result: Dict[str, Any] = data.get(key, {}) | ||||||
|  |  | ||||||
|  |         if not isinstance(result, dict): | ||||||
|  |             raise ReportLoadError(f"Incorrect format: {key!r} is not a dictionary") | ||||||
|  |  | ||||||
|  |         return result | ||||||
|  |  | ||||||
|  |     @classmethod | ||||||
|  |     def load(cls, path: Path) -> "Report": | ||||||
|  |         """ | ||||||
|  |         May raise OSError, UnicodeDecodeError, JsonDecodeError, ReportLoadError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         with open(path, encoding="utf-8") as f: | ||||||
|  |             data = json.load(f) | ||||||
|  |  | ||||||
|  |         if not isinstance(data, dict): | ||||||
|  |             raise ReportLoadError("Incorrect format: Root is not an object") | ||||||
|  |  | ||||||
|  |         self = cls() | ||||||
|  |         for elem in self._get_list_of_strs(data, "found"): | ||||||
|  |             self.found(PurePath(elem)) | ||||||
|  |         for elem in self._get_list_of_strs(data, "reserved"): | ||||||
|  |             self.mark_reserved(PurePath(elem)) | ||||||
|  |         for elem in self._get_list_of_strs(data, "known"): | ||||||
|  |             self.mark(PurePath(elem)) | ||||||
|  |         for elem in self._get_list_of_strs(data, "added"): | ||||||
|  |             self.add_file(PurePath(elem)) | ||||||
|  |         for elem in self._get_list_of_strs(data, "changed"): | ||||||
|  |             self.change_file(PurePath(elem)) | ||||||
|  |         for elem in self._get_list_of_strs(data, "deleted"): | ||||||
|  |             self.delete_file(PurePath(elem)) | ||||||
|  |         for elem in self._get_list_of_strs(data, "not_deleted"): | ||||||
|  |             self.not_delete_file(PurePath(elem)) | ||||||
|  |         self.custom = self._get_str_dictionary(data, "custom") | ||||||
|  |         self.encountered_errors = self._get_list_of_strs(data, "encountered_errors") | ||||||
|  |         self.encountered_warnings = self._get_list_of_strs(data, "encountered_warnings") | ||||||
|  |  | ||||||
|  |         return self | ||||||
|  |  | ||||||
|  |     def store(self, path: Path) -> None: | ||||||
|  |         """ | ||||||
|  |         May raise OSError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         data = { | ||||||
|  |             "found": [str(path) for path in sorted(self.found_paths)], | ||||||
|  |             "reserved": [str(path) for path in sorted(self.reserved_files)], | ||||||
|  |             "known": [str(path) for path in sorted(self.known_files)], | ||||||
|  |             "added": [str(path) for path in sorted(self.added_files)], | ||||||
|  |             "changed": [str(path) for path in sorted(self.changed_files)], | ||||||
|  |             "deleted": [str(path) for path in sorted(self.deleted_files)], | ||||||
|  |             "not_deleted": [str(path) for path in sorted(self.not_deleted_files)], | ||||||
|  |             "custom": self.custom, | ||||||
|  |             "encountered_warnings": self.encountered_warnings, | ||||||
|  |             "encountered_errors": self.encountered_errors, | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         with open(path, "w", encoding="utf-8") as f: | ||||||
|  |             json.dump(data, f, indent=2, sort_keys=True) | ||||||
|  |             f.write("\n")  # json.dump doesn't do this | ||||||
|  |  | ||||||
|  |     def found(self, path: PurePath) -> None: | ||||||
|  |         self.found_paths.add(path) | ||||||
|  |  | ||||||
|  |     def mark_reserved(self, path: PurePath) -> None: | ||||||
|  |         if path in self.marked: | ||||||
|  |             raise RuntimeError("Trying to reserve an already reserved file") | ||||||
|  |  | ||||||
|  |         self.reserved_files.add(path) | ||||||
|  |  | ||||||
|  |     def mark(self, path: PurePath) -> None: | ||||||
|  |         """ | ||||||
|  |         Mark a previously unknown file as known. | ||||||
|  |  | ||||||
|  |         May throw a MarkDuplicateError or a MarkConflictError. For more detail, | ||||||
|  |         see the respective exception's docstring. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         for other in self.marked: | ||||||
|  |             if path == other: | ||||||
|  |                 raise MarkDuplicateError(path) | ||||||
|  |  | ||||||
|  |             if path.is_relative_to(other) or other.is_relative_to(path): | ||||||
|  |                 raise MarkConflictError(path, other) | ||||||
|  |  | ||||||
|  |         self.known_files.add(path) | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def marked(self) -> Set[PurePath]: | ||||||
|  |         return self.known_files | self.reserved_files | ||||||
|  |  | ||||||
|  |     def is_marked(self, path: PurePath) -> bool: | ||||||
|  |         return path in self.marked | ||||||
|  |  | ||||||
|  |     def add_file(self, path: PurePath) -> None: | ||||||
|  |         """ | ||||||
|  |         Unlike mark(), this function accepts any paths. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.added_files.add(path) | ||||||
|  |  | ||||||
|  |     def change_file(self, path: PurePath) -> None: | ||||||
|  |         """ | ||||||
|  |         Unlike mark(), this function accepts any paths. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.changed_files.add(path) | ||||||
|  |  | ||||||
|  |     def delete_file(self, path: PurePath) -> None: | ||||||
|  |         """ | ||||||
|  |         Unlike mark(), this function accepts any paths. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.deleted_files.add(path) | ||||||
|  |  | ||||||
|  |     def not_delete_file(self, path: PurePath) -> None: | ||||||
|  |         """ | ||||||
|  |         Unlike mark(), this function accepts any paths. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.not_deleted_files.add(path) | ||||||
|  |  | ||||||
|  |     def add_custom_value(self, key: str, value: Any) -> None: | ||||||
|  |         """ | ||||||
|  |         Adds a custom value under the passed key, overwriting any existing | ||||||
|  |         """ | ||||||
|  |         self.custom[key] = value | ||||||
|  |  | ||||||
|  |     def get_custom_value(self, key: str) -> Optional[Any]: | ||||||
|  |         """ | ||||||
|  |         Retrieves a custom value for the given key. | ||||||
|  |         """ | ||||||
|  |         return self.custom.get(key) | ||||||
|  |  | ||||||
|  |     def add_error(self, error: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Adds an error to this report's error list. | ||||||
|  |         """ | ||||||
|  |         self.encountered_errors.append(error) | ||||||
|  |  | ||||||
|  |     def add_warning(self, warning: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Adds a warning to this report's warning list. | ||||||
|  |         """ | ||||||
|  |         self.encountered_warnings.append(warning) | ||||||
| @@ -1,79 +0,0 @@ | |||||||
| """Helper functions and classes for temporary folders.""" |  | ||||||
|  |  | ||||||
| import logging |  | ||||||
| import shutil |  | ||||||
| from pathlib import Path |  | ||||||
| from types import TracebackType |  | ||||||
| from typing import Optional, Type |  | ||||||
|  |  | ||||||
| from .location import Location |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
|  |  | ||||||
class TmpDir(Location):
    """A temporary folder that can create files or nested temp folders."""

    def __init__(self, path: Path):
        """
        Create a new, empty temporary folder at the given path.

        Anything already present at the path is deleted first.
        """
        super().__init__(path)
        self._counter = 0
        # Start from a clean slate, then (re)create the folder itself.
        self.cleanup()
        self.path.mkdir(parents=True, exist_ok=True)

    def __str__(self) -> str:
        """Format the folder as a string."""
        return f"Folder at {self.path}"

    def __enter__(self) -> 'TmpDir':
        """Context manager entry function. Returns the folder itself."""
        return self

    # pylint: disable=useless-return
    def __exit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc_value: Optional[BaseException],
            traceback: Optional[TracebackType],
    ) -> Optional[bool]:
        """Context manager exit function. Calls cleanup()."""
        self.cleanup()
        # Returning None lets an exception raised inside the "with" propagate.
        return None

    def _next_name(self, prefix: Optional[str]) -> str:
        """Build the next "<prefix>-<counter>" name; the prefix defaults to "tmp"."""
        return f"{prefix or 'tmp'}-{self._inc_and_get_counter():03}"

    def new_path(self, prefix: Optional[str] = None) -> Path:
        """
        Return a unique path inside the directory. Doesn't create a file or
        directory.
        """

        name = self._next_name(prefix)

        LOGGER.debug("Creating temp file %s", name)

        return self.resolve(Path(name))

    def new_subdir(self, prefix: Optional[str] = None) -> 'TmpDir':
        """
        Create a new nested temporary folder and return it.
        """

        name = self._next_name(prefix)
        sub_path = self.resolve(Path(name))
        sub_path.mkdir(parents=True)

        LOGGER.debug("Creating temp dir %s at %s", name, sub_path)

        return TmpDir(sub_path)

    def cleanup(self) -> None:
        """Delete this folder and all contained files."""
        LOGGER.debug("Deleting temp folder %s", self.path)

        resolved = self.path.resolve()
        if resolved.exists():
            shutil.rmtree(resolved)

    def _inc_and_get_counter(self) -> int:
        """Return the current counter value, then increment it by one."""
        self._counter += 1
        return self._counter - 1
| @@ -1,142 +0,0 @@ | |||||||
| """ |  | ||||||
| Transforms let the user define functions to decide where the downloaded files |  | ||||||
| should be placed locally. They let the user do more advanced things like moving |  | ||||||
| only files whose names match a regex, or renaming files from one numbering |  | ||||||
| scheme to another. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import os |  | ||||||
| import re |  | ||||||
| from dataclasses import dataclass |  | ||||||
| from pathlib import PurePath |  | ||||||
| from typing import Callable, List, Optional, TypeVar |  | ||||||
|  |  | ||||||
| from .utils import PathLike, Regex, to_path, to_pattern |  | ||||||
|  |  | ||||||
| Transform = Callable[[PurePath], Optional[PurePath]] |  | ||||||
|  |  | ||||||
|  |  | ||||||
@dataclass
class Transformable:
    """
    An object that can be transformed by a Transform.
    """

    # The path a Transform reads and apply_transform() overwrites.
    path: PurePath
|  |  | ||||||
|  |  | ||||||
| TF = TypeVar("TF", bound=Transformable) |  | ||||||
|  |  | ||||||
|  |  | ||||||
def apply_transform(
        transform: Transform,
        transformables: List[TF],
) -> List[TF]:
    """
    Apply a Transform to multiple Transformables, discarding those that were
    not transformed by the Transform.

    The kept Transformables are the original objects with their path
    overwritten in place; their relative order is preserved.
    """

    kept: List[TF] = []
    for item in transformables:
        target = transform(item.path)
        if not target:
            # The transform rejected this path, so drop the item.
            continue
        item.path = target
        kept.append(item)
    return kept
|  |  | ||||||
| # Transform combinators |  | ||||||
|  |  | ||||||
def keep(path: PurePath) -> Optional[PurePath]:
    """
    Transform that accepts every path and returns it unchanged.
    """
    return path
|  |  | ||||||
def attempt(*args: Transform) -> Transform:
    """
    Combine transforms into one that returns the result of the first
    transform that accepts the path, or None if none of them does.
    """
    def inner(path: PurePath) -> Optional[PurePath]:
        for candidate in args:
            outcome = candidate(path)
            if outcome:
                return outcome
        return None
    return inner
|  |  | ||||||
def optionally(transform: Transform) -> Transform:
    """
    Wrap a transform so that paths it rejects are passed through unchanged
    instead of being discarded.
    """
    def unchanged(path: PurePath) -> Optional[PurePath]:
        return path
    return attempt(transform, unchanged)
|  |  | ||||||
def do(*args: Transform) -> Transform:
    """
    Chain transforms: each one receives the result of the previous one. The
    combined transform returns None as soon as any step rejects its input.
    """
    def inner(path: PurePath) -> Optional[PurePath]:
        current = path
        for step in args:
            produced = step(current)
            if not produced:
                return None
            current = produced
        return current
    return inner
|  |  | ||||||
def predicate(pred: Callable[[PurePath], bool]) -> Transform:
    """
    Build a transform that keeps a path unchanged if pred(path) is true and
    rejects it (returns None) otherwise.
    """
    def inner(path: PurePath) -> Optional[PurePath]:
        return path if pred(path) else None
    return inner
|  |  | ||||||
def glob(pattern: str) -> Transform:
    """
    Build a transform that keeps only paths for which
    PurePath.match(pattern) succeeds.
    """
    def matches(path: PurePath) -> bool:
        return path.match(pattern)
    return predicate(matches)
|  |  | ||||||
def move_dir(source_dir: PathLike, target_dir: PathLike) -> Transform:
    """
    Build a transform that relocates every path below source_dir to the same
    relative position below target_dir. All other paths, including
    source_dir itself, are rejected.
    """
    source_path = to_path(source_dir)
    target_path = to_path(target_dir)

    def inner(path: PurePath) -> Optional[PurePath]:
        if source_path not in path.parents:
            return None
        return target_path / path.relative_to(source_path)
    return inner
|  |  | ||||||
def move(source: PathLike, target: PathLike) -> Transform:
    """
    Build a transform that maps exactly the path source to target and
    rejects every other path.
    """
    source_path = to_path(source)
    target_path = to_path(target)

    def inner(path: PurePath) -> Optional[PurePath]:
        return target_path if path == source_path else None
    return inner
|  |  | ||||||
def rename(source: str, target: str) -> Transform:
    """
    Build a transform that replaces the final path component with target if
    it equals source, and rejects every other path.
    """
    def inner(path: PurePath) -> Optional[PurePath]:
        if path.name != source:
            return None
        return path.with_name(target)
    return inner
|  |  | ||||||
def re_move(regex: Regex, target: str) -> Transform:
    """
    Build a transform that matches the whole path string against regex and,
    on a match, returns target formatted with the match groups: {0} is the
    entire match, {1}, {2}, ... are the capture groups.
    """
    def inner(path: PurePath) -> Optional[PurePath]:
        match = to_pattern(regex).fullmatch(str(path))
        if not match:
            return None
        groups = [match.group(0), *match.groups()]
        return PurePath(target.format(*groups))
    return inner
|  |  | ||||||
def re_rename(regex: Regex, target: str) -> Transform:
    """
    Build a transform that matches the final path component against regex
    and, on a match, replaces it with target formatted with the match groups:
    {0} is the entire match, {1}, {2}, ... are the capture groups.
    """
    def inner(path: PurePath) -> Optional[PurePath]:
        match = to_pattern(regex).fullmatch(path.name)
        if not match:
            return None
        groups = [match.group(0), *match.groups()]
        return path.with_name(target.format(*groups))
    return inner
|  |  | ||||||
|  |  | ||||||
def sanitize_windows_path(path: PurePath) -> Optional[PurePath]:
    """
    A small function to escape characters that are forbidden in windows path names.
    This method is a no-op on other operating systems.

    On windows, each of the characters < > : " / | ? * in a path component is
    replaced by an underscore. The anchor (drive and root, e.g. "C:\\") is
    kept as it is, since replacing its colon would turn it into an ordinary
    directory name.
    """
    if os.name != 'nt':
        return path

    # Escape windows illegal path characters. The backslash is not listed
    # because windows paths are split on it, so it never occurs inside a
    # component.
    anchor = path.anchor
    sanitized_parts = [
        part if part == anchor else re.sub(r'[<>:"/|?*]', "_", part)
        for part in path.parts
    ]
    return PurePath(*sanitized_parts)
							
								
								
									
										443
									
								
								PFERD/transformer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										443
									
								
								PFERD/transformer.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,443 @@ | |||||||
|  | import ast | ||||||
|  | import re | ||||||
|  | from abc import ABC, abstractmethod | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from enum import Enum | ||||||
|  | from pathlib import PurePath | ||||||
|  | from typing import Callable, Dict, List, Optional, Sequence, TypeVar, Union | ||||||
|  |  | ||||||
|  | from .logging import log | ||||||
|  | from .utils import fmt_path, str_path | ||||||
|  |  | ||||||
|  |  | ||||||
class ArrowHead(Enum):
    """
    The head of a rule's arrow.
    """

    NORMAL = 0    # written as ">"
    SEQUENCE = 1  # written as ">>"
|  |  | ||||||
|  |  | ||||||
class Ignore:
    """
    Marker for a rule whose right side is "!".
    """
|  |  | ||||||
|  |  | ||||||
class Empty:
    """
    Marker for a rule that has no right side at all.
    """
|  |  | ||||||
|  |  | ||||||
|  | RightSide = Union[str, Ignore, Empty] | ||||||
|  |  | ||||||
|  |  | ||||||
@dataclass
class Transformed:
    """
    Result of a transformation that matched and produced a path.
    """

    # The path produced by the transformation.
    path: PurePath
|  |  | ||||||
|  |  | ||||||
class Ignored:
    """
    Result of a transformation that matched a rule whose right side is an
    Ignore marker.
    """
|  |  | ||||||
|  |  | ||||||
|  | TransformResult = Optional[Union[Transformed, Ignored]] | ||||||
|  |  | ||||||
|  |  | ||||||
@dataclass
class Rule:
    """
    A parsed transformation rule of the form "LEFT -NAME-HEAD RIGHT".
    """

    left: str         # left side of the arrow
    left_index: int   # index in the line at which the left side starts
    name: str         # arrow name, e.g. "", "exact" or "name-re"
    head: ArrowHead
    right: RightSide  # a string, or an Ignore/Empty marker
    right_index: int  # index in the line directly after the arrow head

    def right_result(self, path: PurePath) -> Union[str, Transformed, Ignored]:
        """
        Resolve the right side of this rule for a path its left side matched.

        A string right side is returned as-is for the caller to process, an
        Ignore marker yields Ignored() and an Empty marker yields the path
        unchanged, wrapped in Transformed.
        """
        side = self.right
        if isinstance(side, str):
            return side
        if isinstance(side, Ignore):
            return Ignored()
        if isinstance(side, Empty):
            return Transformed(path)
        raise RuntimeError(f"Right side has invalid type {type(self.right)}")
|  |  | ||||||
|  |  | ||||||
class Transformation(ABC):
    """
    Base class of all transformations. Every transformation is backed by the
    rule it was created from.
    """

    def __init__(self, rule: Rule):
        self.rule = rule

    @abstractmethod
    def transform(self, path: PurePath) -> TransformResult:
        """
        Apply this transformation to a path.

        Implementations return None if the rule does not match the path,
        Transformed if it produced a path, or Ignored.
        """
|  |  | ||||||
|  |  | ||||||
class ExactTf(Transformation):
    """
    Matches a path only if it equals the rule's left side exactly.
    """

    def transform(self, path: PurePath) -> TransformResult:
        if path != PurePath(self.rule.left):
            return None

        outcome = self.rule.right_result(path)
        if isinstance(outcome, str):
            # A string right side is the new path.
            return Transformed(PurePath(outcome))

        return outcome
|  |  | ||||||
|  |  | ||||||
class ExactReTf(Transformation):
    """
    Matches a path if the rule's left side, taken as a regex, matches the
    whole path string. A string right side is evaluated as the body of an
    f-string that can refer to the match groups.
    """

    def transform(self, path: PurePath) -> TransformResult:
        match = re.fullmatch(self.rule.left, str_path(path))
        if not match:
            return None

        right = self.rule.right_result(path)
        if not isinstance(right, str):
            return right

        # NOTE(review): this eval()s the right side of the rule as an
        # f-string, so the rule text can run arbitrary expressions. Rules
        # must only come from a source that is as trusted as the program.
        result = eval(f"f{right!r}", {}, self._format_variables(match))
        return Transformed(PurePath(result))

    @staticmethod
    def _format_variables(match: "re.Match[str]") -> Dict[str, Union[str, int, float]]:
        """
        Build the names visible to the right side: g<N> for every group that
        took part in the match (g0 is the whole match), i<N> and f<N> if that
        group parses as an int or float, and every named group by its name.
        """
        # For some reason, mypy thinks that "groups" has type List[str]. But
        # since elements of "match.groups()" can be None, mypy is wrong.
        groups: Sequence[Optional[str]] = [match[0]] + list(match.groups())

        variables: Dict[str, Union[str, int, float]] = {}
        for index, group in enumerate(groups):
            if group is None:
                continue

            variables[f"g{index}"] = group

            for prefix, convert in (("i", int), ("f", float)):
                try:
                    variables[f"{prefix}{index}"] = convert(group)
                except ValueError:
                    pass

        # Named groups are added last, so they win over a g/i/f name they
        # collide with. Named groups that did not take part in the match are
        # stored with the value None.
        named_groups: Dict[str, str] = match.groupdict()
        for name, capture in named_groups.items():
            variables[name] = capture

        return variables
|  |  | ||||||
|  |  | ||||||
class RenamingParentsTf(Transformation):
    """
    Applies another transformation to the leading parts of a path, trying the
    whole path first and then ever shorter prefixes down to the empty path.
    The part of the path after the matched prefix is appended to the result.
    """

    def __init__(self, sub_tf: Transformation):
        super().__init__(sub_tf.rule)
        self.sub_tf = sub_tf

    def transform(self, path: PurePath) -> TransformResult:
        parts = path.parts
        for split in reversed(range(len(parts) + 1)):
            parent = PurePath(*parts[:split])
            child = PurePath(*parts[split:])

            transformed = self.sub_tf.transform(parent)
            if not transformed:
                continue
            if isinstance(transformed, Transformed):
                return Transformed(transformed.path / child)
            if isinstance(transformed, Ignored):
                return transformed
            raise RuntimeError(f"Invalid transform result of type {type(transformed)}: {transformed}")

        return None
|  |  | ||||||
|  |  | ||||||
class RenamingPartsTf(Transformation):
    """
    Applies another transformation to every part of a path on its own and
    joins the results. Parts the transformation does not match are kept
    unchanged; if no part matches at all, the path as a whole does not match.
    """

    def __init__(self, sub_tf: Transformation):
        super().__init__(sub_tf.rule)
        self.sub_tf = sub_tf

    def transform(self, path: PurePath) -> TransformResult:
        rebuilt = PurePath()
        matched_any = False
        for part in path.parts:
            transformed = self.sub_tf.transform(PurePath(part))
            if not transformed:
                rebuilt /= part
                continue
            if isinstance(transformed, Ignored):
                # A single ignored part ignores the entire path.
                return transformed
            if not isinstance(transformed, Transformed):
                raise RuntimeError(f"Invalid transform result of type {type(transformed)}: {transformed}")
            rebuilt /= transformed.path
            matched_any = True

        return Transformed(rebuilt) if matched_any else None
|  |  | ||||||
|  |  | ||||||
class RuleParseError(Exception):
    """
    Raised when a rule line can't be parsed. Keeps the line, whose current
    index is where the error occurred, and a description of what went wrong.
    """

    def __init__(self, line: "Line", reason: str):
        message = f"Error in rule on line {line.line_nr}, column {line.index}: {reason}"
        super().__init__(message)

        self.line = line
        self.reason = reason

    def pretty_print(self) -> None:
        """
        Log the offending line with a marker below the error position.
        """
        log.error(f"Error parsing rule on line {self.line.line_nr}:")
        log.error_contd(self.line.line)
        # Indent the marker so that it points at the current index.
        log.error_contd(f"{' ' * self.line.index}^--- {self.reason}")
|  |  | ||||||
|  |  | ||||||
|  | T = TypeVar("T") | ||||||
|  |  | ||||||
|  |  | ||||||
class Line:
    """
    A rule line together with a cursor (index) into it, plus the primitives
    the rule parser is built from.
    """

    def __init__(self, line: str, line_nr: int):
        self._line = line
        self._line_nr = line_nr
        self._index = 0

    @property
    def line(self) -> str:
        """The complete text of the line."""
        return self._line

    @property
    def line_nr(self) -> int:
        """The line number this line was created with."""
        return self._line_nr

    @property
    def index(self) -> int:
        """The current cursor position within the line."""
        return self._index

    @index.setter
    def index(self, index: int) -> None:
        self._index = index

    @property
    def rest(self) -> str:
        """Everything from the cursor to the end of the line."""
        return self.line[self.index:]

    def peek(self, amount: int = 1) -> str:
        """
        Return up to amount characters at the cursor without moving it. The
        result is shorter (possibly empty) close to the end of the line.
        """
        remaining = self.rest
        return remaining[:amount]

    def take(self, amount: int = 1) -> str:
        """
        Like peek(), but also moves the cursor past the returned characters.
        """
        taken = self.peek(amount)
        self.index = self.index + len(taken)
        return taken

    def expect(self, string: str) -> str:
        """
        Consume exactly the given string at the cursor and return it. Raises
        a RuleParseError, without moving the cursor, if the line continues
        differently.
        """
        if self.peek(len(string)) != string:
            raise RuleParseError(self, f"Expected {string!r}")
        return self.take(len(string))

    def expect_with(self, string: str, value: T) -> T:
        """
        Consume the given string like expect(), then return value instead.
        """
        self.expect(string)
        return value

    def one_of(self, parsers: List[Callable[[], T]], description: str) -> T:
        """
        Try the parsers in order and return the result of the first one that
        doesn't raise a RuleParseError. The cursor is reset after each failed
        attempt. If all of them fail, a RuleParseError with the description
        is raised.
        """
        for parser in parsers:
            checkpoint = self.index
            try:
                return parser()
            except RuleParseError:
                self.index = checkpoint

        raise RuleParseError(self, description)
|  |  | ||||||
|  |  | ||||||
|  | # RULE = LEFT SPACE '-' NAME '-' HEAD (SPACE RIGHT)? | ||||||
|  | # SPACE = ' '+ | ||||||
|  | # NAME = '' | 'exact' | 'name' | 're' | 'exact-re' | 'name-re' | ||||||
|  | # HEAD = '>' | '>>' | ||||||
|  | # LEFT = STR | QUOTED_STR | ||||||
|  | # RIGHT = STR | QUOTED_STR | '!' | ||||||
|  |  | ||||||
|  |  | ||||||
def parse_zero_or_more_spaces(line: Line) -> None:
    """
    Move the cursor past any run of spaces at the current position.
    """
    while True:
        if line.peek() != " ":
            return
        line.take()
|  |  | ||||||
|  |  | ||||||
def parse_one_or_more_spaces(line: Line) -> None:
    """
    Move the cursor past a run of spaces. Raises a RuleParseError (via
    line.expect) if there is not at least one space at the cursor.
    """
    line.expect(" ")
    parse_zero_or_more_spaces(line)
|  |  | ||||||
|  |  | ||||||
def parse_str(line: Line) -> str:
    """
    Parse an unquoted string: every character up to the next space or the
    end of the line. Raises a RuleParseError if that string would be empty.
    """
    start = line.index
    # peek() returns "" once the end of the line is reached.
    while line.peek() not in ("", " "):
        line.take()

    text = line.line[start:line.index]
    if not text:
        raise RuleParseError(line, "Expected non-space character")
    return text
|  |  | ||||||
|  |  | ||||||
|  | QUOTATION_MARKS = {'"', "'"} | ||||||
|  |  | ||||||
|  |  | ||||||
def parse_quoted_str(line: Line) -> str:
    """
    Parse a string literal enclosed in single or double quotes and return its
    value as evaluated by ast.literal_eval.

    Raises a RuleParseError if the cursor is not at a quotation mark, if the
    literal is not terminated, or if evaluating it raises a SyntaxError. In
    the last case the cursor is reset to the start of the literal.
    """
    # Points to first character of string literal
    start_index = line.index

    quotation_mark = line.peek()
    if quotation_mark not in QUOTATION_MARKS:
        raise RuleParseError(line, "Expected quotation mark")
    line.take()

    escaped = False
    while c := line.peek():
        line.take()

        if escaped:
            # The character after a backslash never ends the literal.
            escaped = False
            continue
        if c == "\\":
            escaped = True
            continue
        if c != quotation_mark:
            continue

        # Found the closing quotation mark; it has already been consumed.
        literal = line.line[start_index:line.index]
        try:
            return ast.literal_eval(literal)
        except SyntaxError as e:
            line.index = start_index
            raise RuleParseError(line, str(e)) from e

    raise RuleParseError(line, "Expected end of string literal")
|  |  | ||||||
|  |  | ||||||
def parse_left(line: Line) -> str:
    """
    Parse the left side of a rule: a quoted string if the cursor is at a
    quotation mark, an unquoted string otherwise.
    """
    starts_quoted = line.peek() in QUOTATION_MARKS
    return parse_quoted_str(line) if starts_quoted else parse_str(line)
|  |  | ||||||
|  |  | ||||||
def parse_right(line: Line) -> Union[str, Ignore]:
    """
    Parse the right side of a rule: a quoted string, an unquoted string, or
    an unquoted "!", which yields an Ignore marker. A quoted "!" stays a
    plain string.
    """
    if line.peek() in QUOTATION_MARKS:
        return parse_quoted_str(line)

    string = parse_str(line)
    return Ignore() if string == "!" else string
|  |  | ||||||
|  |  | ||||||
def parse_arrow_name(line: Line) -> str:
    """
    Parse the name between the two dashes of an arrow and return it. The
    name may be empty.
    """
    # Order matters: the first name that matches wins, so a name has to come
    # before any name that is a prefix of it ("exact-re" before "exact").
    names = ["exact-re", "exact", "name-re", "name", "re", ""]
    return line.one_of(
        [lambda name=name: line.expect(name) for name in names],
        "Expected arrow name",
    )
|  |  | ||||||
|  |  | ||||||
def parse_arrow_head(line: Line) -> ArrowHead:
    """
    Parse the head of an arrow: ">>" is a SEQUENCE head, ">" a NORMAL head.
    """
    # ">>" has to be tried first since ">" is a prefix of it.
    heads = [(">>", ArrowHead.SEQUENCE), (">", ArrowHead.NORMAL)]
    return line.one_of(
        [lambda text=text, head=head: line.expect_with(text, head) for text, head in heads],
        "Expected arrow head",
    )
|  |  | ||||||
|  |  | ||||||
def parse_eol(line: Line) -> None:
    """
    Raise a RuleParseError unless the cursor is at the end of the line.
    """
    if line.peek() != "":
        raise RuleParseError(line, "Expected end of line")
|  |  | ||||||
|  |  | ||||||
def parse_rule(line: Line) -> Rule:
    """
    Parse a complete rule: LEFT SPACE '-' NAME '-' HEAD (SPACE RIGHT)?

    If nothing but spaces follows the arrow head, the rule's right side is
    an Empty marker. May raise a RuleParseError.
    """
    parse_zero_or_more_spaces(line)
    left_index = line.index
    left = parse_left(line)

    parse_one_or_more_spaces(line)

    line.expect("-")
    name = parse_arrow_name(line)
    line.expect("-")
    head = parse_arrow_head(line)

    right_index = line.index
    right: RightSide = Empty()
    try:
        # First assume that there is no right side at all.
        parse_zero_or_more_spaces(line)
        parse_eol(line)
    except RuleParseError:
        # Something follows the arrow, so rewind and parse it as right side.
        line.index = right_index
        parse_one_or_more_spaces(line)
        right = parse_right(line)
        parse_eol(line)

    return Rule(
        left=left,
        left_index=left_index,
        name=name,
        head=head,
        right=right,
        right_index=right_index,
    )
|  |  | ||||||
|  |  | ||||||
def parse_transformation(line: Line) -> Transformation:
    """
    Parse a rule and build the transformation selected by its arrow name.

    Raises a RuleParseError if the rule can't be parsed or if a "name" rule
    has a left side made up of more than one path segment.
    """
    rule = parse_rule(line)

    if rule.name == "name" and len(PurePath(rule.left).parts) > 1:
        line.index = rule.left_index
        raise RuleParseError(line, "Expected name, not multiple segments")

    builders: Dict[str, Callable[[], Transformation]] = {
        "": lambda: RenamingParentsTf(ExactTf(rule)),
        "exact": lambda: ExactTf(rule),
        "name": lambda: RenamingPartsTf(ExactTf(rule)),
        "re": lambda: RenamingParentsTf(ExactReTf(rule)),
        "exact-re": lambda: ExactReTf(rule),
        "name-re": lambda: RenamingPartsTf(ExactReTf(rule)),
    }

    if rule.name not in builders:
        raise RuntimeError(f"Invalid arrow name {rule.name!r}")
    return builders[rule.name]()
|  |  | ||||||
|  |  | ||||||
class Transformer:
    """
    Applies a list of transformation rules to paths.
    """

    def __init__(self, rules: str):
        """
        Parse one rule per non-blank line of the given string. Lines are
        stripped of surrounding whitespace before they are parsed.

        May throw a RuleParseError.
        """

        self._tfs = []
        for line_nr, raw_line in enumerate(rules.split("\n")):
            stripped = raw_line.strip()
            if not stripped:
                continue
            tf = parse_transformation(Line(stripped, line_nr))
            self._tfs.append((stripped, tf))

    def transform(self, path: PurePath) -> Optional[PurePath]:
        """
        Run the path through the rules in order and return the resulting
        path, or None if a matching rule ignores the path.

        A match on a NORMAL arrow stops at that rule, a match on a SEQUENCE
        arrow continues with the updated path. A rule that raises while being
        tested is logged as a warning and skipped. If no rule matches, the
        path is returned unchanged.
        """
        for rule_nr, (line, tf) in enumerate(self._tfs, start=1):
            log.explain(f"Testing rule {rule_nr}: {line}")

            try:
                result = tf.transform(path)
            except Exception as e:
                log.warn(f"Error while testing rule {rule_nr}: {line}")
                log.warn_contd(str(e))
                continue

            if not result:
                continue

            if isinstance(result, Ignored):
                log.explain("Match found, path ignored")
                return None

            head = tf.rule.head
            if head == ArrowHead.NORMAL:
                log.explain(f"Match found, transformed path to {fmt_path(result.path)}")
                path = result.path
                break
            if head == ArrowHead.SEQUENCE:
                log.explain(f"Match found, updated path to {fmt_path(result.path)}")
                path = result.path
                continue

            # Reached only for an arrow head that is neither NORMAL nor
            # SEQUENCE; note that the message reports the result, not the head.
            raise RuntimeError(f"Invalid transform result of type {type(result)}: {result}")

        log.explain(f"Final result: {fmt_path(path)}")
        return path
							
								
								
									
										208
									
								
								PFERD/utils.py
									
									
									
									
									
								
							
							
						
						
									
										208
									
								
								PFERD/utils.py
									
									
									
									
									
								
							| @@ -1,98 +1,144 @@ | |||||||
| """ | import asyncio | ||||||
| A few utility bobs and bits. | import getpass | ||||||
| """ | import sys | ||||||
|  | import threading | ||||||
| import re | from abc import ABC, abstractmethod | ||||||
|  | from contextlib import AsyncExitStack | ||||||
| from pathlib import Path, PurePath | from pathlib import Path, PurePath | ||||||
| from typing import Optional, Tuple, Union | from types import TracebackType | ||||||
|  | from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar | ||||||
|  | from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit | ||||||
|  |  | ||||||
| import bs4 | import bs4 | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from .progress import ProgressSettings, progress_for, size_from_headers | T = TypeVar("T") | ||||||
|  |  | ||||||
| PathLike = Union[PurePath, str, Tuple[str, ...]] |  | ||||||
|  |  | ||||||
|  |  | ||||||
async def in_daemon_thread(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
    """
    Run ``func(*args, **kwargs)`` in a new daemon thread and await its outcome.

    The value returned by ``func`` becomes the result of this coroutine. An
    exception raised by ``func`` is re-raised in the awaiting coroutine
    instead of leaving it waiting forever.
    """
    loop = asyncio.get_running_loop()
    future: asyncio.Future[T] = loop.create_future()

    def finish(setter: Callable[[Any], None], payload: Any) -> None:
        # Runs on the event loop thread. The awaiting task may have been
        # cancelled in the meantime, in which case there is nothing to set.
        if not future.done():
            setter(payload)

    def thread_func() -> None:
        try:
            result = func(*args, **kwargs)
        except BaseException as e:
            # Futures are not thread-safe, so hand the outcome to the loop
            loop.call_soon_threadsafe(finish, future.set_exception, e)
        else:
            loop.call_soon_threadsafe(finish, future.set_result, result)

    threading.Thread(target=thread_func, daemon=True).start()

    return await future
|  |  | ||||||
|  |  | ||||||
async def ainput(prompt: str) -> str:
    """
    Asynchronous counterpart of ``input``, executed via in_daemon_thread.
    """

    def read_line() -> str:
        return input(prompt)

    return await in_daemon_thread(read_line)
|  |  | ||||||
|  |  | ||||||
async def agetpass(prompt: str) -> str:
    """
    Asynchronous counterpart of ``getpass.getpass``, executed via in_daemon_thread.
    """

    def read_password() -> str:
        return getpass.getpass(prompt)

    return await in_daemon_thread(read_password)
|  |  | ||||||
|  |  | ||||||
|  | async def prompt_yes_no(query: str, default: Optional[bool]) -> bool: | ||||||
|     """ |     """ | ||||||
|     Convert a given PathLike into a Path. |     Asks the user a yes/no question and returns their choice. | ||||||
|     """ |  | ||||||
|     if isinstance(pathlike, tuple): |  | ||||||
|         return Path(*pathlike) |  | ||||||
|     return Path(pathlike) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| Regex = Union[str, re.Pattern] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def to_pattern(regex: Regex) -> re.Pattern: |  | ||||||
|     """ |  | ||||||
|     Convert a regex to a re.Pattern. |  | ||||||
|     """ |  | ||||||
|     if isinstance(regex, re.Pattern): |  | ||||||
|         return regex |  | ||||||
|     return re.compile(regex) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def soupify(response: requests.Response) -> bs4.BeautifulSoup: |  | ||||||
|     """ |  | ||||||
|     Wrap a requests response in a bs4 object. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     return bs4.BeautifulSoup(response.text, "html.parser") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def stream_to_path( |  | ||||||
|         response: requests.Response, |  | ||||||
|         target: Path, |  | ||||||
|         progress_name: Optional[str] = None, |  | ||||||
|         chunk_size: int = 1024 ** 2 |  | ||||||
| ) -> None: |  | ||||||
|     """ |  | ||||||
|     Download a requests response content to a file by streaming it. This |  | ||||||
|     function avoids excessive memory usage when downloading large files. The |  | ||||||
|     chunk_size is in bytes. |  | ||||||
|  |  | ||||||
|     If progress_name is None, no progress bar will be shown. Otherwise a progress |  | ||||||
|     bar will appear, if the download is bigger than an internal threshold. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     with response: |  | ||||||
|         length = size_from_headers(response) |  | ||||||
|         if progress_name and length and int(length) > 1024 * 1024 * 10:  # 10 MiB |  | ||||||
|             settings: Optional[ProgressSettings] = ProgressSettings(progress_name, length) |  | ||||||
|         else: |  | ||||||
|             settings = None |  | ||||||
|  |  | ||||||
|         with open(target, 'wb') as file_descriptor: |  | ||||||
|             with progress_for(settings) as progress: |  | ||||||
|                 for chunk in response.iter_content(chunk_size=chunk_size): |  | ||||||
|                     file_descriptor.write(chunk) |  | ||||||
|                     progress.advance(len(chunk)) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def prompt_yes_no(question: str, default: Optional[bool] = None) -> bool: |  | ||||||
|     """ |  | ||||||
|     Prompts the user a yes/no question and returns their choice. |  | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     if default is True: |     if default is True: | ||||||
|         prompt = "[Y/n]" |         query += " [Y/n] " | ||||||
|     elif default is False: |     elif default is False: | ||||||
|         prompt = "[y/N]" |         query += " [y/N] " | ||||||
|     else: |     else: | ||||||
|         prompt = "[y/n]" |         query += " [y/n] " | ||||||
|  |  | ||||||
|     text = f"{question} {prompt} " |  | ||||||
|     wrong_reply = "Please reply with 'yes'/'y' or 'no'/'n'." |  | ||||||
|  |  | ||||||
|     while True: |     while True: | ||||||
|         response = input(text).strip().lower() |         response = (await ainput(query)).strip().lower() | ||||||
|         if response in {"yes", "ye", "y"}: |         if response == "y": | ||||||
|             return True |             return True | ||||||
|         if response in {"no", "n"}: |         elif response == "n": | ||||||
|             return False |             return False | ||||||
|         if response == "" and default is not None: |         elif response == "" and default is not None: | ||||||
|             return default |             return default | ||||||
|         print(wrong_reply) |  | ||||||
|  |         print("Please answer with 'y' or 'n'.") | ||||||
|  |  | ||||||
|  |  | ||||||
def soupify(data: bytes) -> bs4.BeautifulSoup:
    """
    Build a BeautifulSoup object from HTML data using the "html.parser" parser.
    """

    parser = "html.parser"
    return bs4.BeautifulSoup(data, parser)
|  |  | ||||||
|  |  | ||||||
def url_set_query_param(url: str, param: str, value: str) -> str:
    """
    Set a query parameter in an url, overwriting existing ones with the same name.

    All other query parameters are kept, including those with an empty value.
    Scheme, host, path and fragment are passed through unchanged.
    """
    scheme, netloc, path, query, fragment = urlsplit(url)
    # Without keep_blank_values, parameters such as "a=" would silently vanish
    query_parameters = parse_qs(query, keep_blank_values=True)
    # Replacing the whole list drops every previous value of this parameter
    query_parameters[param] = [value]
    new_query_string = urlencode(query_parameters, doseq=True)

    return urlunsplit((scheme, netloc, path, new_query_string, fragment))
|  |  | ||||||
|  |  | ||||||
def url_set_query_params(url: str, params: Dict[str, str]) -> str:
    """
    Apply every entry of ``params`` to ``url`` via url_set_query_param.

    The entries are applied one after another in the dict's iteration order,
    each one overwriting an existing parameter of the same name.
    """
    for name in params:
        url = url_set_query_param(url, name, params[name])
    return url
|  |  | ||||||
|  |  | ||||||
def str_path(path: PurePath) -> str:
    """
    Join the parts of ``path`` with "/", or return "." for a path without parts.
    """
    return "/".join(path.parts) or "."
|  |  | ||||||
|  |  | ||||||
def fmt_path(path: PurePath) -> str:
    """
    Return the ``repr`` of the string produced by str_path for ``path``.
    """
    rendered = str_path(path)
    return f"{rendered!r}"
|  |  | ||||||
|  |  | ||||||
def fmt_real_path(path: Path) -> str:
    """
    Return the ``repr`` of the absolute form of ``path`` as a string.
    """
    absolute = path.absolute()
    return f"{str(absolute)!r}"
|  |  | ||||||
|  |  | ||||||
class ReusableAsyncContextManager(ABC, Generic[T]):
    """
    Base class for async context managers that can be entered repeatedly, but
    never nested or concurrently.

    Subclasses implement ``_on_aenter``; its return value is what
    ``__aenter__`` returns. An AsyncExitStack (``self._stack``) is entered
    before ``_on_aenter`` runs and is unwound by ``__aexit__``.
    """

    def __init__(self) -> None:
        # True between a successful activation in __aenter__ and the end of __aexit__
        self._active = False
        self._stack = AsyncExitStack()

    @abstractmethod
    async def _on_aenter(self) -> T:
        """
        Produce the value returned by ``__aenter__``.
        """
        pass

    async def __aenter__(self) -> T:
        """
        Enter the exit stack and return the result of ``_on_aenter``.

        Raises a RuntimeError if the manager is already active. If
        ``_on_aenter`` fails, the exit stack is unwound and the exception is
        re-raised.
        """
        if self._active:
            raise RuntimeError("Nested or otherwise concurrent usage is not allowed")

        self._active = True
        await self._stack.__aenter__()

        # See https://stackoverflow.com/a/13075071
        try:
            return await self._on_aenter()
        except BaseException:
            # Always re-raise, even if something on the stack wants to suppress
            # the exception: there is no value that could be returned instead.
            await self.__aexit__(*sys.exc_info())
            raise

    async def __aexit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc_value: Optional[BaseException],
            traceback: Optional[TracebackType],
    ) -> Optional[bool]:
        """
        Unwind the exit stack and mark the manager as inactive again.

        Returns whatever the exit stack's ``__aexit__`` returns. Raises a
        RuntimeError if the manager is not active.
        """
        if not self._active:
            raise RuntimeError("__aexit__ called too many times")

        try:
            return await self._stack.__aexit__(exc_type, exc_value, traceback)
        finally:
            # Reset even if unwinding raised, otherwise the manager could
            # never be entered again.
            self._active = False
|   | |||||||
							
								
								
									
										2
									
								
								PFERD/version.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								PFERD/version.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | |||||||
|  | NAME = "PFERD" | ||||||
|  | VERSION = "3.8.2" | ||||||
							
								
								
									
										361
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										361
									
								
								README.md
									
									
									
									
									
								
							| @@ -2,254 +2,157 @@ | |||||||
|  |  | ||||||
| **P**rogramm zum **F**lotten, **E**infachen **R**unterladen von **D**ateien | **P**rogramm zum **F**lotten, **E**infachen **R**unterladen von **D**ateien | ||||||
|  |  | ||||||
| - [Quickstart with `sync_url`](#quickstart-with-sync_url) | Other resources: | ||||||
| - [Installation](#installation) |  | ||||||
|     - [Upgrading from 2.0.0 to 2.1.0+](#upgrading-from-200-to-210) |  | ||||||
| - [Example setup](#example-setup) |  | ||||||
| - [Usage](#usage) |  | ||||||
|     - [General concepts](#general-concepts) |  | ||||||
|     - [Constructing transforms](#constructing-transforms) |  | ||||||
|         - [Transform creators](#transform-creators) |  | ||||||
|         - [Transform combinators](#transform-combinators) |  | ||||||
|     - [A short, but commented example](#a-short-but-commented-example) |  | ||||||
|  |  | ||||||
| ## Quickstart with `sync_url` | - [Config file format](CONFIG.md) | ||||||
|  | - [Changelog](CHANGELOG.md) | ||||||
| The `sync_url` program allows you to just synchronize a given ILIAS URL (of a | - [Development Guide](DEV.md) | ||||||
| course, a folder, your personal desktop, etc.) without any extra configuration |  | ||||||
| or setting up. Download the program, open ILIAS, copy the URL from the address |  | ||||||
| bar and pass it to sync_url. |  | ||||||
|  |  | ||||||
| It bundles everything it needs in one executable and is easy to |  | ||||||
| use, but doesn't expose all the configuration options and tweaks a full install |  | ||||||
| does. |  | ||||||
|  |  | ||||||
| 1. Download the `sync_url` binary from the [latest release](https://github.com/Garmelon/PFERD/releases/latest). |  | ||||||
| 2. Recognize that you most likely need to enclose the URL in `""` quotes to prevent your shell from interpreting `&` and other symbols |  | ||||||
| 3. Run the binary in your terminal (`./sync_url` or `sync_url.exe` in the CMD) to see the help and use it. I'd recommend using the `--cookies` option.   |  | ||||||
|   If you are on **Linux/Mac**, you need to *make the file executable* using `chmod +x <file>`.   |  | ||||||
|   If you are on **Mac**, you need to allow this unverified program to run (see e.g. [here](https://www.switchingtomac.com/tutorials/osx/how-to-run-unverified-apps-on-macos/)) |  | ||||||
|  |  | ||||||
| ## Installation | ## Installation | ||||||
|  |  | ||||||
| Ensure that you have at least Python 3.8 installed. | ### Direct download | ||||||
|  |  | ||||||
|  | Binaries for Linux, Windows and Mac can be downloaded directly from the | ||||||
|  | [latest release](https://github.com/Garmelon/PFERD/releases/latest). | ||||||
|  |  | ||||||
|  | ### With pip | ||||||
|  |  | ||||||
|  | Ensure you have at least Python 3.11 installed. Run the following command to | ||||||
|  | install PFERD or upgrade it to the latest version: | ||||||
|  |  | ||||||
| To install PFERD or update your installation to the latest version, run this |  | ||||||
| wherever you want to install or have already installed PFERD: |  | ||||||
| ``` | ``` | ||||||
| $ pip install git+https://github.com/Garmelon/PFERD@v2.4.5 | $ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| The use of [venv] is recommended. | The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. | ||||||
|  |  | ||||||
| [venv]: https://docs.python.org/3/library/venv.html | ### With package managers | ||||||
|  |  | ||||||
| ### Upgrading from 2.0.0 to 2.1.0+ | Unofficial packages are available for: | ||||||
|  | - [AUR](https://aur.archlinux.org/packages/pferd) | ||||||
|  | - [brew](https://formulae.brew.sh/formula/pferd) | ||||||
|  | - [conda-forge](https://github.com/conda-forge/pferd-feedstock) | ||||||
|  | - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) | ||||||
|  | - [PyPI](https://pypi.org/project/pferd) | ||||||
|  |  | ||||||
| - The `IliasDirectoryType` type was renamed to `IliasElementType` and is now far more detailed. | See also PFERD's [repology page](https://repology.org/project/pferd/versions). | ||||||
|   The new values are: `REGULAR_FOLDER`, `VIDEO_FOLDER`, `EXERCISE_FOLDER`, `REGULAR_FILE`, `VIDEO_FILE`, `FORUM`, `EXTERNAL_LINK`. |  | ||||||
| - Forums and external links are skipped automatically if you use the `kit_ilias` helper. |  | ||||||
|  |  | ||||||
| ## Example setup | ## Basic usage | ||||||
|  |  | ||||||
| In this example, `python3` refers to at least Python 3.8. | PFERD can be run directly from the command line with no config file. Run `pferd | ||||||
|  | -h` to get an overview of available commands and options. Run `pferd <command> | ||||||
|  | -h` to see which options a command has. | ||||||
|  |  | ||||||
|  | For example, you can download your personal desktop from the KIT ILIAS like | ||||||
|  | this: | ||||||
|  |  | ||||||
| A full example setup and initial use could look like: |  | ||||||
| ``` | ``` | ||||||
| $ mkdir Vorlesungen | $ pferd kit-ilias-web desktop <output_directory> | ||||||
| $ cd Vorlesungen |  | ||||||
| $ python3 -m venv .venv |  | ||||||
| $ .venv/bin/activate |  | ||||||
| $ pip install git+https://github.com/Garmelon/PFERD@v2.4.5 |  | ||||||
| $ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.5/example_config.py |  | ||||||
| $ python3 example_config.py |  | ||||||
| $ deactivate |  | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| Subsequent runs of the program might look like: | Also, you can download most ILIAS pages directly like this: | ||||||
|  |  | ||||||
| ``` | ``` | ||||||
| $ cd Vorlesungen | $ pferd kit-ilias-web <url> <output_directory> | ||||||
| $ .venv/bin/activate |  | ||||||
| $ python3 example_config.py |  | ||||||
| $ deactivate |  | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| If you just want to get started and crawl *your entire ILIAS Desktop* instead | PFERD supports other ILIAS instances as well, using the `ilias-web` crawler (see | ||||||
| of a given set of courses, please replace `example_config.py` with | the [config section on `ilias-web`](CONFIG.md#the-ilias-web-crawler) for more | ||||||
| `example_config_personal_desktop.py` in all of the instructions below (`curl` call and | detail on the `base-url` and `client-id` parameters): | ||||||
| `python3` run command). |  | ||||||
|  |  | ||||||
| ## Usage | ``` | ||||||
|  | $ pferd ilias-web \ | ||||||
| ### General concepts |     --base-url https://ilias.my-university.example \ | ||||||
|  |     --client-id My_University desktop \ | ||||||
| A PFERD config is a normal python file that starts multiple *synchronizers* |     <output_directory> | ||||||
| which do all the heavy lifting. While you can create and wire them up manually, | ``` | ||||||
| you are encouraged to use the helper methods provided in `PFERD.Pferd`. |  | ||||||
|  | However, the CLI only lets you download a single thing at a time, and the | ||||||
| The synchronizers take some input arguments specific to their service and a | resulting command can grow long quite quickly. Because of this, PFERD can also | ||||||
| *transform*. The transform receives the computed path of an element in ILIAS and | be used with a config file. | ||||||
| can return either an output path (so you can rename files or move them around as |  | ||||||
| you wish) or `None` if you do not want to save the given file. | To get started, just take a command you've been using and add `--dump-config` | ||||||
|  | directly after `pferd`, like this: | ||||||
| Additionally the ILIAS synchronizer allows you to define a *crawl filter*. This |  | ||||||
| filter also receives the computed path as the input, but is only called for | ``` | ||||||
| *directories*. If you return `True`, the directory will be crawled and | $ pferd --dump-config kit-ilias-web <url> <output_directory> | ||||||
| searched. If you return `False` the directory will be ignored and nothing in it | ``` | ||||||
| will be passed to the transform. |  | ||||||
|  | This will make PFERD write its current configuration to its default config file | ||||||
| ### Constructing transforms | path. You can then run `pferd` without a command and it will execute the config | ||||||
|  | file. Alternatively, you can use `--dump-config-to` and specify a path yourself. | ||||||
| While transforms are just normal python functions, writing them by hand can | Using `--dump-config-to -` will print the configuration to stdout instead of a | ||||||
| quickly become tedious. In order to help you with writing your own transforms | file, which is a good way to see what is actually going on when using a CLI | ||||||
| and filters, PFERD defines a few useful transform creators and combinators in | command. | ||||||
| the `PFERD.transform` module: |  | ||||||
|  | Another good way to see what PFERD is doing is the `--explain` option. When | ||||||
| #### Transform creators | enabled, PFERD explains in detail what it is doing and why. This can help with | ||||||
|  | debugging your own config. | ||||||
| These methods let you create a few basic transform building blocks: |  | ||||||
|  | If you don't want to run all crawlers from your config file, you can specify the | ||||||
| - **`glob(glob)`**   | crawlers you want to run with `--crawler` or `-C`, like this: | ||||||
|   Creates a transform that returns the unchanged path if the glob matches the path and `None` otherwise. |  | ||||||
|   See also [Path.match].   | ``` | ||||||
|   Example: `glob("Übung/*.pdf")` | $ pferd -C crawler1 -C crawler2 | ||||||
| - **`predicate(pred)`**   | ``` | ||||||
|   Creates a transform that returns the unchanged path if `pred(path)` returns a truthy value. |  | ||||||
|   Returns `None` otherwise.   | ## Advanced usage | ||||||
|   Example: `predicate(lambda path: len(path.parts) == 3)` |  | ||||||
| - **`move_dir(source, target)`**   | PFERD supports lots of different options. For example, you can configure PFERD | ||||||
|   Creates a transform that moves all files from the `source` to the `target` directory.   | to [use your system's keyring](CONFIG.md#the-keyring-authenticator) instead of | ||||||
|   Example: `move_dir("Übung/", "Blätter/")` | prompting you for your username and password. PFERD also supports | ||||||
| - **`move(source, target)`**   | [transformation rules](CONFIG.md#transformation-rules) that let you rename or | ||||||
|   Creates a transform that moves the `source` file to `target`.   | exclude certain files. | ||||||
|   Example: `move("Vorlesung/VL02_Automten.pdf", "Vorlesung/VL02_Automaten.pdf")` |  | ||||||
| - **`rename(source, target)`**   | For more details, see the comprehensive [config format documentation](CONFIG.md). | ||||||
|   Creates a transform that renames all files named `source` to `target`. |  | ||||||
|   This transform works on the file names, not paths, and thus works no matter where the file is located.   | ## Example | ||||||
|   Example: `rename("VL02_Automten.pdf", "VL02_Automaten.pdf")` |  | ||||||
| - **`re_move(regex, target)`**   | This example downloads a few courses from the KIT ILIAS with a common keyring | ||||||
|   Creates a transform that moves all files matching `regex` to `target`. | authenticator. It reorganizes and ignores some files. | ||||||
|   The transform calls `str.format` on the `target` string with the contents of the capturing groups before returning it. |  | ||||||
|   The capturing groups can be accessed via their index. | ```ini | ||||||
|   See also [Match.group].   | [DEFAULT] | ||||||
|   Example: `re_move(r"Übung/Blatt (\d+)\.pdf", "Blätter/Blatt_{1:0>2}.pdf")` | # All paths will be relative to this. | ||||||
| - **`re_rename(regex, target)`**   | # The crawler output directories will be <working_dir>/Foo and <working_dir>/Bar. | ||||||
|   Creates a transform that renames all files matching `regex` to `target`. | working_dir = ~/stud | ||||||
|   This transform works on the file names, not paths, and thus works no matter where the file is located.   | # If files vanish from ILIAS the local files are not deleted, allowing us to | ||||||
|   Example: `re_rename(r"VL(\d+)(.*)\.pdf", "Vorlesung_Nr_{1}__{2}.pdf")` | # take a look at them before deleting them ourselves. | ||||||
|  | on_conflict = no-delete | ||||||
| All movement or rename transforms above return `None` if a file doesn't match |  | ||||||
| their movement or renaming criteria. This enables them to be used as building | [auth:ilias] | ||||||
| blocks to build up more complex transforms. | type = keyring | ||||||
|  | username = foo | ||||||
| In addition, `PFERD.transform` also defines the `keep` transform which returns its input path unchanged. |  | ||||||
| This behaviour can be very useful when creating more complex transforms. | [crawl:Foo] | ||||||
| See below for example usage. | type = kit-ilias-web | ||||||
|  | auth = auth:ilias | ||||||
| [Path.match]: https://docs.python.org/3/library/pathlib.html#pathlib.Path.match | # Crawl a course by its ID (found as `ref_id=ID` in the URL) | ||||||
| [Match.group]: https://docs.python.org/3/library/re.html#re.Match.group | target = 1234567 | ||||||
|  |  | ||||||
| #### Transform combinators | # Plaintext files are easier to read by other tools | ||||||
|  | links = plaintext | ||||||
| These methods let you combine transforms into more complex transforms: |  | ||||||
|  | transform = | ||||||
| - **`optionally(transform)`**   |   # Ignore unneeded folders | ||||||
|   Wraps a given transform and returns its result if it is not `None`. |   Online-Tests --> ! | ||||||
|   Otherwise returns the input path unchanged. |   Vorlesungswerbung --> ! | ||||||
|   See below for example usage. |  | ||||||
| * **`do(transforms)`**   |   # Rename folders | ||||||
|   Accepts a series of transforms and applies them in the given order to the result of the previous one. |   Lehrbücher --> Vorlesung | ||||||
|   If any transform returns `None`, `do` short-circuits and also returns `None`. |   # Note the ">>" arrow head which lets us apply further rules to files moved to "Übung" | ||||||
|   This can be used to perform multiple renames in a row: |   Übungsunterlagen -->> Übung | ||||||
|   ```py |  | ||||||
|   do( |   # Move exercises to own folder. Rename them to "Blatt-XX.pdf" to make them sort properly | ||||||
|       # Move them |   "Übung/(\d+). Übungsblatt.pdf" -re-> Blätter/Blatt-{i1:02}.pdf | ||||||
|       move_dir("Vorlesungsmaterial/Vorlesungsvideos/", "Vorlesung/Videos/"), |   # Move solutions to own folder. Rename them to "Blatt-XX-Lösung.pdf" to make them sort properly | ||||||
|       # Fix extensions (if they have any) |   "Übung/(\d+). Übungsblatt.*Musterlösung.pdf" -re-> Blätter/Blatt-{i1:02}-Lösung.pdf | ||||||
|       optionally(re_rename("(.*).m4v.mp4", "{1}.mp4")), |  | ||||||
|       # Remove the 'dbs' prefix (if they have any) |   # The course has nested folders with the same name - flatten them | ||||||
|       optionally(re_rename("(?i)dbs-(.+)", "{1}")), |   "Übung/(.+?)/\\1" -re-> Übung/{g1} | ||||||
|   ) |  | ||||||
|   ``` | [crawl:Bar] | ||||||
| - **`attempt(transforms)`**   | type = kit-ilias-web | ||||||
|   Applies the passed transforms in the given order until it finds one that does not return `None`. | auth = auth:ilias | ||||||
|   If it does not find any, it returns `None`. | target = 1337420 | ||||||
|   This can be used to give a list of possible transformations and automatically pick the first one that fits: |  | ||||||
|   ```py |  | ||||||
|   attempt( |  | ||||||
|       # Move all videos. If a video is passed in, this `re_move` will succeed |  | ||||||
|       # and attempt short-circuits with the result. |  | ||||||
|       re_move(r"Vorlesungsmaterial/.*/(.+?)\.mp4", "Vorlesung/Videos/{1}.mp4"), |  | ||||||
|       # Move the whole folder to a nicer name - now without any mp4! |  | ||||||
|       move_dir("Vorlesungsmaterial/", "Vorlesung/"), |  | ||||||
|       # If we got another file, keep it. |  | ||||||
|       keep, |  | ||||||
|   ) |  | ||||||
|   ``` |  | ||||||
|  |  | ||||||
| All of these combinators are used in the provided example configs, if you want |  | ||||||
| to see some more real-life usages. |  | ||||||
|  |  | ||||||
| ### A short, but commented example |  | ||||||
|  |  | ||||||
| ```py |  | ||||||
| from pathlib import Path, PurePath |  | ||||||
| from PFERD import Pferd |  | ||||||
| from PFERD.ilias import IliasElementType |  | ||||||
| from PFERD.transform import * |  | ||||||
|  |  | ||||||
| # This filter will later be used by the ILIAS crawler to decide whether it |  | ||||||
| # should crawl a directory (or directory-like structure). |  | ||||||
| def filter_course(path: PurePath, type: IliasElementType) -> bool: |  | ||||||
|     # Note that glob returns a Transform, which is a function from PurePath -> |  | ||||||
|     # Optional[PurePath]. Because of this, we need to apply the result of |  | ||||||
|     # 'glob' to our input path. The returned value will be truthy (a Path) if |  | ||||||
|     # the transform succeeded, or `None` if it failed. |  | ||||||
|  |  | ||||||
|     # We need to crawl the 'Tutorien' folder as it contains one that we want. |  | ||||||
|     if glob("Tutorien/")(path): |  | ||||||
|         return True |  | ||||||
|     # If we found 'Tutorium 10', keep it! |  | ||||||
|     if glob("Tutorien/Tutorium 10")(path): |  | ||||||
|         return True |  | ||||||
|     # Discard all other folders inside 'Tutorien' |  | ||||||
|     if glob("Tutorien/*")(path): |  | ||||||
|         return False |  | ||||||
|  |  | ||||||
|     # All other dirs (including subdirs of 'Tutorium 10') should be searched :) |  | ||||||
|     return True |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # This transform will later be used to rename a few files. It can also be used |  | ||||||
| # to ignore some files. |  | ||||||
| transform_course = attempt( |  | ||||||
|     # We don't care about the other tuts and would instead prefer a cleaner |  | ||||||
|     # directory structure. |  | ||||||
|     move_dir("Tutorien/Tutorium 10/", "Tutorium/"), |  | ||||||
|     # We don't want to modify any other files, so we're going to keep them |  | ||||||
|     # exactly as they are. |  | ||||||
|     keep |  | ||||||
| ) |  | ||||||
|  |  | ||||||
| # Enable and configure the text output. Needs to be called before calling any |  | ||||||
| # other PFERD methods. |  | ||||||
| Pferd.enable_logging() |  | ||||||
| # Create a Pferd instance rooted in the same directory as the script file. This |  | ||||||
| # is not a test run, so files will be downloaded (default, can be omitted). |  | ||||||
| pferd = Pferd(Path(__file__).parent, test_run=False) |  | ||||||
|  |  | ||||||
| # Use the ilias_kit helper to synchronize an ILIAS course |  | ||||||
| pferd.ilias_kit( |  | ||||||
|     # The directory that all of the downloaded files should be placed in |  | ||||||
|     "My_cool_course/", |  | ||||||
|     # The course ID (found in the URL when on the course page in ILIAS) |  | ||||||
|     "course id", |  | ||||||
|     # A path to a cookie jar. If you synchronize multiple ILIAS courses, |  | ||||||
|     # setting this to a common value requires you to only log in once. |  | ||||||
|     cookies=Path("ilias_cookies.txt"), |  | ||||||
|     # A transform can rename, move or filter out certain files |  | ||||||
|     transform=transform_course, |  | ||||||
|     # A crawl filter limits what paths the crawler searches |  | ||||||
|     dir_filter=filter_course, |  | ||||||
| ) |  | ||||||
| ``` | ``` | ||||||
|   | |||||||
| @@ -1,131 +0,0 @@ | |||||||
| import argparse |  | ||||||
| from pathlib import Path, PurePath |  | ||||||
|  |  | ||||||
| from PFERD import Pferd |  | ||||||
| from PFERD.ilias import IliasElementType |  | ||||||
| from PFERD.transform import (attempt, do, glob, keep, move, move_dir, |  | ||||||
|                              optionally, re_move, re_rename) |  | ||||||
|  |  | ||||||
| tf_ss_2020_numerik = attempt( |  | ||||||
|     re_move(r"Übungsblätter/(\d+)\. Übungsblatt/.*", "Blätter/Blatt_{1:0>2}.pdf"), |  | ||||||
|     keep, |  | ||||||
| ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| tf_ss_2020_db = attempt( |  | ||||||
|     move_dir("Begrüßungsvideo/", "Vorlesung/Videos/"), |  | ||||||
|     do( |  | ||||||
|         move_dir("Vorlesungsmaterial/Vorlesungsvideos/", "Vorlesung/Videos/"), |  | ||||||
|         optionally(re_rename("(.*).m4v.mp4", "{1}.mp4")), |  | ||||||
|         optionally(re_rename("(?i)dbs-(.+)", "{1}")), |  | ||||||
|     ), |  | ||||||
|     move_dir("Vorlesungsmaterial/", "Vorlesung/"), |  | ||||||
|     keep, |  | ||||||
| ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| tf_ss_2020_rechnernetze = attempt( |  | ||||||
|     re_move(r"Vorlesungsmaterial/.*/(.+?)\.mp4", "Vorlesung/Videos/{1}.mp4"), |  | ||||||
|     move_dir("Vorlesungsmaterial/", "Vorlesung/"), |  | ||||||
|     keep, |  | ||||||
| ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| tf_ss_2020_sicherheit = attempt( |  | ||||||
|     move_dir("Vorlesungsvideos/", "Vorlesung/Videos/"), |  | ||||||
|     move_dir("Übungsvideos/", "Übung/Videos/"), |  | ||||||
|     re_move(r"VL(.*)\.pdf", "Vorlesung/{1}.pdf"), |  | ||||||
|     re_move(r"Übungsblatt (\d+)\.pdf", "Blätter/Blatt_{1:0>2}.pdf"), |  | ||||||
|     move("Chiffrat.txt", "Blätter/Blatt_01_Chiffrat.txt"), |  | ||||||
|     keep, |  | ||||||
| ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| tf_ss_2020_pg = attempt( |  | ||||||
|     move_dir("Vorlesungsaufzeichnungen/", "Vorlesung/Videos/"), |  | ||||||
|     move_dir("Vorlesungsmaterial/", "Vorlesung/"), |  | ||||||
|     re_move(r"Übungen/uebungsblatt(\d+).pdf", "Blätter/Blatt_{1:0>2}.pdf"), |  | ||||||
|     keep, |  | ||||||
| ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def df_ss_2020_or1(path: PurePath, _type: IliasElementType) -> bool: |  | ||||||
|     if glob("Tutorien/")(path): |  | ||||||
|         return True |  | ||||||
|     if glob("Tutorien/Tutorium 10, dienstags 15:45 Uhr/")(path): |  | ||||||
|         return True |  | ||||||
|     if glob("Tutorien/*")(path): |  | ||||||
|         return False |  | ||||||
|     return True |  | ||||||
|  |  | ||||||
|  |  | ||||||
| tf_ss_2020_or1 = attempt( |  | ||||||
|     move_dir("Vorlesung/Unbeschriebene Folien/", "Vorlesung/Folien/"), |  | ||||||
|     move_dir("Video zur Organisation/", "Vorlesung/Videos/"), |  | ||||||
|     keep, |  | ||||||
| ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def main() -> None: |  | ||||||
|     parser = argparse.ArgumentParser() |  | ||||||
|     parser.add_argument("--test-run", action="store_true") |  | ||||||
|     parser.add_argument("synchronizers", nargs="*") |  | ||||||
|     args = parser.parse_args() |  | ||||||
|  |  | ||||||
|     pferd = Pferd(Path(__file__).parent, test_run=args.test_run) |  | ||||||
|     pferd.enable_logging() |  | ||||||
|  |  | ||||||
|     if not args.synchronizers or "numerik" in args.synchronizers: |  | ||||||
|         pferd.ilias_kit( |  | ||||||
|             target="Numerik", |  | ||||||
|             course_id="1083036", |  | ||||||
|             transform=tf_ss_2020_numerik, |  | ||||||
|             cookies="ilias_cookies.txt", |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     if not args.synchronizers or "db" in args.synchronizers: |  | ||||||
|         pferd.ilias_kit( |  | ||||||
|             target="DB", |  | ||||||
|             course_id="1101554", |  | ||||||
|             transform=tf_ss_2020_db, |  | ||||||
|             cookies="ilias_cookies.txt", |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     if not args.synchronizers or "rechnernetze" in args.synchronizers: |  | ||||||
|         pferd.ilias_kit( |  | ||||||
|             target="Rechnernetze", |  | ||||||
|             course_id="1099996", |  | ||||||
|             transform=tf_ss_2020_rechnernetze, |  | ||||||
|             cookies="ilias_cookies.txt", |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     if not args.synchronizers or "sicherheit" in args.synchronizers: |  | ||||||
|         pferd.ilias_kit( |  | ||||||
|             target="Sicherheit", |  | ||||||
|             course_id="1101980", |  | ||||||
|             transform=tf_ss_2020_sicherheit, |  | ||||||
|             cookies="ilias_cookies.txt", |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     if not args.synchronizers or "pg" in args.synchronizers: |  | ||||||
|         pferd.ilias_kit( |  | ||||||
|             target="PG", |  | ||||||
|             course_id="1106095", |  | ||||||
|             transform=tf_ss_2020_pg, |  | ||||||
|             cookies="ilias_cookies.txt", |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     if not args.synchronizers or "or1" in args.synchronizers: |  | ||||||
|         pferd.ilias_kit( |  | ||||||
|             target="OR1", |  | ||||||
|             course_id="1105941", |  | ||||||
|             dir_filter=df_ss_2020_or1, |  | ||||||
|             transform=tf_ss_2020_or1, |  | ||||||
|             cookies="ilias_cookies.txt", |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     # Prints a summary listing all new, modified or deleted files |  | ||||||
|     pferd.print_summary() |  | ||||||
|  |  | ||||||
| if __name__ == "__main__": |  | ||||||
|     main() |  | ||||||
| @@ -1,38 +0,0 @@ | |||||||
| """ |  | ||||||
| This is a small config that just crawls the ILIAS Personal Desktop. |  | ||||||
| It does not filter or rename anything, it just gobbles up everything it can find. |  | ||||||
|  |  | ||||||
| Note that this still includes a test-run switch, so you can see what it *would* download. |  | ||||||
| You can enable that with the "--test-run" command line switch, |  | ||||||
| i.e. "python3 example_config_minimal.py --test-run". |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import argparse |  | ||||||
| from pathlib import Path |  | ||||||
|  |  | ||||||
| from PFERD import Pferd |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def main() -> None: |  | ||||||
|     # Parse command line arguments |  | ||||||
|     parser = argparse.ArgumentParser() |  | ||||||
|     parser.add_argument("--test-run", action="store_true") |  | ||||||
|     args = parser.parse_args() |  | ||||||
|  |  | ||||||
|     # Create the Pferd helper instance |  | ||||||
|     pferd = Pferd(Path(__file__).parent, test_run=args.test_run) |  | ||||||
|     pferd.enable_logging() |  | ||||||
|  |  | ||||||
|     # Synchronize the personal desktop into the "ILIAS" directory. |  | ||||||
|     # It saves the cookies, so you only need to log in again when the ILIAS cookies expire. |  | ||||||
|     pferd.ilias_kit_personal_desktop( |  | ||||||
|         "ILIAS", |  | ||||||
|         cookies="ilias_cookies.txt", |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     # Prints a summary listing all new, modified or deleted files |  | ||||||
|     pferd.print_summary() |  | ||||||
|  |  | ||||||
|  |  | ||||||
| if __name__ == "__main__": |  | ||||||
|     main() |  | ||||||
							
								
								
									
										27
									
								
								flake.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								flake.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | |||||||
|  | { | ||||||
|  |   "nodes": { | ||||||
|  |     "nixpkgs": { | ||||||
|  |       "locked": { | ||||||
|  |         "lastModified": 1744440957, | ||||||
|  |         "narHash": "sha256-FHlSkNqFmPxPJvy+6fNLaNeWnF1lZSgqVCl/eWaJRc4=", | ||||||
|  |         "owner": "NixOS", | ||||||
|  |         "repo": "nixpkgs", | ||||||
|  |         "rev": "26d499fc9f1d567283d5d56fcf367edd815dba1d", | ||||||
|  |         "type": "github" | ||||||
|  |       }, | ||||||
|  |       "original": { | ||||||
|  |         "owner": "NixOS", | ||||||
|  |         "ref": "nixos-24.11", | ||||||
|  |         "repo": "nixpkgs", | ||||||
|  |         "type": "github" | ||||||
|  |       } | ||||||
|  |     }, | ||||||
|  |     "root": { | ||||||
|  |       "inputs": { | ||||||
|  |         "nixpkgs": "nixpkgs" | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   }, | ||||||
|  |   "root": "root", | ||||||
|  |   "version": 7 | ||||||
|  | } | ||||||
							
								
								
									
										41
									
								
								flake.nix
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								flake.nix
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,41 @@ | |||||||
|  | { | ||||||
|  |   description = "Tool for downloading course-related files from ILIAS"; | ||||||
|  |  | ||||||
|  |   inputs = { | ||||||
|  |     nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11"; | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   outputs = { self, nixpkgs }: | ||||||
|  |     let | ||||||
|  |       # Helper function to generate an attrset '{ x86_64-linux = f "x86_64-linux"; ... }'. | ||||||
|  |       forAllSystems = nixpkgs.lib.genAttrs nixpkgs.lib.systems.flakeExposed; | ||||||
|  |     in | ||||||
|  |     { | ||||||
|  |       packages = forAllSystems (system: | ||||||
|  |         let pkgs = import nixpkgs { inherit system; }; | ||||||
|  |         in | ||||||
|  |         rec { | ||||||
|  |           default = pkgs.python3Packages.buildPythonApplication rec { | ||||||
|  |             pname = "pferd"; | ||||||
|  |             # Performing black magic | ||||||
|  |             # Don't worry, I sacrificed enough goats for the next few years | ||||||
|  |             version = (pkgs.lib.importTOML ./PFERD/version.py).VERSION; | ||||||
|  |             format = "pyproject"; | ||||||
|  |  | ||||||
|  |             src = ./.; | ||||||
|  |  | ||||||
|  |             nativeBuildInputs = with pkgs.python3Packages; [ | ||||||
|  |               setuptools | ||||||
|  |             ]; | ||||||
|  |  | ||||||
|  |             propagatedBuildInputs = with pkgs.python3Packages; [ | ||||||
|  |               aiohttp | ||||||
|  |               beautifulsoup4 | ||||||
|  |               rich | ||||||
|  |               keyring | ||||||
|  |               certifi | ||||||
|  |             ]; | ||||||
|  |           }; | ||||||
|  |         }); | ||||||
|  |     }; | ||||||
|  | } | ||||||
							
								
								
									
										7
									
								
								mypy.ini
									
									
									
									
									
								
							
							
						
						
									
										7
									
								
								mypy.ini
									
									
									
									
									
								
							| @@ -1,7 +0,0 @@ | |||||||
| [mypy] |  | ||||||
| disallow_untyped_defs = True |  | ||||||
| disallow_incomplete_defs = True |  | ||||||
| no_implicit_optional = True |  | ||||||
|  |  | ||||||
| [mypy-rich.*,bs4] |  | ||||||
| ignore_missing_imports = True |  | ||||||
							
								
								
									
										6
									
								
								pferd.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								pferd.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,6 @@ | |||||||
|  | # File used by pyinstaller to create the executable | ||||||
|  |  | ||||||
|  | from PFERD.__main__ import main | ||||||
|  |  | ||||||
|  | if __name__ == "__main__": | ||||||
|  |     main() | ||||||
							
								
								
									
										42
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,42 @@ | |||||||
|  | [build-system] | ||||||
|  | requires = ["setuptools", "wheel"] | ||||||
|  | build-backend = "setuptools.build_meta" | ||||||
|  |  | ||||||
|  | [project] | ||||||
|  | name = "PFERD" | ||||||
|  | dependencies = [ | ||||||
|  |   "aiohttp>=3.8.1", | ||||||
|  |   "beautifulsoup4>=4.10.0", | ||||||
|  |   "rich>=11.0.0", | ||||||
|  |   "keyring>=23.5.0", | ||||||
|  |   "certifi>=2021.10.8" | ||||||
|  | ] | ||||||
|  | dynamic = ["version"] | ||||||
|  | requires-python = ">=3.11" | ||||||
|  |  | ||||||
|  | [project.scripts] | ||||||
|  | pferd = "PFERD.__main__:main" | ||||||
|  |  | ||||||
|  | [tool.setuptools.dynamic] | ||||||
|  | version = {attr = "PFERD.version.VERSION"} | ||||||
|  |  | ||||||
|  | [tool.flake8] | ||||||
|  | max-line-length = 110 | ||||||
|  |  | ||||||
|  | [tool.isort] | ||||||
|  | line_length = 110 | ||||||
|  |  | ||||||
|  | [tool.autopep8] | ||||||
|  | max_line_length = 110 | ||||||
|  | in-place = true | ||||||
|  | recursive = true | ||||||
|  |  | ||||||
|  | [tool.mypy] | ||||||
|  | disallow_any_generics = true | ||||||
|  | disallow_untyped_defs = true | ||||||
|  | disallow_incomplete_defs = true | ||||||
|  | no_implicit_optional = true | ||||||
|  | warn_unused_ignores = true | ||||||
|  | warn_unreachable = true | ||||||
|  | show_error_context = true | ||||||
|  | ignore_missing_imports = true | ||||||
							
								
								
									
										5
									
								
								scripts/build
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										5
									
								
								scripts/build
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,5 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  |  | ||||||
|  | set -e | ||||||
|  |  | ||||||
|  | pyinstaller --onefile pferd.py | ||||||
							
								
								
									
										111
									
								
								scripts/bump-version
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										111
									
								
								scripts/bump-version
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,111 @@ | |||||||
|  | #!/usr/bin/env python3 | ||||||
|  |  | ||||||
|  | import argparse | ||||||
|  | import re | ||||||
|  | import time | ||||||
|  | from subprocess import run | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def load_changelog(): | ||||||
|  |     with open("CHANGELOG.md") as f: | ||||||
|  |         return list(f) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def extract_changes(lines): | ||||||
|  |     lines = iter(lines) | ||||||
|  |     changes = [] | ||||||
|  |  | ||||||
|  |     # Find "Unreleased" section | ||||||
|  |     for line in lines: | ||||||
|  |         if line.strip() == "## Unreleased": | ||||||
|  |             break | ||||||
|  |     next(lines) | ||||||
|  |  | ||||||
|  |     # Read all lines from that section | ||||||
|  |     for line in lines: | ||||||
|  |         if line.startswith("## "): | ||||||
|  |             # Found the beginning of the next section | ||||||
|  |             break | ||||||
|  |         elif line.startswith("### "): | ||||||
|  |             # Found a heading in the current section | ||||||
|  |             # Remove "#" symbols so git doesn't interpret the line as a comment later | ||||||
|  |             changes.append(line[4:]) | ||||||
|  |         else: | ||||||
|  |             changes.append(line) | ||||||
|  |  | ||||||
|  |     # Remove trailing empty lines | ||||||
|  |     while changes and not changes[-1].strip(): | ||||||
|  |         changes.pop() | ||||||
|  |  | ||||||
|  |     return changes | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def update_version(version): | ||||||
|  |     with open("PFERD/version.py") as f: | ||||||
|  |         text = f.read() | ||||||
|  |  | ||||||
|  |     text = re.sub(r'VERSION = ".*"', f'VERSION = "{version}"', text) | ||||||
|  |  | ||||||
|  |     with open("PFERD/version.py", "w") as f: | ||||||
|  |         f.write(text) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def update_changelog(lines, version, date): | ||||||
|  |     lines = iter(lines) | ||||||
|  |     new_lines = [] | ||||||
|  |  | ||||||
|  |     # Find "Unreleased" section | ||||||
|  |     for line in lines: | ||||||
|  |         new_lines.append(line) | ||||||
|  |         if line.strip() == "## Unreleased": | ||||||
|  |             break | ||||||
|  |  | ||||||
|  |     # Add new heading below that | ||||||
|  |     new_lines.append("\n") | ||||||
|  |     new_lines.append(f"## {version} - {date}\n") | ||||||
|  |  | ||||||
|  |     # Add remaining lines | ||||||
|  |     for line in lines: | ||||||
|  |         new_lines.append(line) | ||||||
|  |  | ||||||
|  |     with open("CHANGELOG.md", "w") as f: | ||||||
|  |         f.write("".join(new_lines)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def commit_changes(version): | ||||||
|  |     run(["git", "add", "CHANGELOG.md", "PFERD/version.py"]) | ||||||
|  |     run(["git", "commit", "-m", f"Bump version to {version}"]) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def create_tag(version, annotation): | ||||||
|  |     run(["git", "tag", "-am", annotation, f"v{version}"]) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def fastforward_latest(): | ||||||
|  |     run(["git", "branch", "-f", "latest", "HEAD"]) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def main(): | ||||||
|  |     parser = argparse.ArgumentParser() | ||||||
|  |     parser.add_argument("version") | ||||||
|  |     args = parser.parse_args() | ||||||
|  |  | ||||||
|  |     version = args.version | ||||||
|  |     date = time.strftime("%Y-%m-%d") | ||||||
|  |     changelog = load_changelog() | ||||||
|  |     changes = extract_changes(changelog) | ||||||
|  |     annotation = f"Version {version} - {date}\n\n{''.join(changes)}" | ||||||
|  |  | ||||||
|  |     update_version(version) | ||||||
|  |     update_changelog(changelog, version, date) | ||||||
|  |     commit_changes(version) | ||||||
|  |     create_tag(version, annotation) | ||||||
|  |     fastforward_latest() | ||||||
|  |  | ||||||
|  |     print() | ||||||
|  |     print("Now the only thing left is to publish the changes:") | ||||||
|  |     print(f"  $ git push origin master latest v{version}") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | if __name__ == "__main__": | ||||||
|  |     main() | ||||||
							
								
								
									
										6
									
								
								scripts/check
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										6
									
								
								scripts/check
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,6 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  |  | ||||||
|  | set -e | ||||||
|  |  | ||||||
|  | mypy . | ||||||
|  | flake8 PFERD | ||||||
							
								
								
									
										6
									
								
								scripts/format
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										6
									
								
								scripts/format
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,6 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  |  | ||||||
|  | set -e | ||||||
|  |  | ||||||
|  | autopep8 . | ||||||
|  | isort . | ||||||
							
								
								
									
										17
									
								
								scripts/setup
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										17
									
								
								scripts/setup
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,17 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  |  | ||||||
|  | set -e | ||||||
|  |  | ||||||
|  | # Updating pip and setuptools because some older versions don't recognize the | ||||||
|  | # project setup correctly | ||||||
|  | if [[ $1 != '--no-pip' ]]; then | ||||||
|  |     pip install --upgrade pip | ||||||
|  | fi | ||||||
|  | pip install --upgrade setuptools | ||||||
|  |  | ||||||
|  | # Installing PFERD itself | ||||||
|  | pip install --editable . | ||||||
|  |  | ||||||
|  | # Installing tools and type hints | ||||||
|  | pip install --upgrade mypy flake8 flake8-pyproject autopep8 isort pyinstaller | ||||||
|  | pip install --upgrade types-chardet types-certifi | ||||||
							
								
								
									
										16
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								setup.py
									
									
									
									
									
								
							| @@ -1,16 +0,0 @@ | |||||||
| from setuptools import find_packages, setup |  | ||||||
|  |  | ||||||
| setup( |  | ||||||
|     name="PFERD", |  | ||||||
|     version="2.4.5", |  | ||||||
|     packages=find_packages(), |  | ||||||
|     install_requires=[ |  | ||||||
|         "requests>=2.21.0", |  | ||||||
|         "beautifulsoup4>=4.7.1", |  | ||||||
|         "rich>=2.1.0" |  | ||||||
|     ], |  | ||||||
| ) |  | ||||||
|  |  | ||||||
| # When updating the version, also: |  | ||||||
| # - update the README.md installation instructions |  | ||||||
| # - set a tag on the update commit |  | ||||||
							
								
								
									
										69
									
								
								sync_url.py
									
									
									
									
									
								
							
							
						
						
									
										69
									
								
								sync_url.py
									
									
									
									
									
								
							| @@ -1,69 +0,0 @@ | |||||||
| #!/usr/bin/env python |  | ||||||
|  |  | ||||||
| """ |  | ||||||
| A simple script to download a course from ILIAS, given the URL of its course page. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import argparse |  | ||||||
| from pathlib import Path |  | ||||||
| from urllib.parse import urlparse |  | ||||||
|  |  | ||||||
| from PFERD import Pferd |  | ||||||
| from PFERD.cookie_jar import CookieJar |  | ||||||
| from PFERD.ilias import (IliasCrawler, IliasElementType, |  | ||||||
|                          KitShibbolethAuthenticator) |  | ||||||
| from PFERD.transform import sanitize_windows_path |  | ||||||
| from PFERD.utils import to_path |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def main() -> None: |  | ||||||
|     parser = argparse.ArgumentParser() |  | ||||||
|     parser.add_argument("--test-run", action="store_true") |  | ||||||
|     parser.add_argument('-c', '--cookies', nargs='?', default=None, help="File to store cookies in") |  | ||||||
|     parser.add_argument('--no-videos', nargs='?', default=None, help="Don't download videos") |  | ||||||
|     parser.add_argument('url', help="URL to the course page") |  | ||||||
|     parser.add_argument('folder', nargs='?', default=None, help="Folder to put stuff into") |  | ||||||
|     args = parser.parse_args() |  | ||||||
|  |  | ||||||
|     url = urlparse(args.url) |  | ||||||
|  |  | ||||||
|     cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None) |  | ||||||
|     session = cookie_jar.create_session() |  | ||||||
|     authenticator = KitShibbolethAuthenticator() |  | ||||||
|     crawler = IliasCrawler(url.scheme + '://' + url.netloc, session, |  | ||||||
|                            authenticator, lambda x, y: True) |  | ||||||
|  |  | ||||||
|     cookie_jar.load_cookies() |  | ||||||
|  |  | ||||||
|     if args.folder is not None: |  | ||||||
|         folder = args.folder |  | ||||||
|         # Initialize pferd at the *parent of the passed folder* |  | ||||||
|         # This is needed so Pferd's internal protections against escaping the working directory |  | ||||||
|         # do not trigger (e.g. if somebody names a file in ILIAS '../../bad thing.txt') |  | ||||||
|         pferd = Pferd(Path(Path(__file__).parent, folder).parent, test_run=args.test_run) |  | ||||||
|     else: |  | ||||||
|         # fetch course name from ilias |  | ||||||
|         folder = crawler.find_element_name(args.url) |  | ||||||
|         cookie_jar.save_cookies() |  | ||||||
|  |  | ||||||
|         # Initialize pferd at the location of the script |  | ||||||
|         pferd = Pferd(Path(__file__).parent, test_run=args.test_run) |  | ||||||
|  |  | ||||||
|     def dir_filter(_: Path, element: IliasElementType) -> bool: |  | ||||||
|         if args.no_videos: |  | ||||||
|             return element not in [IliasElementType.VIDEO_FILE, IliasElementType.VIDEO_FOLDER] |  | ||||||
|         return True |  | ||||||
|  |  | ||||||
|     pferd.enable_logging() |  | ||||||
|     # fetch |  | ||||||
|     pferd.ilias_kit_folder( |  | ||||||
|         target=folder, |  | ||||||
|         full_url=args.url, |  | ||||||
|         cookies=args.cookies, |  | ||||||
|         dir_filter=dir_filter, |  | ||||||
|         transform=sanitize_windows_path |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| if __name__ == "__main__": |  | ||||||
|     main() |  | ||||||
		Reference in New Issue
	
	Block a user