mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-10-23 10:02:32 +02:00 
			
		
		
		
	Compare commits
	
		
			390 Commits
		
	
	
		
			v2.4.2
			...
			update-che
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | 2d145e7c94 | ||
|   | 5fdd40204b | ||
|   | fb4631ba18 | ||
|   | d72fc2760b | ||
|   | 4a51aaa4f5 | ||
|   | 66a5b1ba02 | ||
|   | aa5a3a10bc | ||
|   | d9b111cec2 | ||
|   | 345f52a1f6 | ||
|   | ed24366aba | ||
|   | 46fb782798 | ||
|   | 846c29aee1 | ||
|   | a5015fe9b1 | ||
|   | 616b0480f7 | ||
|   | 2f0e04ce13 | ||
|   | bcc537468c | ||
|   | 694ffb4d77 | ||
|   | af2cc1169a | ||
|   | bc3fa36637 | ||
|   | afbd03f777 | ||
|   | b8fe25c580 | ||
|   | a241672726 | ||
|   | a8f76e9be7 | ||
|   | b56475450d | ||
|   | aa74604d29 | ||
|   | d2e6d91880 | ||
|   | 602044ff1b | ||
|   | 31631fb409 | ||
|   | 00db348218 | ||
|   | a709280cbf | ||
|   | a99ddaa0cc | ||
|   | ba3d299c05 | ||
|   | 07a21f80a6 | ||
|   | f17b9b68f4 | ||
|   | a2831fbea2 | ||
|   | da72863b47 | ||
|   | 86e2e226dc | ||
|   | 7872fe5221 | ||
|   | 86947e4874 | ||
|   | 4f022e2d19 | ||
|   | f47e7374d2 | ||
|   | 57ec51e95a | ||
|   | 0045124a4e | ||
|   | 9618aae83b | ||
|   | 33453ede2d | ||
|   | e467b38d73 | ||
|   | e9d2d05030 | ||
|   | 4bf0c972e6 | ||
|   | 4ee919625d | ||
|   | d30f25ee97 | ||
|   | 10d9d74528 | ||
|   | 43c5453e10 | ||
|   | eb4de8ae0c | ||
|   | e32c1f000f | ||
|   | 5f527bc697 | ||
|   | ced8b9a2d0 | ||
|   | 6f3cfd4396 | ||
|   | 462d993fbc | ||
|   | a99356f2a2 | ||
|   | eac2e34161 | ||
|   | a82a0b19c2 | ||
|   | 90cb6e989b | ||
|   | 6289938d7c | ||
|   | 13b8c3d9c6 | ||
|   | 88afe64a92 | ||
|   | 6b2a657573 | ||
|   | d6f38a61e1 | ||
|   | ad3f4955f7 | ||
|   | e42ab83d32 | ||
|   | f9a3f9b9f2 | ||
|   | ef7d5ea2d3 | ||
|   | 55ea304ff3 | ||
|   | fee12b3d9e | ||
|   | 6673077397 | ||
|   | 742632ed8d | ||
|   | 544d45cbc5 | ||
|   | 86f79ff1f1 | ||
|   | ee67f9f472 | ||
|   | 8ec3f41251 | ||
|   | 89be07d4d3 | ||
|   | 91200f3684 | ||
|   | 9ffd603357 | ||
|   | 80eeb8fe97 | ||
|   | 75fde870c2 | ||
|   | 6e4d423c81 | ||
|   | 57aef26217 | ||
|   | 70ec64a48b | ||
|   | 70b33ecfd9 | ||
|   | 601e4b936b | ||
|   | a292c4c437 | ||
|   | bc65ea7ab6 | ||
|   | f28bbe6b0c | ||
|   | 61d902d715 | ||
|   | 8ab462fb87 | ||
|   | df3ad3d890 | ||
|   | fc31100a0f | ||
|   | 31b6311e99 | ||
|   | 1fc8e9eb7a | ||
|   | 85b9f45085 | ||
|   | f656e3ff34 | ||
|   | e1bda94329 | ||
|   | f6b26f4ead | ||
|   | 722970a255 | ||
|   | f40820c41f | ||
|   | 49ad1b6e46 | ||
|   | 1ce32d2f18 | ||
|   | 9d5ec84b91 | ||
|   | 1fba96abcb | ||
|   | 921cec7ddc | ||
|   | 7b062883f6 | ||
|   | 64a2960751 | ||
|   | 17879a7f69 | ||
|   | 1dd24551a5 | ||
|   | 84f775013f | ||
|   | b78eb64f3d | ||
|   | d65efed561 | ||
|   | 1ca6740e05 | ||
|   | 474aa7e1cc | ||
|   | 5beb4d9a2d | ||
|   | 19eed5bdff | ||
|   | 6fa9cfd4c3 | ||
|   | 80acc4b50d | ||
|   | 2c72a9112c | ||
|   | 17207546e9 | ||
|   | 533f75ea71 | ||
|   | adb5d4ade3 | ||
|   | a879c6ab6e | ||
|   | 915e42fd07 | ||
|   | 2d8dcc87ff | ||
|   | 66f0e398a1 | ||
|   | 30be4e29fa | ||
|   | 263780e6a3 | ||
|   | 07a75a37c3 | ||
|   | f85b75df8c | ||
|   | 6644126b5d | ||
|   | c665c36d88 | ||
|   | 519a7ef435 | ||
|   | a848194601 | ||
|   | aabce764ac | ||
|   | 5a331663e4 | ||
|   | 40144f8bd8 | ||
|   | f68849c65f | ||
|   | edb52a989e | ||
|   | 980578d05a | ||
|   | 486699cef3 | ||
|   | 0096a0c077 | ||
|   | d905e95dbb | ||
|   | 61430c8739 | ||
|   | eb8b915813 | ||
|   | 22c2259adb | ||
|   | c15a1aecdf | ||
|   | 16d50b6626 | ||
|   | 651b087932 | ||
|   | bce3dc384d | ||
|   | c21ddf225b | ||
|   | 4fefb98d71 | ||
|   | ffda4e43df | ||
|   | 69cb2a7734 | ||
|   | c33de233dc | ||
|   | 85f89a7ff3 | ||
|   | 9ce20216b5 | ||
|   | 1739c54091 | ||
|   | d8bd1f518a | ||
|   | 86ba47541b | ||
|   | 492ec6a932 | ||
|   | 342076ee0e | ||
|   | d44f6966c2 | ||
|   | 5c76193045 | ||
|   | 1c1f781be4 | ||
|   | c687d4a51a | ||
|   | fca62541ca | ||
|   | 3ab3581f84 | ||
|   | 8dd0689420 | ||
|   | be4b1040f8 | ||
|   | 79be6e1dc5 | ||
|   | edbd92dbbf | ||
|   | 27b5a8e490 | ||
|   | 1f400d5964 | ||
|   | 0ca0680165 | ||
|   | ce1dbda5b4 | ||
|   | 9cce78669f | ||
|   | 6ca0ecdf05 | ||
|   | 6e9f8fd391 | ||
|   | 2fdf24495b | ||
|   | bbf9f8f130 | ||
|   | 37f8d84a9c | ||
|   | 5edd868d5b | ||
|   | e4e5e83be6 | ||
|   | 74c7b39dc8 | ||
|   | 445dffc987 | ||
|   | d97d6bf147 | ||
|   | 79efdb56f7 | ||
|   | a9af56a5e9 | ||
|   | 59f13bb8d6 | ||
|   | 463f8830d7 | ||
|   | 05ad06fbc1 | ||
|   | 29d5a40c57 | ||
|   | c0cecf8363 | ||
|   | b998339002 | ||
|   | 245c9c3dcc | ||
|   | d8f26a789e | ||
|   | e1d18708b3 | ||
|   | b44b49476d | ||
|   | 7e0bb06259 | ||
|   | ecdedfa1cf | ||
|   | 3d4b997d4a | ||
|   | e81005ae4b | ||
|   | 33a81a5f5c | ||
|   | 25e2abdb03 | ||
|   | 803e5628a2 | ||
|   | c88f20859a | ||
|   | ec3767c545 | ||
|   | 729ff0a4c7 | ||
|   | 6fe51e258f | ||
|   | 44ecb2fbe7 | ||
|   | 53e031d9f6 | ||
|   | 8ac85ea0bd | ||
|   | adfdc302d7 | ||
|   | 3053278721 | ||
|   | 4d07de0d71 | ||
|   | 953a1bba93 | ||
|   | e724ff7c93 | ||
|   | 62f0f7bfc5 | ||
|   | 9cb2b68f09 | ||
|   | 1bbc0b705f | ||
|   | 662191eca9 | ||
|   | 8fad8edc1e | ||
|   | ae3d80664c | ||
|   | e21795ee35 | ||
|   | ec95dda18f | ||
|   | 098ac45758 | ||
|   | 9889ce6b57 | ||
|   | b4d97cd545 | ||
|   | afac22c562 | ||
|   | 552cd82802 | ||
|   | dfde0e2310 | ||
|   | 54dd2f8337 | ||
|   | b5785f260e | ||
|   | 98b8ca31fa | ||
|   | 4b104b6252 | ||
|   | 83d12fcf2d | ||
|   | e4f9560655 | ||
|   | 8cfa818f04 | ||
|   | 81301f3a76 | ||
|   | 2976b4d352 | ||
|   | 9f03702e69 | ||
|   | 3300886120 | ||
|   | 0d10752b5a | ||
|   | 92886fb8d8 | ||
|   | 5916626399 | ||
|   | a7c025fd86 | ||
|   | b7a999bc2e | ||
|   | 3851065500 | ||
|   | 4b68fa771f | ||
|   | 1525aa15a6 | ||
|   | db1219d4a9 | ||
|   | b8efcc2ca5 | ||
|   | 0bae009189 | ||
|   | 3efec53f51 | ||
|   | 8b76ebb3ef | ||
|   | 467ea3a37e | ||
|   | 2b6235dc78 | ||
|   | cd5aa61834 | ||
|   | 5ccb17622e | ||
|   | 1c226c31aa | ||
|   | 9ec0d3e16a | ||
|   | cf6903d109 | ||
|   | 9fd356d290 | ||
|   | 989032fe0c | ||
|   | 05573ccc53 | ||
|   | c454fabc9d | ||
|   | 7d323ec62b | ||
|   | c7494e32ce | ||
|   | 1123c8884d | ||
|   | e1104f888d | ||
|   | 8c32da7f19 | ||
|   | d63494908d | ||
|   | b70b62cef5 | ||
|   | 868f486922 | ||
|   | b2a2b5999b | ||
|   | 595de88d96 | ||
|   | a6fdf05ee9 | ||
|   | f897d7c2e1 | ||
|   | b0f731bf84 | ||
|   | 302b8c0c34 | ||
|   | acd674f0a0 | ||
|   | b0f9e1e8b4 | ||
|   | ed2e19a150 | ||
|   | 296a169dd3 | ||
|   | 1591cb9197 | ||
|   | 0c9167512c | ||
|   | a673ab0fae | ||
|   | 6e5fdf4e9e | ||
|   | 93a5a94dab | ||
|   | d565df27b3 | ||
|   | 961f40f9a1 | ||
|   | e3ee4e515d | ||
|   | 94d6a01cca | ||
|   | 38bb66a776 | ||
|   | 68781a88ab | ||
|   | 910462bb72 | ||
|   | 6bd6adb977 | ||
|   | 0acdee15a0 | ||
|   | c3ce6bb31c | ||
|   | 0459ed093e | ||
|   | d5f29f01c5 | ||
|   | 595ba8b7ab | ||
|   | cec0a8e1fc | ||
|   | f9b2fd60e2 | ||
|   | 60cd9873bc | ||
|   | 273d56c39a | ||
|   | 5497dd2827 | ||
|   | bbfdadc463 | ||
|   | fde811ae5a | ||
|   | 07e831218e | ||
|   | 91c33596da | ||
|   | a8dcf941b9 | ||
|   | e7a51decb0 | ||
|   | 9ec19be113 | ||
|   | f776186480 | ||
|   | 0096d83387 | ||
|   | 20a24dbcbf | ||
|   | 502654d853 | ||
|   | d2103d7c44 | ||
|   | d96a361325 | ||
|   | 2e85d26b6b | ||
|   | 6431a3fb3d | ||
|   | ac3bfd7388 | ||
|   | 3ea86d18a0 | ||
|   | bbc792f9fb | ||
|   | 7e127cd5cc | ||
|   | c4fb92c658 | ||
|   | 8da1ac6cee | ||
|   | a18db57e6f | ||
|   | b915e393dd | ||
|   | 3a74c23d09 | ||
|   | fbebc46c58 | ||
|   | 5595a908d8 | ||
|   | 27e4abcfa3 | ||
|   | c1ab7485e2 | ||
|   | 29cd5d1a3c | ||
|   | 6d5d9333ad | ||
|   | 7cc40595dc | ||
|   | 80ae5ddfaa | ||
|   | 4f480d117e | ||
|   | 1f2af3a290 | ||
|   | 14cdfb6a69 | ||
|   | e2bf84392b | ||
|   | 946b7a7931 | ||
|   | 9a9018751e | ||
|   | 83b75e8254 | ||
|   | 35c3fa205d | ||
|   | 0b606f02fa | ||
|   | fb78a6e98e | ||
|   | 5de68a0400 | ||
|   | f0562049b6 | ||
|   | 0e1077bb50 | ||
|   | c978e9edf4 | ||
|   | 2714ac6be6 | ||
|   | 9b048a9cfc | ||
|   | 1c2b6bf994 | ||
|   | ee39aaf08b | ||
|   | 93e6329901 | ||
|   | f47b137b59 | ||
|   | 83ea15ee83 | ||
|   | 75471c46d1 | ||
|   | 1e0343bba6 | ||
|   | 0f5e55648b | ||
|   | 57259e21f4 | ||
|   | 4ce385b262 | ||
|   | 2d64409542 | ||
|   | fcb3884a8f | ||
|   | 9f6dc56a7b | ||
|   | 56ab473611 | ||
|   | 6426060804 | ||
|   | 49a0ca7a7c | ||
|   | f3a4663491 | ||
|   | ecdbca8fb6 | ||
|   | 9cbea5fe06 | ||
|   | ba3c7f85fa | ||
|   | ba9215ebe8 | ||
|   | 8ebf0eab16 | ||
|   | cd90a60dee | ||
|   | 98834c9c95 | ||
|   | 55e9e719ad | ||
|   | a0ae9aee27 | ||
|   | 1486a63854 | ||
|   | 733e1ae136 | ||
|   | 4ac51048c1 | ||
|   | f2aba970fd | 
							
								
								
									
										78
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										78
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,78 @@ | ||||
| name: build-and-release | ||||
|  | ||||
| on: push | ||||
|  | ||||
| defaults: | ||||
|   run: | ||||
|     shell: bash | ||||
|  | ||||
| jobs: | ||||
|  | ||||
|   build: | ||||
|     runs-on: ${{ matrix.os }} | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         os: [ubuntu-latest, windows-latest, macos-latest] | ||||
|         python: ["3.9"] | ||||
|     steps: | ||||
|  | ||||
|       - uses: actions/checkout@v2 | ||||
|  | ||||
|       - uses: actions/setup-python@v2 | ||||
|         with: | ||||
|           python-version: ${{ matrix.python }} | ||||
|  | ||||
|       - name: Set up project | ||||
|         if: matrix.os != 'windows-latest' | ||||
|         run: ./scripts/setup | ||||
|  | ||||
|       - name: Set up project on windows | ||||
|         if: matrix.os == 'windows-latest' | ||||
|         # For some reason, `pip install --upgrade pip` doesn't work on | ||||
|         # 'windows-latest'. The installed pip version works fine however. | ||||
|         run: ./scripts/setup --no-pip | ||||
|  | ||||
|       - name: Run checks | ||||
|         run: ./scripts/check | ||||
|  | ||||
|       - name: Build | ||||
|         run: ./scripts/build | ||||
|  | ||||
|       - name: Rename binary | ||||
|         # Glob in source location because on windows pyinstaller creates a file | ||||
|         # named "pferd.exe" | ||||
|         run: mv dist/pferd* dist/pferd-${{ matrix.os }} | ||||
|  | ||||
|       - name: Upload binary | ||||
|         uses: actions/upload-artifact@v2 | ||||
|         with: | ||||
|           name: Binaries | ||||
|           path: dist/pferd-${{ matrix.os }} | ||||
|  | ||||
|   release: | ||||
|     runs-on: ubuntu-latest | ||||
|     if: startsWith(github.ref, 'refs/tags/v') | ||||
|     needs: build | ||||
|     steps: | ||||
|  | ||||
|       - name: Download binaries | ||||
|         uses: actions/download-artifact@v2 | ||||
|         with: | ||||
|           name: Binaries | ||||
|  | ||||
|       - name: Rename binaries | ||||
|         run: | | ||||
|           mv pferd-ubuntu-latest pferd-linux | ||||
|           mv pferd-windows-latest pferd-windows.exe | ||||
|           mv pferd-macos-latest pferd-mac | ||||
|  | ||||
|       - name: Create release | ||||
|         uses: softprops/action-gh-release@v1 | ||||
|         env: | ||||
|           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||
|         with: | ||||
|           files: | | ||||
|             pferd-linux | ||||
|             pferd-windows.exe | ||||
|             pferd-mac | ||||
							
								
								
									
										74
									
								
								.github/workflows/package.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										74
									
								
								.github/workflows/package.yml
									
									
									
									
										vendored
									
									
								
							| @@ -1,74 +0,0 @@ | ||||
| name: Package Application with Pyinstaller | ||||
|  | ||||
| on: | ||||
|   push: | ||||
|     branches: | ||||
|       - "*" | ||||
|     tags: | ||||
|       - "v*" | ||||
|  | ||||
| jobs: | ||||
|   build: | ||||
|  | ||||
|     runs-on: ${{ matrix.os }} | ||||
|     strategy: | ||||
|       matrix: | ||||
|         os: [ubuntu-latest, windows-latest, macos-latest] | ||||
|  | ||||
|     steps: | ||||
|     - uses: actions/checkout@v2 | ||||
|  | ||||
|     - uses: actions/setup-python@v2 | ||||
|       with: | ||||
|         python-version: '3.x' | ||||
|  | ||||
|     - name: "Install dependencies" | ||||
|       run: "pip install setuptools pyinstaller rich requests beautifulsoup4 -f --upgrade" | ||||
|  | ||||
|     - name: "Install sync_url.py" | ||||
|       run: "pyinstaller sync_url.py -F" | ||||
|  | ||||
|     - name: "Move artifact" | ||||
|       run: "mv dist/sync_url* dist/sync_url-${{ matrix.os }}" | ||||
|  | ||||
|     - uses: actions/upload-artifact@v2 | ||||
|       with: | ||||
|         name: "Pferd Sync URL" | ||||
|         path: "dist/sync_url*" | ||||
|  | ||||
|   release: | ||||
|     name: Release | ||||
|  | ||||
|     needs: [build] | ||||
|     runs-on: ubuntu-latest | ||||
|     if: startsWith(github.ref, 'refs/tags/') | ||||
|  | ||||
|     env: | ||||
|       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||
|  | ||||
|     steps: | ||||
|     - name: "Checkout" | ||||
|       uses: actions/checkout@v2 | ||||
|  | ||||
|     - name: "Download artifacts" | ||||
|       uses: actions/download-artifact@v2 | ||||
|       with: | ||||
|         name: "Pferd Sync URL" | ||||
|  | ||||
|     - name: "look at folder structure" | ||||
|       run: "ls -lah" | ||||
|  | ||||
|     - name: "Rename releases" | ||||
|       run: "mv sync_url-macos-latest pferd_sync_url_mac && mv sync_url-ubuntu-latest pferd_sync_url_linux && mv sync_url-windows-latest pferd_sync_url.exe" | ||||
|  | ||||
|     - name: "Create release" | ||||
|       uses: softprops/action-gh-release@v1 | ||||
|  | ||||
|     - name: "Upload release artifacts" | ||||
|       uses: softprops/action-gh-release@v1 | ||||
|       with: | ||||
|         body: "Download the correct sync_url for your platform and run it in the terminal or CMD. You might need to make it executable on Linux/Mac with `chmod +x <file>`. Also please enclose the *url you pass to the program in double quotes* or your shell might silently screw it up!" | ||||
|         files: | | ||||
|           pferd_sync_url_mac | ||||
|           pferd_sync_url_linux | ||||
|           pferd_sync_url.exe | ||||
							
								
								
									
										20
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										20
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,14 +1,10 @@ | ||||
| __pycache__/ | ||||
| .venv/ | ||||
| venv/ | ||||
| .idea/ | ||||
| build/ | ||||
| .mypy_cache/ | ||||
| .tmp/ | ||||
| .env | ||||
| .vscode | ||||
| ilias_cookies.txt | ||||
| /.venv/ | ||||
| /PFERD.egg-info/ | ||||
| __pycache__/ | ||||
| /.vscode/ | ||||
|  | ||||
| # PyInstaller | ||||
| sync_url.spec | ||||
| dist/ | ||||
| # pyinstaller | ||||
| /pferd.spec | ||||
| /build/ | ||||
| /dist/ | ||||
|   | ||||
							
								
								
									
										164
									
								
								CHANGELOG.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										164
									
								
								CHANGELOG.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,164 @@ | ||||
| # Changelog | ||||
|  | ||||
| All notable changes to this project will be documented in this file. The format | ||||
| is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). | ||||
|  | ||||
| This project has its own custom versioning scheme. Version numbers consist of | ||||
| three parts (e. g. `3.1.5`). | ||||
| - The first number is increased on major rewrites or changes. What classifies as | ||||
|   a major change is up to the maintainers. This is pretty rare and a PFERD | ||||
|   version 4 should hopefully not be necessary. | ||||
| - The second number is increased on backwards-incompatible changes in behaviour. | ||||
|   This refers to any change that would make an existing setup behave differently | ||||
|   (e. g. renaming options or changing crawler behaviour). If this number is | ||||
|   increased, it may be necessary for you to adapt your own setup. | ||||
| - The third number is increased on backwards-compatible changes (e. g. adding | ||||
|   new options or commands, changing documentation, fixing bugs). Updates that | ||||
|   only increase this number should be safe and not require manual intervention. | ||||
|  | ||||
| We will try to correctly classify changes as backwards-compatible or | ||||
| backwards-incompatible, but may occasionally make mistakes or stumble across | ||||
| ambiguous situations. | ||||
|  | ||||
| ## Unreleased | ||||
|  | ||||
| ### Fixed | ||||
| - Forum crawling crashing when parsing empty (= 0 messages) threads | ||||
| - Forum crawling crashing when a forum has no threads at all | ||||
|  | ||||
| ## 3.4.1 - 2022-08-17 | ||||
|  | ||||
| ### Added | ||||
| - Download of page descriptions | ||||
| - Forum download support | ||||
| - `pass` authenticator | ||||
|  | ||||
| ### Changed | ||||
| - Add `cpp` extension to default `link_regex` of IPD crawler | ||||
| - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option | ||||
| - Simplify default IPD crawler `link_regex` | ||||
|  | ||||
| ### Fixed | ||||
| - IPD crawler crashes on some sites | ||||
| - Meeting name normalization for yesterday, today and tomorrow | ||||
| - Crawling of meeting file previews | ||||
| - Login with new login button html layout | ||||
| - Descriptions for courses are now placed in the correct subfolder when | ||||
|   downloading the whole desktop | ||||
|  | ||||
| ## 3.4.0 - 2022-05-01 | ||||
|  | ||||
| ### Added | ||||
| - Message when Shibboleth entitlements need to be manually reviewed | ||||
| - Links to unofficial packages and repology in the readme | ||||
|  | ||||
| ### Changed | ||||
| - Increase minimum supported Python version to 3.9 | ||||
| - Support video listings with more columns | ||||
| - Use UTF-8 when reading/writing the config file | ||||
|  | ||||
| ### Fixed | ||||
| - Crash during authentication when the Shibboleth session is still valid | ||||
|  | ||||
| ## 3.3.1 - 2022-01-15 | ||||
|  | ||||
| ### Fixed | ||||
| - ILIAS login | ||||
| - Local video cache if `windows_paths` is enabled | ||||
|  | ||||
| ## 3.3.0 - 2022-01-09 | ||||
|  | ||||
| ### Added | ||||
| - A KIT IPD crawler | ||||
| - Support for ILIAS cards | ||||
| - (Rudimentary) support for content pages | ||||
| - Support for multi-stream videos | ||||
| - Support for ILIAS 7 | ||||
|  | ||||
| ### Removed | ||||
| - [Interpolation](https://docs.python.org/3/library/configparser.html#interpolation-of-values) in config file | ||||
|  | ||||
| ### Fixed | ||||
| - Crawling of recursive courses | ||||
| - Crawling files directly placed on the personal desktop | ||||
| - Ignore timestamps at the unix epoch as they crash on windows | ||||
|  | ||||
| ## 3.2.0 - 2021-08-04 | ||||
|  | ||||
| ### Added | ||||
| - `--skip` command line option | ||||
| - Support for ILIAS booking objects | ||||
|  | ||||
| ### Changed | ||||
| - Using multiple path segments on left side of `-name->` now results in an | ||||
|   error. This was already forbidden by the documentation but silently accepted | ||||
|   by PFERD. | ||||
| - More consistent path printing in some `--explain` messages | ||||
|  | ||||
| ### Fixed | ||||
| - Nondeterministic name deduplication due to ILIAS reordering elements | ||||
| - More exceptions are handled properly | ||||
|  | ||||
| ## 3.1.0 - 2021-06-13 | ||||
|  | ||||
| If your config file doesn't do weird things with transforms, it should continue | ||||
| to work. If your `-re->` arrows behave weirdly, try replacing them with | ||||
| `-exact-re->` arrows. If you're on Windows, you might need to switch from `\` | ||||
| path separators to `/` in your regex rules. | ||||
|  | ||||
| ### Added | ||||
| - `skip` option for crawlers | ||||
| - Rules with `>>` instead of `>` as arrow head | ||||
| - `-exact-re->` arrow (behaves like `-re->` did previously) | ||||
|  | ||||
| ### Changed | ||||
| - The `-re->` arrow can now rename directories (like `-->`) | ||||
| - Use `/` instead of `\` as path separator for (regex) rules on Windows | ||||
| - Use the label to the left for exercises instead of the button name to | ||||
|   determine the folder name | ||||
|  | ||||
| ### Fixed | ||||
| - Video pagination handling in ILIAS crawler | ||||
|  | ||||
| ## 3.0.1 - 2021-06-01 | ||||
|  | ||||
| ### Added | ||||
| - `credential-file` authenticator | ||||
| - `--credential-file` option for `kit-ilias-web` command | ||||
| - Warning if using concurrent tasks with `kit-ilias-web` | ||||
|  | ||||
| ### Changed | ||||
| - Cookies are now stored in a text-based format | ||||
|  | ||||
| ### Fixed | ||||
| - Date parsing now also works correctly in non-group exercises | ||||
|  | ||||
| ## 3.0.0 - 2021-05-31 | ||||
|  | ||||
| ### Added | ||||
| - Proper config files | ||||
| - Concurrent crawling | ||||
| - Crawl external ILIAS links | ||||
| - Crawl uploaded exercise solutions | ||||
| - Explain what PFERD is doing and why (`--explain`) | ||||
| - More control over output (`--status`, `--report`) | ||||
| - Debug transform rules with `--debug-transforms` | ||||
| - Print report after exiting via Ctrl+C | ||||
| - Store crawler reports in `.report` JSON file | ||||
| - Extensive config file documentation (`CONFIG.md`) | ||||
| - Documentation for developers (`DEV.md`) | ||||
| - This changelog | ||||
|  | ||||
| ### Changed | ||||
| - Rewrote almost everything | ||||
| - Better error messages | ||||
| - Redesigned CLI | ||||
| - Redesigned transform rules | ||||
| - ILIAS crawling logic (paths may be different) | ||||
| - Better support for weird paths on Windows | ||||
| - Set user agent (`PFERD/<version>`) | ||||
|  | ||||
| ### Removed | ||||
| - Backwards compatibility with 2.x | ||||
| - Python files as config files | ||||
| - Some types of crawlers | ||||
							
								
								
									
										472
									
								
								CONFIG.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										472
									
								
								CONFIG.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,472 @@ | ||||
| # Config file format | ||||
|  | ||||
| A config file consists of sections. A section begins with a `[section]` header, | ||||
| which is followed by a list of `key = value` pairs. Comments must be on their | ||||
| own line and start with `#`. Multiline values must be indented beyond their key. | ||||
| Boolean values can be `yes` or `no`. For more details and some examples on the | ||||
| format, see the [configparser documentation][1] ([interpolation][2] is | ||||
| disabled). | ||||
|  | ||||
| [1]: <https://docs.python.org/3/library/configparser.html#supported-ini-file-structure> "Supported INI File Structure" | ||||
| [2]: <https://docs.python.org/3/library/configparser.html#interpolation-of-values> "Interpolation of values" | ||||
|  | ||||
| ## The `DEFAULT` section | ||||
|  | ||||
| This section contains global configuration values. It can also be used to set | ||||
| default values for the other sections. | ||||
|  | ||||
| - `working_dir`: The directory PFERD operates in. Set to an absolute path to | ||||
|   make PFERD operate the same regardless of where it is executed from. All other | ||||
|   paths in the config file are interpreted relative to this path. If this path | ||||
|   is relative, it is interpreted relative to the script's working dir. `~` is | ||||
|   expanded to the current user's home directory. (Default: `.`) | ||||
| - `explain`: Whether PFERD should log and explain its actions and decisions in | ||||
|   detail. (Default: `no`) | ||||
| - `status`: Whether PFERD should print status updates (like `Crawled ...`, | ||||
|   `Added ...`) while running a crawler. (Default: `yes`) | ||||
| - `report`: Whether PFERD should print a report of added, changed and deleted | ||||
|    local files for all crawlers before exiting. (Default: `yes`) | ||||
| - `share_cookies`: Whether crawlers should share cookies where applicable. For | ||||
|   example, some crawlers share cookies if they crawl the same website using the | ||||
|   same account. (Default: `yes`) | ||||
|  | ||||
| ## The `crawl:*` sections | ||||
|  | ||||
| Sections whose names start with `crawl:` are used to configure crawlers. The | ||||
| rest of the section name specifies the name of the crawler. | ||||
|  | ||||
| A crawler synchronizes a remote resource to a local directory. There are | ||||
| different types of crawlers for different kinds of resources, e.g. ILIAS | ||||
| courses or lecture websites. | ||||
|  | ||||
| Each crawl section represents an instance of a specific type of crawler. The | ||||
| `type` option is used to specify the crawler type. The crawler's name is usually | ||||
| used as the output directory. New crawlers can be created simply by adding a new | ||||
| crawl section to the config file. | ||||
|  | ||||
| Depending on a crawler's type, it may have different options. For more details, | ||||
| see the type's [documentation](#crawler-types) below. The following options are | ||||
| common to all crawlers: | ||||
|  | ||||
| - `type`: The available types are specified in [this section](#crawler-types). | ||||
| - `skip`: Whether the crawler should be skipped during normal execution. The | ||||
|   crawler can still be executed manually using the `--crawler` or `-C` flags. | ||||
|   (Default: `no`) | ||||
| - `output_dir`: The directory the crawler synchronizes files to. A crawler will | ||||
|   never place any files outside this directory. (Default: the crawler's name) | ||||
| - `redownload`: When to download a file that is already present locally. | ||||
|   (Default: `never-smart`) | ||||
|     - `never`: If a file is present locally, it is not downloaded again. | ||||
|     - `never-smart`: Like `never`, but PFERD tries to detect if an already | ||||
|       downloaded file has changed via some (unreliable) heuristics. | ||||
|     - `always`: All files are always downloaded, regardless of whether they are | ||||
|       already present locally. | ||||
|     - `always-smart`: Like `always`, but PFERD tries to avoid unnecessary | ||||
|       downloads via some (unreliable) heuristics. | ||||
| - `on_conflict`: What to do when the local and remote versions of a file or | ||||
|   directory differ, including when a file is replaced by a directory or a | ||||
|   directory by a file. (Default: `prompt`) | ||||
|     - `prompt`: Always ask the user before overwriting or deleting local files | ||||
|       and directories. | ||||
|     - `local-first`: Always keep the local file or directory. Equivalent to | ||||
|       using `prompt` and always choosing "no". Implies that `redownload` is set | ||||
|       to `never`. | ||||
|     - `remote-first`: Always keep the remote file or directory. Equivalent to | ||||
|       using `prompt` and always choosing "yes". | ||||
|     - `no-delete`: Never delete local files, but overwrite local files if the | ||||
|       remote file is different. | ||||
| - `transform`: Rules for renaming and excluding certain files and directories. | ||||
|   For more details, see [this section](#transformation-rules). (Default: empty) | ||||
| - `tasks`: The maximum number of concurrent tasks (such as crawling or | ||||
|   downloading). (Default: `1`) | ||||
| - `downloads`: How many of those tasks can be download tasks at the same time. | ||||
|   Must not be greater than `tasks`. (Default: Same as `tasks`) | ||||
| - `task_delay`: Time (in seconds) that the crawler should wait between | ||||
|   subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary | ||||
|   load for the crawl target. (Default: `0.0`) | ||||
| - `windows_paths`: Whether PFERD should find alternative names for paths that | ||||
|   are invalid on Windows. (Default: `yes` on Windows, `no` otherwise) | ||||
|  | ||||
| Some crawlers may also require credentials for authentication. To configure how | ||||
| the crawler obtains its credentials, the `auth` option is used. It is set to the | ||||
| full name of an auth section (including the `auth:` prefix). | ||||
|  | ||||
| Here is a simple example: | ||||
|  | ||||
| ```ini | ||||
| [auth:example] | ||||
| type = simple | ||||
| username = foo | ||||
| password = bar | ||||
|  | ||||
| [crawl:something] | ||||
| type = some-complex-crawler | ||||
| auth = auth:example | ||||
| on_conflict = no-delete | ||||
| tasks = 3 | ||||
| ``` | ||||
|  | ||||
| ## The `auth:*` sections | ||||
|  | ||||
| Sections whose names start with `auth:` are used to configure authenticators. An | ||||
| authenticator provides a username and a password to one or more crawlers. | ||||
|  | ||||
| Authenticators work similarly to crawlers: A section represents an authenticator | ||||
| instance whose name is the rest of the section name. The type is specified by | ||||
| the `type` option. | ||||
|  | ||||
| Depending on an authenticator's type, it may have different options. For more | ||||
| details, see the type's [documentation](#authenticator-types) below. The only | ||||
| option common to all authenticators is `type`: | ||||
|  | ||||
| - `type`: The types are specified in [this section](#authenticator-types). | ||||
|  | ||||
| ## Crawler types | ||||
|  | ||||
| ### The `local` crawler | ||||
|  | ||||
| This crawler crawls a local directory. It is really simple and mostly useful for | ||||
| testing different setups. The various delay options are meant to make the | ||||
| crawler simulate a slower, network-based crawler. | ||||
|  | ||||
| - `target`: Path to the local directory to crawl. (Required) | ||||
| - `crawl_delay`: Artificial delay (in seconds) to simulate for crawl requests. | ||||
|   (Default: `0.0`) | ||||
| - `download_delay`: Artificial delay (in seconds) to simulate for download | ||||
|   requests. (Default: `0.0`) | ||||
| - `download_speed`: Download speed (in bytes per second) to simulate. (Optional) | ||||
|  | ||||
| ### The `kit-ipd` crawler | ||||
|  | ||||
| This crawler crawls a KIT-IPD page by url. The root page can be crawled from | ||||
| outside the KIT network so you will be informed about any new/deleted files, | ||||
| but downloading files requires you to be within. Adding a short delay between | ||||
| requests is likely a good idea. | ||||
|  | ||||
| - `target`: URL to a KIT-IPD page | ||||
| - `link_regex`: A regex that is matched against the `href` part of links. If it | ||||
|   matches, the given link is downloaded as a file. This is used to extract | ||||
|   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`) | ||||
|  | ||||
| ### The `kit-ilias-web` crawler | ||||
|  | ||||
| This crawler crawls the KIT ILIAS instance. | ||||
|  | ||||
| ILIAS is not great at handling too many concurrent requests. To avoid | ||||
| unnecessary load, please limit `tasks` to `1`. | ||||
|  | ||||
| There is a spike in ILIAS usage at the beginning of lectures, so please don't | ||||
| run PFERD during those times. | ||||
|  | ||||
| If you're automatically running PFERD periodically (e. g. via cron or a systemd | ||||
| timer), please randomize the start time or at least don't use the full hour. For | ||||
| systemd timers, this can be accomplished using the `RandomizedDelaySec` option. | ||||
| Also, please schedule the script to run in periods of low activity. Running the | ||||
| script once per day should be fine. | ||||
|  | ||||
| - `target`: The ILIAS element to crawl. (Required) | ||||
|     - `desktop`: Crawl your personal desktop | ||||
|     - `<course id>`: Crawl the course with the given id | ||||
|     - `<url>`: Crawl a given element by URL (preferably the permanent URL linked | ||||
|       at the bottom of its ILIAS page) | ||||
| - `auth`: Name of auth section to use for login. (Required) | ||||
| - `tfa_auth`: Name of auth section to use for two-factor authentication. Only | ||||
|   uses the auth section's password. (Default: Anonymous `tfa` authenticator) | ||||
| - `links`: How to represent external links. (Default: `fancy`) | ||||
|     - `ignore`: Don't download links. | ||||
|     - `plaintext`: A text file containing only the URL. | ||||
|     - `fancy`: A HTML file looking like the ILIAS link element. | ||||
|     - `internet-shortcut`: An internet shortcut file (`.url` file). | ||||
| - `link_redirect_delay`: Time (in seconds) until `fancy` link files will | ||||
|   redirect to the actual URL. Set to a negative value to disable the automatic | ||||
|   redirect. (Default: `-1`) | ||||
| - `videos`: Whether to download videos. (Default: `no`) | ||||
| - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: | ||||
|   `20.0`) | ||||
|  | ||||
| ## Authenticator types | ||||
|  | ||||
| ### The `simple` authenticator | ||||
|  | ||||
| With this authenticator, the username and password can be set directly in the | ||||
| config file. If the username or password are not specified, the user is prompted | ||||
| via the terminal. | ||||
|  | ||||
| - `username`: The username. (Optional) | ||||
| - `password`: The password. (Optional) | ||||
|  | ||||
| ### The `credential-file` authenticator | ||||
|  | ||||
| This authenticator reads a username and a password from a credential file. | ||||
|  | ||||
| - `path`: Path to the credential file. (Required) | ||||
|  | ||||
| The credential file has exactly two lines (trailing newline optional). The first | ||||
| line starts with `username=` and contains the username, the second line starts | ||||
| with `password=` and contains the password. The username and password may | ||||
| contain any characters except a line break. | ||||
|  | ||||
| ``` | ||||
| username=AzureDiamond | ||||
| password=hunter2 | ||||
| ``` | ||||
|  | ||||
| ### The `keyring` authenticator | ||||
|  | ||||
| This authenticator uses the system keyring to store passwords. The username can | ||||
| be set directly in the config file. If the username is not specified, the user | ||||
| is prompted via the terminal. If the keyring contains no entry or the entry is | ||||
| incorrect, the user is prompted for a password via the terminal and the password | ||||
| is stored in the keyring. | ||||
|  | ||||
| - `username`: The username. (Optional) | ||||
| - `keyring_name`: The service name PFERD uses for storing credentials. (Default: | ||||
|   `PFERD`) | ||||
|  | ||||
| ### The `pass` authenticator | ||||
|  | ||||
| This authenticator queries the [`pass` password manager][3] for a username and | ||||
| password. It tries to be mostly compatible with [browserpass][4] and | ||||
| [passff][5], so see those links for an overview of the format. If PFERD fails | ||||
| to load your password, you can use the `--explain` flag to see why. | ||||
|  | ||||
| - `passname`: The name of the password to use (Required) | ||||
| - `username_prefixes`: A comma-separated list of username line prefixes | ||||
|   (Default: `login,username,user`) | ||||
| - `password_prefixes`: A comma-separated list of password line prefixes | ||||
|   (Default: `password,pass,secret`) | ||||
|  | ||||
| [3]: <https://www.passwordstore.org/> "Pass: The Standard Unix Password Manager" | ||||
| [4]: <https://github.com/browserpass/browserpass-extension#organizing-password-store> "Organizing password store" | ||||
| [5]: <https://github.com/passff/passff#multi-line-format> "Multi-line format" | ||||
|  | ||||
| ### The `tfa` authenticator | ||||
|  | ||||
| This authenticator prompts the user on the console for a two-factor | ||||
| authentication token. The token is provided as password and it is not cached. | ||||
| This authenticator does not support usernames. | ||||
|  | ||||
| ## Transformation rules | ||||
|  | ||||
| Transformation rules are rules for renaming and excluding files and directories. | ||||
| They are specified line-by-line in a crawler's `transform` option. When a | ||||
| crawler needs to apply a rule to a path, it goes through this list top-to-bottom | ||||
| and applies the first matching rule. | ||||
|  | ||||
| To see this process in action, you can use the `--debug-transforms` flag or | ||||
| the `--explain` flag. | ||||
|  | ||||
| Each rule has the format `SOURCE ARROW TARGET` (e. g. `foo/bar --> foo/baz`). | ||||
| The arrow specifies how the source and target are interpreted. The different | ||||
| kinds of arrows are documented below. | ||||
|  | ||||
| `SOURCE` and `TARGET` are either a bunch of characters without spaces (e. g. | ||||
| `foo/bar`) or string literals (e. g. `"foo/b a r"`). The former syntax has no | ||||
| concept of escaping characters, so the backslash is just another character. The | ||||
| string literals however support Python's escape syntax (e. g. | ||||
| `"foo\\bar\tbaz"`). This also means that in string literals, backslashes must be | ||||
| escaped. | ||||
|  | ||||
| `TARGET` can additionally be a single exclamation mark `!` (*not* `"!"`). When a | ||||
| rule with a `!` as target matches a path, the corresponding file or directory is | ||||
| ignored by the crawler instead of renamed. | ||||
|  | ||||
| `TARGET` can also be omitted entirely. When a rule without target matches a | ||||
| path, the path is returned unmodified. This is useful to prevent rules further | ||||
| down from matching instead. | ||||
|  | ||||
| Each arrow's behaviour can be modified slightly by changing the arrow's head | ||||
| from `>` to `>>`. When a rule with a `>>` arrow head matches a path, it doesn't | ||||
| return immediately like a normal arrow. Instead, it replaces the current path | ||||
| with its output and continues on to the next rule. In effect, this means that | ||||
| multiple rules can be applied sequentially. | ||||
|  | ||||
| ### The `-->` arrow | ||||
|  | ||||
| The `-->` arrow is a basic renaming operation for files and directories. If a | ||||
| path matches `SOURCE`, it is renamed to `TARGET`. | ||||
|  | ||||
| Example: `foo/bar --> baz` | ||||
| - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||
| - Converts `foo/bar` into `baz` | ||||
| - Converts `foo/bar/wargl` into `baz/wargl` | ||||
|  | ||||
| Example: `foo/bar --> !` | ||||
| - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||
| - Ignores `foo/bar` and any of its children | ||||
|  | ||||
| ### The `-name->` arrow | ||||
|  | ||||
| The `-name->` arrow lets you rename files and directories by their name, | ||||
| regardless of where they appear in the file tree. Because of this, its `SOURCE` | ||||
| must not contain multiple path segments, only a single name. This restriction | ||||
| does not apply to its `TARGET`. | ||||
|  | ||||
| Example: `foo -name-> bar/baz` | ||||
| - Doesn't match `a/foobar/b` or `x/Foo/y/z` | ||||
| - Converts `hello/foo` into `hello/bar/baz` | ||||
| - Converts `foo/world` into `bar/baz/world` | ||||
| - Converts `a/foo/b/c/foo` into `a/bar/baz/b/c/bar/baz` | ||||
|  | ||||
| Example: `foo -name-> !` | ||||
| - Doesn't match `a/foobar/b` or `x/Foo/y/z` | ||||
| - Ignores any path containing a segment `foo` | ||||
|  | ||||
| ### The `-exact->` arrow | ||||
|  | ||||
| The `-exact->` arrow requires the path to match `SOURCE` exactly. The examples | ||||
| below show why this is useful. | ||||
|  | ||||
| Example: `foo/bar -exact-> baz` | ||||
| - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||
| - Converts `foo/bar` into `baz` | ||||
| - Doesn't match `foo/bar/wargl` | ||||
|  | ||||
| Example: `foo/bar -exact-> !` | ||||
| - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||
| - Ignores only `foo/bar`, not its children | ||||
|  | ||||
| ### The `-re->` arrow | ||||
|  | ||||
| The `-re->` arrow is like the `-->` arrow but with regular expressions. `SOURCE` | ||||
| is a regular expression and `TARGET` an f-string based template. If a path | ||||
| matches `SOURCE`, the output path is created using `TARGET` as template. | ||||
| `SOURCE` is automatically anchored. | ||||
|  | ||||
| `TARGET` uses Python's [format string syntax][6]. The *n*-th capturing group can | ||||
| be referred to as `{g<n>}` (e.g. `{g3}`). `{g0}` refers to the original path. | ||||
| If capturing group *n*'s contents are a valid integer, the integer value is | ||||
| available as `{i<n>}` (e.g. `{i3}`). If capturing group *n*'s contents are a | ||||
| valid float, the float value is available as `{f<n>}` (e.g. `{f3}`). If a | ||||
| capturing group is not present (e.g. when matching the string `cd` with the | ||||
| regex `(ab)?cd`), the corresponding variables are not defined. | ||||
|  | ||||
| Python's format string syntax has rich options for formatting its arguments. For | ||||
| example, to left-pad the capturing group 3 with the digit `0` to width 5, you | ||||
| can use `{i3:05}`. | ||||
|  | ||||
| PFERD even allows you to write entire expressions inside the curly braces, for | ||||
| example `{g2.lower()}` or `{g3.replace(' ', '_')}`. | ||||
|  | ||||
| Example: `f(oo+)/be?ar -re-> B{g1.upper()}H/fear` | ||||
| - Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` | ||||
| - Converts `foo/bar` into `BOOH/fear` | ||||
| - Converts `fooooo/bear` into `BOOOOOH/fear` | ||||
| - Converts `foo/bar/baz` into `BOOH/fear/baz` | ||||
|  | ||||
| [6]: <https://docs.python.org/3/library/string.html#format-string-syntax> "Format String Syntax" | ||||
|  | ||||
| ### The `-name-re->` arrow | ||||
|  | ||||
| The `-name-re->` arrow is like a combination of the `-name->` and `-re->` arrows. | ||||
|  | ||||
| Example: `(.*)\.jpeg -name-re-> {g1}.jpg` | ||||
| - Doesn't match `foo/bar.png`, `baz.JPEG` or `hello,jpeg` | ||||
| - Converts `foo/bar.jpeg` into `foo/bar.jpg` | ||||
| - Converts `foo.jpeg/bar/baz.jpeg` into `foo.jpg/bar/baz.jpg` | ||||
|  | ||||
| Example: `\..+ -name-re-> !` | ||||
| - Doesn't match `.`, `test`, `a.b` | ||||
| - Ignores all files and directories starting with `.`. | ||||
|  | ||||
| ### The `-exact-re->` arrow | ||||
|  | ||||
| The `-exact-re->` arrow is like a combination of the `-exact->` and `-re->` | ||||
| arrows. | ||||
|  | ||||
| Example: `f(oo+)/be?ar -exact-re-> B{g1.upper()}H/fear` | ||||
| - Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` | ||||
| - Converts `foo/bar` into `BOOH/fear` | ||||
| - Converts `fooooo/bear` into `BOOOOOH/fear` | ||||
| - Doesn't match `foo/bar/baz` | ||||
|  | ||||
| ### Example: Tutorials | ||||
|  | ||||
| You have an ILIAS course with lots of tutorials, but are only interested in a | ||||
| single one. | ||||
|  | ||||
| ``` | ||||
| tutorials/ | ||||
|   |- tut_01/ | ||||
|   |- tut_02/ | ||||
|   |- tut_03/ | ||||
|   ... | ||||
| ``` | ||||
|  | ||||
| You can use a mix of normal and exact arrows to get rid of the other ones and | ||||
| move the `tutorials/tut_02/` folder to `my_tut/`: | ||||
|  | ||||
| ``` | ||||
| tutorials/tut_02 --> my_tut | ||||
| tutorials -exact-> | ||||
| tutorials --> ! | ||||
| ``` | ||||
|  | ||||
| The second rule is required for many crawlers since they use the rules to decide | ||||
| which directories to crawl. If it was missing when the crawler looks at | ||||
| `tutorials/`, the third rule would match. This means the crawler would not crawl | ||||
| the `tutorials/` directory and thus not discover that `tutorials/tut_02/` exists. | ||||
|  | ||||
| Since the second rule is only relevant for crawling, the `TARGET` is left out. | ||||
|  | ||||
| ### Example: Lecture slides | ||||
|  | ||||
| You have a course with slides like `Lecture 3: Linear functions.PDF` and you | ||||
| would like to rename them to `03_linear_functions.pdf`. | ||||
|  | ||||
| ``` | ||||
| Lectures/ | ||||
|   |- Lecture 1: Introduction.PDF | ||||
|   |- Lecture 2: Vectors and matrices.PDF | ||||
|   |- Lecture 3: Linear functions.PDF | ||||
|   ... | ||||
| ``` | ||||
|  | ||||
| To do this, you can use the most powerful of arrows: The regex arrow. | ||||
|  | ||||
| ``` | ||||
| "Lectures/Lecture (\\d+): (.*)\\.PDF" -re-> "Lectures/{i1:02}_{g2.lower().replace(' ', '_')}.pdf" | ||||
| ``` | ||||
|  | ||||
| Note the escaped backslashes on the `SOURCE` side. | ||||
|  | ||||
| ### Example: Crawl a Python project | ||||
|  | ||||
| You are crawling a Python project and want to ignore all hidden files (files | ||||
| whose name starts with a `.`), all `__pycache__` directories and all markdown | ||||
| files (for some weird reason). | ||||
|  | ||||
| ``` | ||||
| .gitignore | ||||
| .mypy_cache/ | ||||
| .venv/ | ||||
| CONFIG.md | ||||
| PFERD/ | ||||
|   |- __init__.py | ||||
|   |- __main__.py | ||||
|   |- __pycache__/ | ||||
|   |- authenticator.py | ||||
|   |- config.py | ||||
|   ... | ||||
| README.md | ||||
| ... | ||||
| ``` | ||||
|  | ||||
| For this task, the name arrows can be used. | ||||
|  | ||||
| ``` | ||||
| \..*        -name-re-> ! | ||||
| __pycache__ -name->    ! | ||||
| .*\.md      -name-re-> ! | ||||
| ``` | ||||
|  | ||||
| ### Example: Clean up names | ||||
|  | ||||
| You want to convert all paths into lowercase and replace spaces with underscores | ||||
| before applying any rules. This can be achieved using the `>>` arrow heads. | ||||
|  | ||||
| ``` | ||||
| (.*) -re->> "{g1.lower().replace(' ', '_')}" | ||||
|  | ||||
| <other rules go here> | ||||
| ``` | ||||
							
								
								
									
										89
									
								
								DEV.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										89
									
								
								DEV.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,89 @@ | ||||
| # PFERD Development Guide | ||||
|  | ||||
| PFERD is packaged following the [Python Packaging User Guide][ppug] (in | ||||
| particular [this][ppug-1] and [this][ppug-2] guide). | ||||
|  | ||||
| [ppug]: <https://packaging.python.org/> "Python Packaging User Guide" | ||||
| [ppug-1]: <https://packaging.python.org/tutorials/packaging-projects/> "Packaging Python Projects" | ||||
| [ppug-2]: <https://packaging.python.org/guides/distributing-packages-using-setuptools/> "Packaging and distributing projects" | ||||
|  | ||||
| ## Setting up a dev environment | ||||
|  | ||||
| The use of [venv][venv] is recommended. To initially set up a development | ||||
| environment, run these commands in the same directory as this file: | ||||
|  | ||||
| ``` | ||||
| $ python -m venv .venv | ||||
| $ . .venv/bin/activate | ||||
| $ ./scripts/setup | ||||
| ``` | ||||
|  | ||||
| The setup script installs a few required dependencies and tools. It also | ||||
| installs PFERD via `pip install --editable .`, which means that you can just run | ||||
| `pferd` as if it was installed normally. Since PFERD was installed with | ||||
| `--editable`, there is no need to re-run `pip install` when the source code is | ||||
| changed. | ||||
|  | ||||
| If you get any errors because pip can't update itself, try running | ||||
| `./scripts/setup --no-pip` instead of `./scripts/setup`. | ||||
|  | ||||
| For more details, see [this part of the Python Tutorial][venv-tut] and | ||||
| [this section on "development mode"][ppug-dev]. | ||||
|  | ||||
| [venv]: <https://docs.python.org/3/library/venv.html> "venv - Creation of virtual environments" | ||||
| [venv-tut]: <https://docs.python.org/3/tutorial/venv.html> "12. Virtual Environments and Packages" | ||||
| [ppug-dev]: <https://packaging.python.org/guides/distributing-packages-using-setuptools/#working-in-development-mode> "Working in “development mode”" | ||||
|  | ||||
| ## Checking and formatting the code | ||||
|  | ||||
| To run a set of checks against the code, run `./scripts/check` in the repo's | ||||
| root directory. This script will run a few tools installed by `./scripts/setup` | ||||
| against the entire project. | ||||
|  | ||||
| To format the code, run `./scripts/format` in the repo's root directory. | ||||
|  | ||||
| Before committing changes, please make sure the checks return no warnings and | ||||
| the code is formatted. | ||||
|  | ||||
| ## Contributing | ||||
|  | ||||
| When submitting a PR that adds, changes or removes a feature, please ensure | ||||
| that the corresponding documentation is updated as well. Also, please ensure | ||||
| that `./scripts/check` returns no warnings and the code has been run through | ||||
| `./scripts/format`. | ||||
|  | ||||
| In your first PR, please add your name to the `LICENSE` file. | ||||
|  | ||||
| ## Releasing a new version | ||||
|  | ||||
| This section describes the steps required to release a new version of PFERD. | ||||
| Usually, they don't need to be performed manually and `scripts/bump-version` can be | ||||
| used instead. | ||||
|  | ||||
| 1. Update the version number in `PFERD/version.py` | ||||
| 2. Update `CHANGELOG.md` | ||||
| 3. Commit changes to `master` with message `Bump version to <version>` (e. g. `Bump version to 3.2.5`) | ||||
| 4. Create annotated tag named `v<version>` (e. g. `v3.2.5`) | ||||
|     - Copy changes from changelog | ||||
|     - Remove `#` symbols (which git would interpret as comments) | ||||
|     - As the first line, add `Version <version> - <date>` (e. g. `Version 3.2.5 - 2021-05-24`) | ||||
|     - Leave the second line empty | ||||
| 5. Fast-forward `latest` to `master` | ||||
| 6. Push `master`, `latest` and the new tag | ||||
|  | ||||
| Example tag annotation: | ||||
| ``` | ||||
| Version 3.2.5 - 2021-05-24 | ||||
|  | ||||
| Added | ||||
| - Support for concurrent downloads | ||||
| - Support for proper config files | ||||
| - This changelog | ||||
|  | ||||
| Changed | ||||
| - Rewrote almost everything | ||||
| - Redesigned CLI | ||||
|  | ||||
| Removed | ||||
| - Backwards compatibility with 2.x | ||||
| ``` | ||||
							
								
								
									
										3
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								LICENSE
									
									
									
									
									
								
							| @@ -1,4 +1,5 @@ | ||||
| Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw | ||||
| Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, | ||||
|                     TheChristophe, Scriptim, thelukasprobst, Toorero | ||||
|  | ||||
| Permission is hereby granted, free of charge, to any person obtaining a copy of | ||||
| this software and associated documentation files (the "Software"), to deal in | ||||
|   | ||||
| @@ -1,8 +0,0 @@ | ||||
| # pylint: disable=invalid-name | ||||
|  | ||||
| """ | ||||
| This module exports only what you need for a basic configuration. If you want a | ||||
| more complex configuration, you need to import the other submodules manually. | ||||
| """ | ||||
|  | ||||
| from .pferd import Pferd | ||||
|   | ||||
							
								
								
									
										172
									
								
								PFERD/__main__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										172
									
								
								PFERD/__main__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,172 @@ | ||||
| import argparse | ||||
| import asyncio | ||||
| import configparser | ||||
| import os | ||||
| import sys | ||||
| from pathlib import Path | ||||
|  | ||||
| from PFERD.update import check_for_updates | ||||
|  | ||||
| from .auth import AuthLoadError | ||||
| from .cli import PARSER, ParserLoadError, load_default_section | ||||
| from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError | ||||
| from .logging import log | ||||
| from .pferd import Pferd, PferdLoadError | ||||
| from .transformer import RuleParseError | ||||
|  | ||||
|  | ||||
def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser:
    """
    Build the raw ConfigParser for this invocation.

    Without a CLI command the parser is filled from the config file at
    args.config. Otherwise the callable stored in args.command is invoked with
    the arguments and the parser. In both cases load_default_section() is
    called on the parser before it is returned.
    """

    log.explain_topic("Loading config")
    config_parser = configparser.ConfigParser(interpolation=None)

    if args.command is not None:
        log.explain("CLI command specified, loading config from its arguments")
        # A falsy (but non-None) command is deliberately not called.
        if args.command:
            args.command(args, config_parser)
    else:
        log.explain("No CLI command specified, loading config from file")
        Config.load_parser(config_parser, path=args.config)

    load_default_section(args, config_parser)

    return config_parser
|  | ||||
|  | ||||
def load_config(args: argparse.Namespace) -> Config:
    """
    Load the Config for the given CLI arguments, exiting on failure.

    A ConfigLoadError is logged together with its reason, a ParserLoadError is
    logged on its own. Either one terminates the process with exit code 1.
    """

    try:
        raw_parser = load_config_parser(args)
        return Config(raw_parser)
    except ConfigLoadError as err:
        log.error(str(err))
        log.error_contd(err.reason)
        sys.exit(1)
    except ParserLoadError as err:
        log.error(str(err))
        sys.exit(1)
|  | ||||
|  | ||||
def configure_logging_from_args(args: argparse.Namespace) -> None:
    """
    Apply the logging switches given on the command line.

    Each of args.explain, args.status and args.report that is not None is
    copied to the matching log.output_* attribute. When the config is dumped
    to stdout ("-"), all three outputs are switched off afterwards.
    """

    for option in ("explain", "status", "report"):
        cli_value = getattr(args, option)
        if cli_value is not None:
            setattr(log, f"output_{option}", cli_value)

    # We want to prevent any unnecessary output if we're printing the config to
    # stdout, otherwise it would not be a valid config file.
    if args.dump_config_to == "-":
        log.output_explain = log.output_status = log.output_report = False
|  | ||||
|  | ||||
def configure_logging_from_config(args: argparse.Namespace, config: Config) -> None:
    """
    Apply logging settings from the config for everything the CLI left unset.

    For each of explain/status/report whose CLI argument is None, the value of
    the same-named accessor on config.default_section is stored in the
    matching log.output_* attribute. A ConfigOptionError is logged and ends
    the process with exit code 1.
    """

    # In configure_logging_from_args(), all normal logging is already disabled
    # whenever we dump the config. We don't want to override that decision with
    # values from the config file.
    if args.dump_config_to == "-":
        return

    try:
        for option in ("explain", "status", "report"):
            if getattr(args, option) is None:
                read_option = getattr(config.default_section, option)
                setattr(log, f"output_{option}", read_option())
    except ConfigOptionError as err:
        log.error(str(err))
        sys.exit(1)
|  | ||||
|  | ||||
def dump_config(args: argparse.Namespace, config: Config) -> None:
    """
    Dump the config as requested by --dump-config or --dump-config-to.

    The two options are mutually exclusive; specifying both logs an error and
    exits with code 1. With --dump-config, config.dump() is called without a
    path. With --dump-config-to, "-" dumps to stdout and any other value is
    used as the target path. A ConfigDumpError is logged with its reason and
    exits with code 1.
    """

    log.explain_topic("Dumping config")

    both_given = args.dump_config and args.dump_config_to is not None
    if both_given:
        log.error("--dump-config and --dump-config-to can't be specified at the same time")
        sys.exit(1)

    try:
        if not args.dump_config:
            if args.dump_config_to == "-":
                config.dump_to_stdout()
            else:
                config.dump(Path(args.dump_config_to))
        else:
            config.dump()
    except ConfigDumpError as err:
        log.error(str(err))
        log.error_contd(err.reason)
        sys.exit(1)
|  | ||||
|  | ||||
async def _check_updates_and_run(pferd: "Pferd", args: argparse.Namespace) -> None:
    """
    Perform the update check (unless args.skip_update_check is set) and then
    await pferd.run() with the --debug-transforms setting.
    """

    log.explain_topic("Checking for updates")
    if args.skip_update_check:
        log.explain("Update check skipped due to configuration option")
    else:
        await check_for_updates()

    await pferd.run(args.debug_transforms)


def main() -> None:
    """
    Entry point of the command line interface.

    Parses the CLI arguments, configures logging, loads the config and either
    dumps it or runs PFERD. Every handled error is logged and ends the process
    with exit code 1.
    """

    args = PARSER.parse_args()

    # Configuring logging happens in two stages because CLI args have
    # precedence over config file options and loading the config already
    # produces some kinds of log messages (usually only explain()-s).
    configure_logging_from_args(args)

    config = load_config(args)

    # Now, after loading the config file, we can apply its logging settings in
    # all places that were not already covered by CLI args.
    configure_logging_from_config(args, config)

    if args.dump_config or args.dump_config_to is not None:
        dump_config(args, config)
        sys.exit()

    try:
        pferd = Pferd(config, args.crawler, args.skip)
    except PferdLoadError as e:
        log.unlock()
        log.error(str(e))
        sys.exit(1)

    try:
        # The update check and the actual run share one coroutine so that both
        # platforms behave the same. Previously the update check only ran in
        # the non-Windows branch.
        if os.name == "nt":
            # A "workaround" for the windows event loop somehow crashing after
            # asyncio.run() completes. See:
            # https://bugs.python.org/issue39232
            # https://github.com/encode/httpx/issues/914#issuecomment-780023632
            # TODO Fix this properly
            loop = asyncio.get_event_loop()
            loop.run_until_complete(_check_updates_and_run(pferd, args))
            loop.run_until_complete(asyncio.sleep(1))
            loop.close()
        else:
            asyncio.run(_check_updates_and_run(pferd, args))
    except (ConfigOptionError, AuthLoadError) as e:
        log.unlock()
        log.error(str(e))
        sys.exit(1)
    except RuleParseError as e:
        log.unlock()
        e.pretty_print()
        sys.exit(1)
    except KeyboardInterrupt:
        log.unlock()
        log.explain_topic("Interrupted, exiting immediately")
        log.explain("Open files and connections are left for the OS to clean up")
        pferd.print_report()
        # TODO Clean up tmp files
        # And when those files *do* actually get cleaned up properly,
        # reconsider if this should really exit with 1
        sys.exit(1)
    except Exception:
        # Last-resort boundary: report the unexpected error and still print
        # the report before exiting.
        log.unlock()
        log.unexpected_exception()
        pferd.print_report()
        sys.exit(1)
    else:
        pferd.print_report()


if __name__ == "__main__":
    main()
							
								
								
									
										29
									
								
								PFERD/auth/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								PFERD/auth/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| from configparser import SectionProxy | ||||
| from typing import Callable, Dict | ||||
|  | ||||
| from ..config import Config | ||||
| from .authenticator import Authenticator, AuthError, AuthLoadError, AuthSection  # noqa: F401 | ||||
| from .credential_file import CredentialFileAuthenticator, CredentialFileAuthSection | ||||
| from .keyring import KeyringAuthenticator, KeyringAuthSection | ||||
| from .pass_ import PassAuthenticator, PassAuthSection | ||||
| from .simple import SimpleAuthenticator, SimpleAuthSection | ||||
| from .tfa import TfaAuthenticator | ||||
|  | ||||
# Signature shared by every authenticator factory registered in AUTHENTICATORS.
AuthConstructor = Callable[
    [
        str,           # Name (without the "auth:" prefix)
        SectionProxy,  # Authenticator's section of global config
        Config,        # Global config
    ],
    Authenticator,
]

# Maps the value of an auth section's `type` option to a factory building the
# matching authenticator. Every factory accepts the same three arguments even
# if the authenticator it builds only needs some of them.
AUTHENTICATORS: Dict[str, AuthConstructor] = {
    "credential-file": lambda name, section, config: CredentialFileAuthenticator(
        name, CredentialFileAuthSection(section), config
    ),
    "keyring": lambda name, section, config: KeyringAuthenticator(
        name, KeyringAuthSection(section)
    ),
    "pass": lambda name, section, config: PassAuthenticator(
        name, PassAuthSection(section)
    ),
    "simple": lambda name, section, config: SimpleAuthenticator(
        name, SimpleAuthSection(section)
    ),
    "tfa": lambda name, section, config: TfaAuthenticator(name),
}
							
								
								
									
										80
									
								
								PFERD/auth/authenticator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								PFERD/auth/authenticator.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | ||||
| from abc import ABC, abstractmethod | ||||
| from typing import Tuple | ||||
|  | ||||
| from ..config import Section | ||||
|  | ||||
|  | ||||
class AuthLoadError(Exception):
    """Error that constructing an authenticator may raise."""
|  | ||||
|  | ||||
class AuthError(Exception):
    """Error raised when an authenticator's credentials are reported invalid."""
|  | ||||
|  | ||||
class AuthSection(Section):
    """Config section describing a single authenticator."""

    def type(self) -> str:
        """
        Return the section's `type` option.

        If the option is absent, missing_value("type") is called.
        """

        auth_type = self.s.get("type")
        if auth_type is None:
            self.missing_value("type")
        return auth_type
|  | ||||
|  | ||||
class Authenticator(ABC):
    """
    Abstract base class of all authenticators.

    Subclasses implement credentials(); username() and password() are derived
    from it. The invalidate_*() methods raise an AuthError unless a subclass
    overrides them.
    """

    def __init__(self, name: str) -> None:
        """
        Initialize an authenticator from its name.

        If you are writing your own constructor for your own authenticator,
        make sure to call this constructor first (via super().__init__).

        May throw an AuthLoadError.
        """

        self.name = name

    @abstractmethod
    async def credentials(self) -> Tuple[str, str]:
        """
        Return the credentials as a (username, password) tuple.
        """

    async def username(self) -> str:
        """
        Return only the username part of credentials().
        """

        pair = await self.credentials()
        user, _ = pair
        return user

    async def password(self) -> str:
        """
        Return only the password part of credentials().
        """

        pair = await self.credentials()
        _, secret = pair
        return secret

    def invalidate_credentials(self) -> None:
        """
        Tell the authenticator that some or all of its credentials are invalid.

        Authenticators should override this function if they have a way to
        deal with this issue that is likely to result in valid credentials
        (e. g. prompting the user). This default implementation always raises
        an AuthError.
        """

        raise AuthError("Invalid credentials")

    def invalidate_username(self) -> None:
        """
        Tell the authenticator that specifically its username is invalid.

        Authenticators should override this function if they have a way to
        deal with this issue that is likely to result in valid credentials
        (e. g. prompting the user). This default implementation always raises
        an AuthError.
        """

        raise AuthError("Invalid username")

    def invalidate_password(self) -> None:
        """
        Tell the authenticator that specifically its password is invalid.

        Authenticators should override this function if they have a way to
        deal with this issue that is likely to result in valid credentials
        (e. g. prompting the user). This default implementation always raises
        an AuthError.
        """

        raise AuthError("Invalid password")
							
								
								
									
										46
									
								
								PFERD/auth/credential_file.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								PFERD/auth/credential_file.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,46 @@ | ||||
| from pathlib import Path | ||||
| from typing import Tuple | ||||
|  | ||||
| from ..config import Config | ||||
| from ..utils import fmt_real_path | ||||
| from .authenticator import Authenticator, AuthLoadError, AuthSection | ||||
|  | ||||
|  | ||||
class CredentialFileAuthSection(AuthSection):
    """Config section for the credential file authenticator."""

    def path(self) -> Path:
        """
        Return the section's "path" option as a Path.

        If the option is absent, the problem is reported via missing_value
        (presumably raising - confirm against Section).
        """

        if (raw_path := self.s.get("path")) is None:
            self.missing_value("path")
        return Path(raw_path)
|  | ||||
|  | ||||
class CredentialFileAuthenticator(Authenticator):
    """
    Authenticator that reads a fixed username and password from a file.

    The file must consist of exactly two lines: the first starting with
    "username=" and the second starting with "password=". Everything after
    the prefix is used verbatim as the respective value.
    """

    def __init__(self, name: str, section: CredentialFileAuthSection, config: Config) -> None:
        """
        Load the credentials from the file named in the section.

        The configured path is joined onto the default section's working
        directory.

        Raises an AuthLoadError if the file can't be opened, is not valid
        UTF-8 or does not have the expected two-line format.
        """

        super().__init__(name)

        path = config.default_section.working_dir() / section.path()
        try:
            with open(path, encoding="utf-8") as f:
                lines = list(f)
        except UnicodeDecodeError as e:
            # Keep the original error as the cause, just like the OSError
            # branch below does.
            raise AuthLoadError(
                f"Credential file at {fmt_real_path(path)} is not encoded using UTF-8"
            ) from e
        except OSError as e:
            raise AuthLoadError(f"No credential file at {fmt_real_path(path)}") from e

        if len(lines) != 2:
            raise AuthLoadError("Credential file must be two lines long")
        [uline, pline] = lines
        # The first of two lines always ends with a newline
        uline = uline[:-1]
        # The last line only ends with a newline if the file does
        if pline.endswith("\n"):
            pline = pline[:-1]

        if not uline.startswith("username="):
            raise AuthLoadError("First line must start with 'username='")
        if not pline.startswith("password="):
            raise AuthLoadError("Second line must start with 'password='")

        # Both prefixes are 9 characters long
        self._username = uline[9:]
        self._password = pline[9:]

    async def credentials(self) -> Tuple[str, str]:
        """
        Return the (username, password) pair read from the credential file.
        """

        return self._username, self._password
							
								
								
									
										65
									
								
								PFERD/auth/keyring.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										65
									
								
								PFERD/auth/keyring.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,65 @@ | ||||
| from typing import Optional, Tuple | ||||
|  | ||||
| import keyring | ||||
|  | ||||
| from ..logging import log | ||||
| from ..utils import agetpass, ainput | ||||
| from ..version import NAME | ||||
| from .authenticator import Authenticator, AuthError, AuthSection | ||||
|  | ||||
|  | ||||
class KeyringAuthSection(AuthSection):
    """Config section for the keyring authenticator."""

    def username(self) -> Optional[str]:
        """
        Return the section's "username" option as returned by the underlying
        section lookup.
        """

        configured = self.s.get("username")
        return configured

    def keyring_name(self) -> str:
        """
        Return the section's "keyring_name" option, falling back to NAME.
        """

        service = self.s.get("keyring_name", fallback=NAME)
        return service
|  | ||||
|  | ||||
class KeyringAuthenticator(Authenticator):
    """
    Authenticator that looks passwords up via the keyring module and prompts
    for (and stores) whatever is missing.
    """

    def __init__(self, name: str, section: KeyringAuthSection) -> None:
        """
        Set up the authenticator from its config section.

        A username given in the section is considered fixed and is never
        prompted for.
        """

        super().__init__(name)

        configured_username = section.username()

        self._username = configured_username
        self._username_fixed = configured_username is not None
        self._keyring_name = section.keyring_name()

        self._password: Optional[str] = None
        self._password_invalidated = False

    async def credentials(self) -> Tuple[str, str]:
        """
        Return the (username, password) pair, prompting where necessary.

        A password entered at the prompt is written to the keyring.
        """

        # Without a username there is nothing to look up, so ask for it first
        if self._username is None:
            async with log.exclusive_output():
                self._username = await ainput("Username: ")

        # An invalidated password must not be fetched from the keyring again,
        # otherwise the same rejected value would come right back
        may_query_keyring = not self._password_invalidated
        if may_query_keyring and self._password is None:
            self._password = keyring.get_password(self._keyring_name, self._username)

        # Still nothing: the keyring had no entry (or was skipped), so the
        # user has to enter the password, which is then stored
        if self._password is None:
            async with log.exclusive_output():
                self._password = await agetpass("Password: ")
                keyring.set_password(self._keyring_name, self._username, self._password)

        self._password_invalidated = False
        return self._username, self._password

    def invalidate_credentials(self) -> None:
        """
        Forget the password and, unless it was configured, the username.
        """

        if not self._username_fixed:
            self.invalidate_username()
        self.invalidate_password()

    def invalidate_username(self) -> None:
        """
        Forget the username so it is prompted for again.

        Raises an AuthError if the username came from the config.
        """

        if self._username_fixed:
            raise AuthError("Configured username is invalid")
        self._username = None

    def invalidate_password(self) -> None:
        """
        Forget the password and make the next credentials() call skip the
        keyring lookup.
        """

        self._password_invalidated = True
        self._password = None
							
								
								
									
										98
									
								
								PFERD/auth/pass_.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								PFERD/auth/pass_.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,98 @@ | ||||
| import re | ||||
| import subprocess | ||||
| from typing import List, Tuple | ||||
|  | ||||
| from ..logging import log | ||||
| from .authenticator import Authenticator, AuthError, AuthSection | ||||
|  | ||||
|  | ||||
class PassAuthSection(AuthSection):
    """Config section for the pass authenticator."""

    def passname(self) -> str:
        """
        Return the section's "passname" option.

        If the option is absent, the problem is reported via missing_value
        (presumably raising - confirm against Section).
        """

        if (value := self.s.get("passname")) is None:
            self.missing_value("passname")
        return value

    def username_prefixes(self) -> List[str]:
        """
        Return the lowercased prefixes that mark a username line.
        """

        return self._prefixes("username_prefixes", "login,username,user")

    def password_prefixes(self) -> List[str]:
        """
        Return the lowercased prefixes that mark a password line.
        """

        return self._prefixes("password_prefixes", "password,pass,secret")

    def _prefixes(self, key: str, default: str) -> List[str]:
        """
        Read a comma-separated option and return its entries lowercased.

        Whitespace around entries is removed and empty entries are dropped, so
        that "login, user" works the same as "login,user". Neither a padded
        nor an empty prefix could ever match a line parsed by the
        authenticator, since parsed prefixes consist of letters only.
        """

        value = self.s.get(key, default)
        stripped = (prefix.strip() for prefix in value.split(","))
        return [prefix.lower() for prefix in stripped if prefix]
|  | ||||
|  | ||||
class PassAuthenticator(Authenticator):
    """
    Authenticator that obtains credentials from the output of "pass show".

    Lines of the form "<prefix>: <value>" are matched against the configured
    username and password prefixes. Values that can't be found this way are
    taken from the remaining unprefixed lines: password first, then username.
    """

    PREFIXED_LINE_RE = r"([a-zA-Z]+):\s?(.*)"  # to be used with fullmatch

    def __init__(self, name: str, section: PassAuthSection) -> None:
        """
        Set up the authenticator from its config section.
        """

        super().__init__(name)

        self._passname = section.passname()
        self._username_prefixes = section.username_prefixes()
        self._password_prefixes = section.password_prefixes()

    async def credentials(self) -> Tuple[str, str]:
        """
        Run "pass show" and extract a (username, password) pair from its
        output.

        Raises an AuthError if pass can't be run or fails, if a prefix occurs
        more than once, or if username or password can't be determined.

        NOTE(review): The explain output below contains the credentials in
        clear text.
        """

        log.explain_topic("Obtaining credentials from pass")

        try:
            log.explain(f"Calling 'pass show {self._passname}'")
            result = subprocess.check_output(["pass", "show", self._passname], text=True)
        except subprocess.CalledProcessError as e:
            raise AuthError(f"Failed to get password info from {self._passname}: {e}") from e
        except OSError as e:
            # The process could not even be started, e. g. because the "pass"
            # executable is not installed. Report this like any other
            # authentication problem instead of leaking the raw OSError.
            raise AuthError(f"Failed to run 'pass': {e}") from e

        prefixed = {}
        unprefixed = []
        for line in result.strip().splitlines():
            if match := re.fullmatch(self.PREFIXED_LINE_RE, line):
                prefix = match.group(1).lower()
                value = match.group(2)
                log.explain(f"Found prefixed line {line!r} with prefix {prefix!r}, value {value!r}")
                if prefix in prefixed:
                    raise AuthError(f"Prefix {prefix} specified multiple times")
                prefixed[prefix] = value
            else:
                log.explain(f"Found unprefixed line {line!r}")
                unprefixed.append(line)

        # The first configured prefix that is present wins
        username = None
        for prefix in self._username_prefixes:
            log.explain(f"Looking for username at prefix {prefix!r}")
            if prefix in prefixed:
                username = prefixed[prefix]
                log.explain(f"Found username {username!r}")
                break

        password = None
        for prefix in self._password_prefixes:
            log.explain(f"Looking for password at prefix {prefix!r}")
            if prefix in prefixed:
                password = prefixed[prefix]
                log.explain(f"Found password {password!r}")
                break

        if password is None and username is None:
            log.explain("No username and password found so far")
            log.explain("Using first unprefixed line as password")
            log.explain("Using second unprefixed line as username")
        elif password is None:
            log.explain("No password found so far")
            log.explain("Using first unprefixed line as password")
        elif username is None:
            log.explain("No username found so far")
            log.explain("Using first unprefixed line as username")

        # The password is taken before the username so that it gets the first
        # unprefixed line if both are missing
        if password is None:
            if not unprefixed:
                log.explain("Not enough unprefixed lines left")
                raise AuthError("Password could not be determined")
            password = unprefixed.pop(0)
            log.explain(f"Found password {password!r}")

        if username is None:
            if not unprefixed:
                log.explain("Not enough unprefixed lines left")
                raise AuthError("Username could not be determined")
            username = unprefixed.pop(0)
            log.explain(f"Found username {username!r}")

        return username, password
							
								
								
									
										62
									
								
								PFERD/auth/simple.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								PFERD/auth/simple.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,62 @@ | ||||
| from typing import Optional, Tuple | ||||
|  | ||||
| from ..logging import log | ||||
| from ..utils import agetpass, ainput | ||||
| from .authenticator import Authenticator, AuthError, AuthSection | ||||
|  | ||||
|  | ||||
class SimpleAuthSection(AuthSection):
    """Config section for the simple authenticator."""

    def username(self) -> Optional[str]:
        """
        Return the section's "username" option as returned by the underlying
        section lookup.
        """

        configured = self.s.get("username")
        return configured

    def password(self) -> Optional[str]:
        """
        Return the section's "password" option as returned by the underlying
        section lookup.
        """

        configured = self.s.get("password")
        return configured
|  | ||||
|  | ||||
class SimpleAuthenticator(Authenticator):
    """
    Authenticator that uses the username and password from its config section
    and prompts the user for whichever of the two is not configured.
    """

    def __init__(self, name: str, section: SimpleAuthSection) -> None:
        """
        Set up the authenticator from its config section.

        Values given in the section are considered fixed: They are never
        prompted for and can't be invalidated.
        """

        super().__init__(name)

        self._username = section.username()
        self._password = section.password()

        # These must look at the stored values. self.username and
        # self.password are the inherited coroutine methods and thus never
        # None, which would mark prompted values as fixed too.
        self._username_fixed = self._username is not None
        self._password_fixed = self._password is not None

    async def credentials(self) -> Tuple[str, str]:
        """
        Return the (username, password) pair, prompting for missing values.

        Prompted values are remembered until they are invalidated.
        """

        if self._username is not None and self._password is not None:
            return self._username, self._password

        async with log.exclusive_output():
            if self._username is None:
                self._username = await ainput("Username: ")
            else:
                # Show which user the password prompt below is for
                print(f"Username: {self._username}")

            if self._password is None:
                self._password = await agetpass("Password: ")

            # Intentionally returned inside the context manager so we know
            # they're both not None
            return self._username, self._password

    def invalidate_credentials(self) -> None:
        """
        Forget all values that were not configured.

        Raises an AuthError if both username and password came from the
        config, since there is nothing left to prompt for.
        """

        if self._username_fixed and self._password_fixed:
            raise AuthError("Configured credentials are invalid")

        if not self._username_fixed:
            self._username = None
        if not self._password_fixed:
            self._password = None

    def invalidate_username(self) -> None:
        """
        Forget the username so it is prompted for again.

        Raises an AuthError if the username came from the config.
        """

        if self._username_fixed:
            raise AuthError("Configured username is invalid")
        else:
            self._username = None

    def invalidate_password(self) -> None:
        """
        Forget the password so it is prompted for again.

        Raises an AuthError if the password came from the config.
        """

        if self._password_fixed:
            raise AuthError("Configured password is invalid")
        else:
            self._password = None
							
								
								
									
										30
									
								
								PFERD/auth/tfa.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								PFERD/auth/tfa.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | ||||
| from typing import Tuple | ||||
|  | ||||
| from ..logging import log | ||||
| from ..utils import ainput | ||||
| from .authenticator import Authenticator, AuthError | ||||
|  | ||||
|  | ||||
class TfaAuthenticator(Authenticator):
    """
    Authenticator for TFA codes. It only provides a "password" (the code),
    which the user is asked for on every call.
    """

    def __init__(self, name: str) -> None:
        """
        Set up the authenticator. There is nothing to configure.
        """

        super().__init__(name)

    async def credentials(self) -> Tuple[str, str]:
        """
        Always raises an AuthError, as there is no username.
        """

        raise AuthError("TFA authenticator does not support usernames")

    async def username(self) -> str:
        """
        Always raises an AuthError, as there is no username.
        """

        raise AuthError("TFA authenticator does not support usernames")

    async def password(self) -> str:
        """
        Prompt the user for a TFA code and return it.
        """

        async with log.exclusive_output():
            return await ainput("TFA code: ")

    def invalidate_credentials(self) -> None:
        """
        Do nothing: no code is stored that could be forgotten.
        """

    def invalidate_username(self) -> None:
        """
        Always raises an AuthError, as there is no username.
        """

        raise AuthError("TFA authenticator does not support usernames")

    def invalidate_password(self) -> None:
        """
        Do nothing: no code is stored that could be forgotten.
        """
| @@ -1,125 +0,0 @@ | ||||
| """ | ||||
| General authenticators useful in many situations | ||||
| """ | ||||
|  | ||||
| import getpass | ||||
| from typing import Optional, Tuple | ||||
|  | ||||
|  | ||||
class TfaAuthenticator:
    # pylint: disable=too-few-public-methods
    """
    Authenticator for TFA tokens. The user is asked every single time a token
    is requested; nothing is stored.
    """

    def __init__(self, reason: str):
        """
        Create a new tfa authenticator.

        Arguments:
            reason {str} -- shown to the user to explain why a token is needed
        """
        self._reason = reason

    def get_token(self) -> str:
        """
        Print the reason, then read the token from the user without echo and
        return it.
        """
        headline = f"Enter credentials ({self._reason})"
        print(headline)
        token = getpass.getpass("TFA Token: ")
        return token
|  | ||||
|  | ||||
class UserPassAuthenticator:
    """
    Authenticator for a username and password pair. Whatever was not passed
    to the constructor is requested from the user on demand.
    """

    def __init__(
            self,
            reason: str,
            username: Optional[str] = None,
            password: Optional[str] = None,
    ) -> None:
        """
        reason   - what the credentials are used for
        username - the username (if already known)
        password - the password (if already known)
        """

        self._reason = reason

        # The values originally passed in, kept so they can be restored after
        # an invalidation
        self._given_username = username
        self._given_password = password

        # The values currently in use
        self._username = username
        self._password = password

    def get_credentials(self) -> Tuple[str, str]:
        """
        Returns a tuple (username, password). Values still missing after
        falling back to the ones given to the constructor are requested from
        the user and remembered.
        """

        # Fall back to the given values. If those are None as well, this
        # leaves the current values at None.
        if self._username is None:
            self._username = self._given_username
        if self._password is None:
            self._password = self._given_password

        ask_username = self._username is None
        ask_password = self._password is None

        if ask_username or ask_password:
            print(f"Enter credentials ({self._reason})")

        if ask_username:
            self._username = input("Username: ")
        if ask_password:
            self._password = getpass.getpass(prompt="Password: ")

        return (self._username, self._password)

    @property
    def username(self) -> str:
        """
        The username. Accessing this property may cause the authenticator to
        prompt the user.
        """

        return self.get_credentials()[0]

    @property
    def password(self) -> str:
        """
        The password. Accessing this property may cause the authenticator to
        prompt the user.
        """

        return self.get_credentials()[1]

    def invalidate_credentials(self) -> None:
        """
        Marks the credentials as invalid.

        The current username and password are always forgotten. If exactly
        one of the two was given to the constructor, it is kept and will be
        used again, so only the other one is prompted for. If both were given,
        both are discarded and will be prompted for.
        """

        self._username = self._password = None

        both_given = self._given_username is not None and self._given_password is not None
        if both_given:
            self._given_username = self._given_password = None
							
								
								
									
										13
									
								
								PFERD/cli/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								PFERD/cli/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,13 @@ | ||||
| # isort: skip_file | ||||
|  | ||||
| # The order of imports matters because each command module registers itself | ||||
| # with the parser from ".parser" and the import order affects the order in | ||||
| # which they appear in the help. Because of this, isort is disabled for this | ||||
| # file. Also, since we're reexporting or just using the side effect of | ||||
| # importing itself, we get a few linting warnings, which we're disabling as | ||||
| # well. | ||||
|  | ||||
| from . import command_local  # noqa: F401 imported but unused | ||||
| from . import command_kit_ilias_web  # noqa: F401 imported but unused | ||||
| from . import command_kit_ipd  # noqa: F401 imported but unused | ||||
| from .parser import PARSER, ParserLoadError, load_default_section  # noqa: F401 imported but unused | ||||
							
								
								
									
										120
									
								
								PFERD/cli/command_kit_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										120
									
								
								PFERD/cli/command_kit_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,120 @@ | ||||
| import argparse | ||||
| import configparser | ||||
| from pathlib import Path | ||||
|  | ||||
| from ..crawl.ilias.file_templates import Links | ||||
| from ..logging import log | ||||
| from .parser import (CRAWLER_PARSER, SUBPARSERS, BooleanOptionalAction, ParserLoadError, load_crawler, | ||||
|                      show_value_error) | ||||
|  | ||||
# The "kit-ilias-web" subcommand. It also accepts everything CRAWLER_PARSER
# defines.
SUBPARSER = SUBPARSERS.add_parser(
    "kit-ilias-web",
    parents=[CRAWLER_PARSER],
)

GROUP = SUBPARSER.add_argument_group(
    title="kit-ilias-web crawler arguments",
    description="arguments for the 'kit-ilias-web' crawler",
)

# Positional arguments
GROUP.add_argument(
    "target",
    type=str,
    metavar="TARGET",
    help="course id, 'desktop', or ILIAS URL to crawl"
)
GROUP.add_argument(
    "output",
    type=Path,
    metavar="OUTPUT",
    help="output directory"
)

# Authentication options
GROUP.add_argument(
    "--username", "-u",
    type=str,
    metavar="USERNAME",
    help="user name for authentication"
)
GROUP.add_argument(
    "--keyring",
    action=BooleanOptionalAction,
    help="use the system keyring to store and retrieve passwords"
)
GROUP.add_argument(
    "--credential-file",
    type=Path,
    metavar="PATH",
    help="read username and password from a credential file"
)

# Crawler behaviour options. None of them sets a default here, so an option
# that was not passed shows up as None.
GROUP.add_argument(
    "--links",
    type=show_value_error(Links.from_string),
    metavar="OPTION",
    help="how to represent external links"
)
GROUP.add_argument(
    "--link-redirect-delay",
    type=int,
    metavar="SECONDS",
    help="time before 'fancy' links redirect to their target (-1 to disable)"
)
GROUP.add_argument(
    "--videos",
    action=BooleanOptionalAction,
    help="crawl and download videos"
)
GROUP.add_argument(
    "--forums",
    action=BooleanOptionalAction,
    help="crawl and download forum posts"
)
GROUP.add_argument(
    "--http-timeout", "-t",
    type=float,
    metavar="SECONDS",
    help="timeout for all HTTP requests"
)
|  | ||||
|  | ||||
def load(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """
    Translate the arguments of the 'kit-ilias-web' command into config
    sections.

    Creates the sections "crawl:ilias" and "auth:ilias" in the parser. Options
    that were not specified on the command line are left out of the config.

    Raises a ParserLoadError if --credential-file is combined with --username
    or --keyring.
    """

    log.explain("Creating config for command 'kit-ilias-web'")

    # Crawler section
    parser["crawl:ilias"] = {}
    crawl = parser["crawl:ilias"]
    load_crawler(args, crawl)

    crawl["type"] = "kit-ilias-web"
    crawl["target"] = str(args.target)
    crawl["output_dir"] = str(args.output)
    crawl["auth"] = "auth:ilias"

    links = args.links
    if links is not None:
        crawl["links"] = str(links.value)

    redirect_delay = args.link_redirect_delay
    if redirect_delay is not None:
        crawl["link_redirect_delay"] = str(redirect_delay)

    # Tri-state flags: None means the flag was not passed at all
    for flag in ("videos", "forums"):
        enabled = getattr(args, flag)
        if enabled is not None:
            crawl[flag] = "yes" if enabled else "no"

    timeout = args.http_timeout
    if timeout is not None:
        crawl["http_timeout"] = str(timeout)

    # Authenticator section
    parser["auth:ilias"] = {}
    auth = parser["auth:ilias"]

    username = args.username
    use_credential_file = args.credential_file is not None

    if use_credential_file and username is not None:
        raise ParserLoadError("--credential-file and --username can't be used together")
    if use_credential_file and args.keyring:
        raise ParserLoadError("--credential-file and --keyring can't be used together")

    if use_credential_file:
        auth["type"] = "credential-file"
        auth["path"] = str(args.credential_file)
    elif args.keyring:
        auth["type"] = "keyring"
    else:
        auth["type"] = "simple"

    if username is not None:
        auth["username"] = username


# Register load() as the handler of this subcommand
SUBPARSER.set_defaults(command=load)
							
								
								
									
										54
									
								
								PFERD/cli/command_kit_ipd.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								PFERD/cli/command_kit_ipd.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,54 @@ | ||||
| import argparse | ||||
| import configparser | ||||
| from pathlib import Path | ||||
|  | ||||
| from ..logging import log | ||||
| from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||
|  | ||||
# The "kit-ipd" subcommand. It also accepts everything CRAWLER_PARSER defines.
SUBPARSER = SUBPARSERS.add_parser("kit-ipd", parents=[CRAWLER_PARSER])

GROUP = SUBPARSER.add_argument_group(
    title="kit ipd crawler arguments",
    description="arguments for the 'kit-ipd' crawler",
)

# Optional: restrict which links count as downloadable files
GROUP.add_argument(
    "--link-regex",
    type=str,
    metavar="REGEX",
    help="href-matching regex to identify downloadable files",
)

# Positional arguments
GROUP.add_argument("target", type=str, metavar="TARGET", help="url to crawl")
GROUP.add_argument("output", type=Path, metavar="OUTPUT", help="output directory")
|  | ||||
|  | ||||
def load(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """
    Translate the arguments of the 'kit-ipd' command into the config section
    "crawl:kit-ipd".

    The link regex is only written to the config if a non-empty one was
    specified.
    """

    log.explain("Creating config for command 'kit-ipd'")

    parser["crawl:kit-ipd"] = {}
    crawl = parser["crawl:kit-ipd"]
    load_crawler(args, crawl)

    crawl["type"] = "kit-ipd"
    crawl["target"] = str(args.target)
    crawl["output_dir"] = str(args.output)

    link_regex = args.link_regex
    if link_regex:
        crawl["link_regex"] = str(link_regex)


# Register load() as the handler of this subcommand
SUBPARSER.set_defaults(command=load)
							
								
								
									
										70
									
								
								PFERD/cli/command_local.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										70
									
								
								PFERD/cli/command_local.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,70 @@ | ||||
| import argparse | ||||
| import configparser | ||||
| from pathlib import Path | ||||
|  | ||||
| from ..logging import log | ||||
| from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||
|  | ||||
# The "local" subcommand. It also accepts everything CRAWLER_PARSER defines.
SUBPARSER = SUBPARSERS.add_parser("local", parents=[CRAWLER_PARSER])

GROUP = SUBPARSER.add_argument_group(
    title="local crawler arguments",
    description="arguments for the 'local' crawler",
)

# Positional arguments
GROUP.add_argument("target", type=Path, metavar="TARGET", help="directory to crawl")
GROUP.add_argument("output", type=Path, metavar="OUTPUT", help="output directory")

# Options for simulating a slow remote. None of them sets a default here, so
# an option that was not passed shows up as None.
GROUP.add_argument(
    "--crawl-delay",
    type=float,
    metavar="SECONDS",
    help="artificial delay to simulate for crawl requests",
)
GROUP.add_argument(
    "--download-delay",
    type=float,
    metavar="SECONDS",
    help="artificial delay to simulate for download requests",
)
GROUP.add_argument(
    "--download-speed",
    type=int,
    metavar="BYTES_PER_SECOND",
    help="download speed to simulate",
)
|  | ||||
|  | ||||
def load(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """
    Translate the arguments of the 'local' command into the config section
    "crawl:local".

    Options that were not specified on the command line are left out of the
    config.
    """

    log.explain("Creating config for command 'local'")

    parser["crawl:local"] = {}
    crawl = parser["crawl:local"]
    load_crawler(args, crawl)

    crawl["type"] = "local"
    crawl["target"] = str(args.target)
    crawl["output_dir"] = str(args.output)

    # The config keys are named exactly like the argparse attributes
    for option in ("crawl_delay", "download_delay", "download_speed"):
        value = getattr(args, option)
        if value is not None:
            crawl[option] = str(value)


# Register load() as the handler of this subcommand
SUBPARSER.set_defaults(command=load)
							
								
								
									
										243
									
								
								PFERD/cli/parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										243
									
								
								PFERD/cli/parser.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,243 @@ | ||||
| import argparse | ||||
| import configparser | ||||
| from argparse import ArgumentTypeError | ||||
| from pathlib import Path | ||||
| from typing import Any, Callable, List, Optional, Sequence, Union | ||||
|  | ||||
| from ..output_dir import OnConflict, Redownload | ||||
| from ..version import NAME, VERSION | ||||
|  | ||||
|  | ||||
class ParserLoadError(Exception):
    # NOTE(review): not raised anywhere in the visible part of this module;
    # presumably signals a failure while turning CLI arguments into config
    # values - confirm against the callers.
    pass
|  | ||||
|  | ||||
| # TODO Replace with argparse version when updating to 3.9? | ||||
class BooleanOptionalAction(argparse.Action):
    """
    Argparse action for a boolean flag that also has a negated form.

    Given the single option string "--name", both "--name" and "--no-name" are
    registered. The former stores True, the latter stores False. If neither is
    passed, the destination keeps its default.
    """

    def __init__(
            self,
            option_strings: List[str],
            dest: Any,
            default: Any = None,
            type: Any = None,
            choices: Any = None,
            required: Any = False,
            help: Any = None,
            metavar: Any = None,
    ):
        """
        Validate the option string and register it together with its negation.

        Raises a ValueError unless exactly one option string is given, it
        starts with "--" and it does not already start with "--no-".
        """
        if len(option_strings) != 1:
            raise ValueError("There must be exactly one option string")

        self.name = option_strings[0]
        if not self.name.startswith("--"):
            raise ValueError(f"{self.name!r} doesn't start with '--'")
        if self.name.startswith("--no-"):
            raise ValueError(f"{self.name!r} starts with '--no-'")

        positive = self.name
        negative = f"--no-{self.name[2:]}"

        super().__init__(
            [positive, negative],
            dest,
            nargs=0,
            default=default,
            type=type,
            choices=choices,
            required=required,
            help=help,
            metavar=metavar,
        )

    def __call__(
            self,
            parser: argparse.ArgumentParser,
            namespace: argparse.Namespace,
            values: Union[str, Sequence[Any], None],
            option_string: Optional[str] = None,
    ) -> None:
        """
        Store True for the plain option and False for its "--no-" variant.
        """
        if not option_string or option_string not in self.option_strings:
            return
        negated = option_string.startswith("--no-")
        setattr(namespace, self.dest, not negated)

    def format_usage(self) -> str:
        """
        Return the combined "--[no-]name" form of the option.
        """
        return f"--[no-]{self.name[2:]}"
|  | ||||
|  | ||||
def show_value_error(inner: Callable[[str], Any]) -> Callable[[str], Any]:
    """
    Some validation functions (like the from_string in our enums) raise a ValueError.
    Argparse only pretty-prints ArgumentTypeErrors though, so we need to wrap our ValueErrors.

    Returns a function that calls inner and converts any ValueError it raises
    into an ArgumentTypeError carrying the same message. The original
    ValueError is kept as the explicit cause of the new exception.
    """
    def wrapper(value: str) -> Any:
        try:
            return inner(value)
        except ValueError as e:
            # Pass the message (not the exception object) and chain explicitly
            # so that tracebacks show the conversion as intentional.
            raise ArgumentTypeError(str(e)) from e
    return wrapper
|  | ||||
|  | ||||
# Parser holding the options common to all crawlers. It is created without its
# own -h/--help option (add_help=False).
# NOTE(review): presumably used as a parent parser by the crawler subcommands -
# confirm against the command modules.
CRAWLER_PARSER = argparse.ArgumentParser(add_help=False)
CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group(
    title="general crawler arguments",
    description="arguments common to all crawlers",
)
# The enum parsers raise ValueError; show_value_error turns that into an
# ArgumentTypeError so argparse reports it as a normal usage error.
CRAWLER_PARSER_GROUP.add_argument(
    "--redownload", "-r",
    type=show_value_error(Redownload.from_string),
    metavar="OPTION",
    help="when to download a file that's already present locally"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--on-conflict",
    type=show_value_error(OnConflict.from_string),
    metavar="OPTION",
    help="what to do when local and remote files or directories differ"
)
# action="append" collects every occurrence into a list (None if never given).
CRAWLER_PARSER_GROUP.add_argument(
    "--transform", "-T",
    action="append",
    type=str,
    metavar="RULE",
    help="add a single transformation rule. Can be specified multiple times"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--tasks", "-n",
    type=int,
    metavar="N",
    help="maximum number of concurrent tasks (crawling, downloading)"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--downloads", "-N",
    type=int,
    metavar="N",
    help="maximum number of tasks that may download data at the same time"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--task-delay", "-d",
    type=float,
    metavar="SECONDS",
    help="time the crawler should wait between subsequent tasks"
)
# Registers both --windows-paths and --no-windows-paths.
CRAWLER_PARSER_GROUP.add_argument(
    "--windows-paths",
    action=BooleanOptionalAction,
    help="whether to repair invalid paths on windows"
)
|  | ||||
|  | ||||
def load_crawler(
        args: argparse.Namespace,
        section: configparser.SectionProxy,
) -> None:
    """
    Copy the general crawler options from the parsed CLI arguments into the
    given config section.

    Only arguments that are not None are written, so options that are missing
    on the command line never appear in the section.
    """
    # Enum-valued options are stored by their enum value.
    for key in ("redownload", "on_conflict"):
        choice = getattr(args, key)
        if choice is not None:
            section[key] = choice.value

    # Every rule goes on its own line, with a leading newline before the first.
    if args.transform is not None:
        section["transform"] = "\n" + "\n".join(args.transform)

    for key in ("tasks", "downloads", "task_delay"):
        number = getattr(args, key)
        if number is not None:
            section[key] = str(number)

    if args.windows_paths is not None:
        section["windows_paths"] = "yes" if args.windows_paths else "no"
|  | ||||
|  | ||||
# Top-level command line parser with the options that are not tied to a
# specific crawler.
PARSER = argparse.ArgumentParser()
# "command" stays None unless a subcommand sets it via set_defaults().
PARSER.set_defaults(command=None)
PARSER.add_argument(
    "--version",
    action="version",
    version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
)
PARSER.add_argument(
    "--skip-update-check",
    action="store_true",
    help="disable automatic update checks at startup"
)
PARSER.add_argument(
    "--config", "-c",
    type=Path,
    metavar="PATH",
    help="custom config file"
)
PARSER.add_argument(
    "--dump-config",
    action="store_true",
    help="dump current configuration to the default config path and exit"
)
# Kept as a plain string (no type=Path) because "-" is a special value.
PARSER.add_argument(
    "--dump-config-to",
    metavar="PATH",
    help="dump current configuration to a file and exit."
    " Use '-' as path to print to stdout instead"
)
PARSER.add_argument(
    "--debug-transforms",
    action="store_true",
    help="apply transform rules to files of previous run"
)
# --crawler and --skip may be repeated; argparse collects the names in a list.
PARSER.add_argument(
    "--crawler", "-C",
    action="append",
    type=str,
    metavar="NAME",
    help="only execute a single crawler."
    " Can be specified multiple times to execute multiple crawlers"
)
PARSER.add_argument(
    "--skip", "-S",
    action="append",
    type=str,
    metavar="NAME",
    help="don't execute this particular crawler."
    " Can be specified multiple times to skip multiple crawlers"
)
PARSER.add_argument(
    "--working-dir",
    type=Path,
    metavar="PATH",
    help="custom working directory"
)
# The following flags each also get a "--no-..." variant and default to None,
# which load_default_section() treats as "not specified".
PARSER.add_argument(
    "--explain",
    action=BooleanOptionalAction,
    help="log and explain in detail what PFERD is doing"
)
PARSER.add_argument(
    "--status",
    action=BooleanOptionalAction,
    help="print status updates while PFERD is crawling"
)
PARSER.add_argument(
    "--report",
    action=BooleanOptionalAction,
    help="print a report of all local changes before exiting"
)
PARSER.add_argument(
    "--share-cookies",
    action=BooleanOptionalAction,
    help="whether crawlers should share cookies where applicable"
)
|  | ||||
|  | ||||
def load_default_section(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """
    Copy the global options from the parsed CLI arguments into the default
    section of the config parser.

    Arguments that are None are skipped. Boolean flags are stored as "yes" or
    "no".
    """
    section = parser[parser.default_section]

    if args.working_dir is not None:
        section["working_dir"] = str(args.working_dir)

    for key in ("explain", "status", "report", "share_cookies"):
        flag = getattr(args, key)
        if flag is not None:
            section[key] = "yes" if flag else "no"
|  | ||||
|  | ||||
| SUBPARSERS = PARSER.add_subparsers(title="crawlers") | ||||
							
								
								
									
										190
									
								
								PFERD/config.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										190
									
								
								PFERD/config.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,190 @@ | ||||
| import asyncio | ||||
| import os | ||||
| import sys | ||||
| from configparser import ConfigParser, SectionProxy | ||||
| from pathlib import Path | ||||
| from typing import Any, List, NoReturn, Optional, Tuple | ||||
|  | ||||
| from rich.markup import escape | ||||
|  | ||||
| from .logging import log | ||||
| from .utils import fmt_real_path, prompt_yes_no | ||||
|  | ||||
|  | ||||
class ConfigLoadError(Exception):
    """
    Raised when the config could not be loaded from a file.

    The exception message only names the file. The cause is available
    separately through the "reason" attribute.
    """

    def __init__(self, path: Path, reason: str):
        message = f"Failed to load config from {fmt_real_path(path)}"
        super().__init__(message)
        self.path, self.reason = path, reason
|  | ||||
|  | ||||
class ConfigOptionError(Exception):
    """
    Raised when a config option is missing or has an invalid value.

    The message names the section and key followed by the description. All
    three parts are also kept as attributes.
    """

    def __init__(self, section: str, key: str, desc: str):
        message = f"Section {section!r}, key {key!r}: {desc}"
        super().__init__(message)
        self.section, self.key, self.desc = section, key, desc
|  | ||||
|  | ||||
class ConfigDumpError(Exception):
    """
    Raised when the config could not be dumped to a file.

    The exception message only names the file. The cause is available
    separately through the "reason" attribute.
    """

    def __init__(self, path: Path, reason: str):
        message = f"Failed to dump config to {fmt_real_path(path)}"
        super().__init__(message)
        self.path, self.reason = path, reason
|  | ||||
|  | ||||
class Section:
    """
    Base class for the crawler and auth section classes.

    Wraps a config section (available as "s") and offers helpers that report
    problems with individual options as ConfigOptionErrors.
    """

    def __init__(self, section: SectionProxy):
        self.s = section

    def error(self, key: str, desc: str) -> NoReturn:
        """
        Raise a ConfigOptionError for the given key of this section.
        """
        raise ConfigOptionError(self.s.name, key, desc)

    def invalid_value(
            self,
            key: str,
            value: Any,
            reason: Optional[str],
    ) -> NoReturn:
        """
        Report that the given key has an invalid value, optionally with a
        reason appended to the description.
        """
        desc = f"Invalid value {value!r}"
        if reason is not None:
            desc = f"{desc}: {reason}"
        self.error(key, desc)

    def missing_value(self, key: str) -> NoReturn:
        """
        Report that the given key has no value.
        """
        self.error(key, "Missing value")
|  | ||||
|  | ||||
class DefaultSection(Section):
    """
    Typed accessors for the options of the config's default section.
    """

    def working_dir(self) -> Path:
        """
        Return the "working_dir" option ("." if unset) with "~" expanded.
        """
        # TODO Change to working dir instead of manually prepending it to paths
        configured = self.s.get("working_dir", ".")
        return Path(configured).expanduser()

    def explain(self) -> bool:
        """
        Return the "explain" option; False if unset.
        """
        return self.s.getboolean("explain", fallback=False)

    def status(self) -> bool:
        """
        Return the "status" option; True if unset.
        """
        return self.s.getboolean("status", fallback=True)

    def report(self) -> bool:
        """
        Return the "report" option; True if unset.
        """
        return self.s.getboolean("report", fallback=True)

    def share_cookies(self) -> bool:
        """
        Return the "share_cookies" option; True if unset.
        """
        return self.s.getboolean("share_cookies", fallback=True)
|  | ||||
|  | ||||
class Config:
    """
    Wrapper around a ConfigParser that knows the default config location and
    can load, dump and query the configuration.
    """

    @staticmethod
    def _default_path() -> Path:
        """
        Return the default config file location for the current platform.
        """
        if os.name == "posix":
            location = "~/.config/PFERD/pferd.cfg"
        elif os.name == "nt":
            location = "~/AppData/Roaming/PFERD/pferd.cfg"
        else:
            location = "~/.pferd.cfg"
        return Path(location).expanduser()

    def __init__(self, parser: ConfigParser):
        self._parser = parser
        self._default_section = DefaultSection(parser[parser.default_section])

    @property
    def default_section(self) -> DefaultSection:
        return self._default_section

    @staticmethod
    def load_parser(parser: ConfigParser, path: Optional[Path] = None) -> None:
        """
        Read the config file at the given path (or at the default path if none
        is given) into the parser.

        May throw a ConfigLoadError.
        """

        if not path:
            log.explain("Using default path")
            path = Config._default_path()
        else:
            log.explain("Path specified on CLI")
        log.explain(f"Loading {fmt_real_path(path)}")

        # parser.read() silently skips missing files, so the file is opened
        # manually and handed to parser.read_file() instead.
        try:
            with open(path, encoding="utf-8") as config_file:
                parser.read_file(config_file, source=str(path))
        except FileNotFoundError:
            raise ConfigLoadError(path, "File does not exist")
        except IsADirectoryError:
            raise ConfigLoadError(path, "That's a directory, not a file")
        except PermissionError:
            raise ConfigLoadError(path, "Insufficient permissions")
        except UnicodeDecodeError:
            raise ConfigLoadError(path, "File is not encoded using UTF-8")

    def dump(self, path: Optional[Path] = None) -> None:
        """
        Write the config to the given path (or to the default path if none is
        given), asking for confirmation before overwriting an existing file.

        May throw a ConfigDumpError.
        """

        if not path:
            log.explain("Using default path")
            path = self._default_path()
        else:
            log.explain("Using custom path")

        log.explain(f"Dumping to {fmt_real_path(path)}")
        log.print(f"[bold bright_cyan]Dumping[/] to {escape(fmt_real_path(path))}")

        try:
            path.parent.mkdir(parents=True, exist_ok=True)
        except PermissionError:
            raise ConfigDumpError(path, "Could not create parent directory")

        try:
            # Never overwrite an existing file without asking: mode "x" opens
            # for exclusive creation and fails if the file already exists.
            try:
                with open(path, "x", encoding="utf-8") as config_file:
                    self._parser.write(config_file)
            except FileExistsError:
                print("That file already exists.")
                overwrite = asyncio.run(prompt_yes_no("Overwrite it?", default=False))
                if not overwrite:
                    raise ConfigDumpError(path, "File already exists")
                with open(path, "w", encoding="utf-8") as config_file:
                    self._parser.write(config_file)
        except IsADirectoryError:
            raise ConfigDumpError(path, "That's a directory, not a file")
        except PermissionError:
            raise ConfigDumpError(path, "Insufficient permissions")

    def dump_to_stdout(self) -> None:
        """
        Write the config to stdout.
        """
        self._parser.write(sys.stdout)

    def crawl_sections(self) -> List[Tuple[str, SectionProxy]]:
        """
        Return all (name, section) pairs whose name starts with "crawl:".
        """
        return [
            (name, proxy)
            for name, proxy in self._parser.items()
            if name.startswith("crawl:")
        ]

    def auth_sections(self) -> List[Tuple[str, SectionProxy]]:
        """
        Return all (name, section) pairs whose name starts with "auth:".
        """
        return [
            (name, proxy)
            for name, proxy in self._parser.items()
            if name.startswith("auth:")
        ]
| @@ -1,69 +0,0 @@ | ||||
| """A helper for requests cookies.""" | ||||
|  | ||||
| import logging | ||||
| from http.cookiejar import LoadError, LWPCookieJar | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
|  | ||||
| import requests | ||||
|  | ||||
| LOGGER = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
class CookieJar:
    """A persistable wrapper around an LWPCookieJar."""

    def __init__(self, cookie_file: Optional[Path] = None) -> None:
        """Set up the cookie jar, backed by the given file if there is one.

        Without a file the cookies only live in memory.
        """
        self._cookies: LWPCookieJar = (
            LWPCookieJar()
            if cookie_file is None
            else LWPCookieJar(str(cookie_file.resolve()))
        )

    @property
    def cookies(self) -> LWPCookieJar:
        """Return the underlying cookie jar."""
        return self._cookies

    def load_cookies(self) -> None:
        """Load the cookies from the backing file, if any.

        A missing or unreadable cookie file is only logged as a warning.
        """
        filename = self._cookies.filename
        if filename is None:
            return

        LOGGER.info("Loading old cookies from %s", filename)
        try:
            self._cookies.load(ignore_discard=True)
        except (FileNotFoundError, LoadError):
            LOGGER.warning(
                "No valid cookie file found at %s, continuing with no cookies",
                self._cookies.filename
            )

    def save_cookies(self, reason: Optional[str] = None) -> None:
        """Write the cookies to the backing file, if any.

        The optional reason is only used for the log message.
        """
        if self._cookies.filename is None:
            return

        if reason is not None:
            LOGGER.info("Saving cookies (%s)", reason)
        else:
            LOGGER.info("Saving cookies")

        # TODO figure out why ignore_discard is set
        # TODO possibly catch a few more exceptions
        self._cookies.save(ignore_discard=True)

    def create_session(self) -> requests.Session:
        """Create a new requests session that uses this cookie jar."""
        session = requests.Session()

        # From the request docs: "All requests code should work out of the box
        # with externally provided instances of CookieJar, e.g. LWPCookieJar
        # and FileCookieJar."
        session.cookies = self.cookies  # type: ignore

        return session
							
								
								
									
										25
									
								
								PFERD/crawl/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								PFERD/crawl/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | ||||
| from configparser import SectionProxy | ||||
| from typing import Callable, Dict | ||||
|  | ||||
| from ..auth import Authenticator | ||||
| from ..config import Config | ||||
| from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401 | ||||
| from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection | ||||
| from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection | ||||
| from .local_crawler import LocalCrawler, LocalCrawlerSection | ||||
|  | ||||
# Signature shared by all entries of CRAWLERS below: every constructor takes
# the same four arguments and returns a ready-to-use Crawler.
CrawlerConstructor = Callable[[
    str,                       # Name (without the "crawl:" prefix)
    SectionProxy,              # Crawler's section of global config
    Config,                    # Global config
    Dict[str, Authenticator],  # Loaded authenticators by name
], Crawler]

# Maps a crawler type name to its constructor. Each lambda wraps the raw
# section in the crawler's own section class; only "kit-ilias-web" passes the
# authenticators (a) on, the other crawlers ignore them.
CRAWLERS: Dict[str, CrawlerConstructor] = {
    "local": lambda n, s, c, a:
        LocalCrawler(n, LocalCrawlerSection(s), c),
    "kit-ilias-web": lambda n, s, c, a:
        KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
    "kit-ipd": lambda n, s, c, a:
        KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
}
							
								
								
									
										369
									
								
								PFERD/crawl/crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										369
									
								
								PFERD/crawl/crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,369 @@ | ||||
| import asyncio | ||||
| import os | ||||
| from abc import ABC, abstractmethod | ||||
| from collections.abc import Awaitable, Coroutine | ||||
| from datetime import datetime | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar | ||||
|  | ||||
| from ..auth import Authenticator | ||||
| from ..config import Config, Section | ||||
| from ..deduplicator import Deduplicator | ||||
| from ..limiter import Limiter | ||||
| from ..logging import ProgressBar, log | ||||
| from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload | ||||
| from ..report import MarkConflictError, MarkDuplicateError, Report | ||||
| from ..transformer import Transformer | ||||
| from ..utils import ReusableAsyncContextManager, fmt_path | ||||
|  | ||||
|  | ||||
class CrawlWarning(Exception):
    # Noncritical problem: the @noncritical/@anoncritical wrappers in this
    # module catch it, log it as a warning and record it in the crawler's
    # report instead of letting it propagate.
    pass
|  | ||||
|  | ||||
class CrawlError(Exception):
    # Unlike CrawlWarning, this is not swallowed by the @noncritical/
    # @anoncritical wrappers in this module: it takes their generic
    # "except Exception" path, is recorded as an error and re-raised.
    pass
|  | ||||
|  | ||||
| Wrapped = TypeVar("Wrapped", bound=Callable[..., None]) | ||||
|  | ||||
|  | ||||
def noncritical(f: Wrapped) -> Wrapped:
    """
    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    Noncritical exceptions (CrawlWarning, OutputDirError, MarkDuplicateError,
    MarkConflictError) are recorded as warnings and swallowed. Every other
    exception is recorded as an error and re-raised.

    Warning: Must only be applied to member functions of the Crawler class!
    """
    # Imported locally so the decorator does not depend on a file-level import.
    import functools

    # Preserve the decorated method's name and docstring on the wrapper.
    @functools.wraps(f)
    def wrapper(*args: Any, **kwargs: Any) -> None:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@noncritical must only be applied to Crawler methods")

        crawler = args[0]

        try:
            f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            crawler.report.add_warning(str(e))
            log.warn(str(e))
            crawler.error_free = False
        except Exception as e:
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

    return wrapper  # type: ignore
|  | ||||
|  | ||||
| AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]]) | ||||
|  | ||||
|  | ||||
def anoncritical(f: AWrapped) -> AWrapped:
    """
    An async version of @noncritical.

    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    Noncritical exceptions (CrawlWarning, OutputDirError, MarkDuplicateError,
    MarkConflictError) are recorded as warnings and swallowed; the wrapper then
    returns None. Every other exception is recorded as an error and re-raised.

    Warning: Must only be applied to member functions of the Crawler class!
    """
    # Imported locally so the decorator does not depend on a file-level import.
    import functools

    # Preserve the decorated method's name and docstring on the wrapper.
    @functools.wraps(f)
    async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@anoncritical must only be applied to Crawler methods")

        crawler = args[0]

        try:
            return await f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            log.warn(str(e))
            crawler.error_free = False
            crawler.report.add_warning(str(e))
        except Exception as e:
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

        return None

    return wrapper  # type: ignore
|  | ||||
|  | ||||
class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
    """
    Context manager token for crawling a single path.

    On entry (_on_aenter) it acquires a crawl slot from the limiter and
    returns a crawl progress bar for the path.
    """

    def __init__(self, limiter: Limiter, path: PurePath):
        super().__init__()

        self._limiter = limiter
        self._path = path

    @property
    def path(self) -> PurePath:
        return self._path

    async def _on_aenter(self) -> ProgressBar:
        # The order of these registrations matters.
        # NOTE(review): assuming self._stack behaves like an AsyncExitStack
        # (provided by the base class, not visible here), callbacks run in
        # reverse order on exit, so the "Crawled" status line would be printed
        # last, after the bar is closed and the limiter is released - confirm.
        self._stack.callback(lambda: log.status("[bold cyan]", "Crawled", fmt_path(self._path)))
        await self._stack.enter_async_context(self._limiter.limit_crawl())
        bar = self._stack.enter_context(log.crawl_bar("[bold bright_cyan]", "Crawling", fmt_path(self._path)))

        return bar
|  | ||||
|  | ||||
class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
    """
    Context manager token for downloading a single file.

    On entry (_on_aenter) it acquires a download slot from the limiter, enters
    the file sink token and returns a download progress bar together with the
    resulting file sink.
    """

    def __init__(self, limiter: Limiter, fs_token: FileSinkToken, path: PurePath):
        super().__init__()

        self._limiter = limiter
        self._fs_token = fs_token
        self._path = path

    @property
    def path(self) -> PurePath:
        return self._path

    async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
        # Acquire the download slot first, then the sink, then show the bar.
        # NOTE(review): assuming self._stack behaves like an AsyncExitStack,
        # these are released in reverse order on exit - confirm.
        await self._stack.enter_async_context(self._limiter.limit_download())
        sink = await self._stack.enter_async_context(self._fs_token)
        # The "Downloaded ..." message is printed in the output dir, not here
        bar = self._stack.enter_context(log.download_bar("[bold bright_cyan]", "Downloading",
                                                         fmt_path(self._path)))

        return bar, sink
|  | ||||
|  | ||||
class CrawlerSection(Section):
    """
    Typed accessors for the options common to all crawler config sections.

    Each method reads one option from the underlying section, applies its
    default and validates it. Missing or invalid values are reported through
    missing_value() and invalid_value(), i.e. as ConfigOptionErrors. This
    includes values that cannot be parsed as a number or boolean.
    """

    def type(self) -> str:
        """
        Return the mandatory "type" option.
        """
        value = self.s.get("type")
        if value is None:
            self.missing_value("type")
        return value

    def skip(self) -> bool:
        """
        Return the "skip" option; False if unset.
        """
        try:
            return self.s.getboolean("skip", fallback=False)
        except ValueError:
            # getboolean() raises a bare ValueError for unparseable values.
            self.invalid_value("skip", self.s.get("skip"), "Must be a boolean")

    def output_dir(self, name: str) -> Path:
        """
        Return the "output_dir" option with "~" expanded. If unset, the given
        name without its "crawl:" prefix is used.
        """
        # TODO Use removeprefix() after switching to 3.9
        if name.startswith("crawl:"):
            name = name[len("crawl:"):]
        return Path(self.s.get("output_dir", name)).expanduser()

    def redownload(self) -> Redownload:
        """
        Return the "redownload" option; "never-smart" if unset.
        """
        value = self.s.get("redownload", "never-smart")
        try:
            return Redownload.from_string(value)
        except ValueError as e:
            self.invalid_value(
                "redownload",
                value,
                str(e).capitalize(),
            )

    def on_conflict(self) -> OnConflict:
        """
        Return the "on_conflict" option; "prompt" if unset.
        """
        value = self.s.get("on_conflict", "prompt")
        try:
            return OnConflict.from_string(value)
        except ValueError as e:
            self.invalid_value(
                "on_conflict",
                value,
                str(e).capitalize(),
            )

    def transform(self) -> str:
        """
        Return the raw "transform" option; an empty string if unset.
        """
        return self.s.get("transform", "")

    def tasks(self) -> int:
        """
        Return the "tasks" option; 1 if unset. Must be greater than 0.
        """
        try:
            value = self.s.getint("tasks", fallback=1)
        except ValueError:
            # getint() raises a bare ValueError for unparseable values.
            self.invalid_value("tasks", self.s.get("tasks"), "Must be an integer")
        if value <= 0:
            self.invalid_value("tasks", value, "Must be greater than 0")
        return value

    def downloads(self) -> int:
        """
        Return the "downloads" option; the value of tasks() if unset. Must be
        greater than 0 and must not exceed tasks().
        """
        tasks = self.tasks()
        try:
            value = self.s.getint("downloads", fallback=None)
        except ValueError:
            self.invalid_value("downloads", self.s.get("downloads"), "Must be an integer")
        if value is None:
            return tasks
        if value <= 0:
            self.invalid_value("downloads", value, "Must be greater than 0")
        if value > tasks:
            self.invalid_value("downloads", value, "Must not be greater than tasks")
        return value

    def task_delay(self) -> float:
        """
        Return the "task_delay" option; 0.0 if unset. Must not be negative.
        """
        try:
            value = self.s.getfloat("task_delay", fallback=0.0)
        except ValueError:
            # getfloat() raises a bare ValueError for unparseable values.
            self.invalid_value("task_delay", self.s.get("task_delay"), "Must be a number")
        if value < 0:
            self.invalid_value("task_delay", value, "Must not be negative")
        return value

    def windows_paths(self) -> bool:
        """
        Return the "windows_paths" option. If unset, it is True exactly when
        running on Windows (os.name == "nt").
        """
        on_windows = os.name == "nt"
        try:
            return self.s.getboolean("windows_paths", fallback=on_windows)
        except ValueError:
            self.invalid_value("windows_paths", self.s.get("windows_paths"), "Must be a boolean")

    def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
        """
        Look up the authenticator named by the mandatory "auth" option in the
        given mapping.
        """
        value = self.s.get("auth")
        if value is None:
            self.missing_value("auth")
        auth = authenticators.get(value)
        if auth is None:
            self.invalid_value("auth", value, "No such auth section exists")
        return auth
|  | ||||
|  | ||||
| class Crawler(ABC): | ||||
|     def __init__( | ||||
|             self, | ||||
|             name: str, | ||||
|             section: CrawlerSection, | ||||
|             config: Config, | ||||
|     ) -> None: | ||||
|         """ | ||||
|         Initialize a crawler from its name and its section in the config file. | ||||
|  | ||||
|         If you are writing your own constructor for your own crawler, make sure | ||||
|         to call this constructor first (via super().__init__). | ||||
|  | ||||
|         May throw a CrawlerLoadException. | ||||
|         """ | ||||
|  | ||||
|         self.name = name | ||||
|         self.error_free = True | ||||
|  | ||||
|         self._limiter = Limiter( | ||||
|             task_limit=section.tasks(), | ||||
|             download_limit=section.downloads(), | ||||
|             task_delay=section.task_delay(), | ||||
|         ) | ||||
|  | ||||
|         self._deduplicator = Deduplicator(section.windows_paths()) | ||||
|         self._transformer = Transformer(section.transform()) | ||||
|  | ||||
|         self._output_dir = OutputDirectory( | ||||
|             config.default_section.working_dir() / section.output_dir(name), | ||||
|             section.redownload(), | ||||
|             section.on_conflict(), | ||||
|         ) | ||||
|  | ||||
|     @property | ||||
|     def report(self) -> Report: | ||||
|         return self._output_dir.report | ||||
|  | ||||
|     @property | ||||
|     def prev_report(self) -> Optional[Report]: | ||||
|         return self._output_dir.prev_report | ||||
|  | ||||
|     @staticmethod | ||||
|     async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]: | ||||
|         """ | ||||
|         Similar to asyncio.gather. However, in the case of an exception, all | ||||
|         still running tasks are cancelled and the exception is rethrown. | ||||
|  | ||||
|         This should always be preferred over asyncio.gather in crawler code so | ||||
|         that an exception like CrawlError may actually stop the crawler. | ||||
|         """ | ||||
|  | ||||
|         tasks = [asyncio.ensure_future(aw) for aw in awaitables] | ||||
|         result = asyncio.gather(*tasks) | ||||
|         try: | ||||
|             return await result | ||||
|         except:  # noqa: E722 | ||||
|             for task in tasks: | ||||
|                 task.cancel() | ||||
|             raise | ||||
|  | ||||
|     async def crawl(self, path: PurePath) -> Optional[CrawlToken]: | ||||
|         log.explain_topic(f"Decision: Crawl {fmt_path(path)}") | ||||
|         path = self._deduplicator.mark(path) | ||||
|         self._output_dir.report.found(path) | ||||
|  | ||||
|         if self._transformer.transform(path) is None: | ||||
|             log.explain("Answer: No") | ||||
|             log.status("[bold bright_black]", "Ignored", fmt_path(path)) | ||||
|             return None | ||||
|  | ||||
|         log.explain("Answer: Yes") | ||||
|         return CrawlToken(self._limiter, path) | ||||
|  | ||||
|     async def download( | ||||
|             self, | ||||
|             path: PurePath, | ||||
|             mtime: Optional[datetime] = None, | ||||
|             redownload: Optional[Redownload] = None, | ||||
|             on_conflict: Optional[OnConflict] = None, | ||||
|     ) -> Optional[DownloadToken]: | ||||
|         log.explain_topic(f"Decision: Download {fmt_path(path)}") | ||||
|         path = self._deduplicator.mark(path) | ||||
|         self._output_dir.report.found(path) | ||||
|  | ||||
|         transformed_path = self._transformer.transform(path) | ||||
|         if transformed_path is None: | ||||
|             log.explain("Answer: No") | ||||
|             log.status("[bold bright_black]", "Ignored", fmt_path(path)) | ||||
|             return None | ||||
|  | ||||
|         fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict) | ||||
|         if fs_token is None: | ||||
|             log.explain("Answer: No") | ||||
|             return None | ||||
|  | ||||
|         log.explain("Answer: Yes") | ||||
|         return DownloadToken(self._limiter, fs_token, path) | ||||
|  | ||||
|     async def _cleanup(self) -> None: | ||||
|         log.explain_topic("Decision: Clean up files") | ||||
|         if self.error_free: | ||||
|             log.explain("No warnings or errors occurred during this run") | ||||
|             log.explain("Answer: Yes") | ||||
|             await self._output_dir.cleanup() | ||||
|         else: | ||||
|             log.explain("Warnings or errors occurred during this run") | ||||
|             log.explain("Answer: No") | ||||
|  | ||||
|     @anoncritical | ||||
|     async def run(self) -> None: | ||||
|         """ | ||||
|         Start the crawling process. Call this function if you want to use a | ||||
|         crawler. | ||||
|         """ | ||||
|  | ||||
|         with log.show_progress(): | ||||
|             self._output_dir.prepare() | ||||
|             self._output_dir.load_prev_report() | ||||
|             await self._run() | ||||
|             await self._cleanup() | ||||
|             self._output_dir.store_report() | ||||
|  | ||||
|     @abstractmethod | ||||
|     async def _run(self) -> None: | ||||
|         """ | ||||
|         Overwrite this function if you are writing a crawler. | ||||
|  | ||||
|         This function must not return before all crawling is complete. To crawl | ||||
|         multiple things concurrently, use Crawler.gather instead of | ||||
|         asyncio.gather, so that an exception cancels the still running tasks. | ||||
|         """ | ||||
|  | ||||
|         pass | ||||
|  | ||||
|     def debug_transforms(self) -> None: | ||||
|         self._output_dir.load_prev_report() | ||||
|  | ||||
|         if not self.prev_report: | ||||
|             log.warn("Couldn't find or load old report") | ||||
|             return | ||||
|  | ||||
|         seen: Set[PurePath] = set() | ||||
|         for known in sorted(self.prev_report.found_paths): | ||||
|             looking_at = list(reversed(known.parents)) + [known] | ||||
|             for path in looking_at: | ||||
|                 if path in seen: | ||||
|                     continue | ||||
|  | ||||
|                 log.explain_topic(f"Transforming {fmt_path(path)}") | ||||
|                 self._transformer.transform(path) | ||||
|                 seen.add(path) | ||||
							
								
								
									
										199
									
								
								PFERD/crawl/http_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										199
									
								
								PFERD/crawl/http_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,199 @@ | ||||
| import asyncio | ||||
| import http.cookies | ||||
| import ssl | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Any, Dict, List, Optional | ||||
|  | ||||
| import aiohttp | ||||
| import certifi | ||||
| from aiohttp.client import ClientTimeout | ||||
|  | ||||
| from ..auth import Authenticator | ||||
| from ..config import Config | ||||
| from ..logging import log | ||||
| from ..utils import fmt_real_path | ||||
| from ..version import NAME, VERSION | ||||
| from .crawler import Crawler, CrawlerSection | ||||
|  | ||||
|  | ||||
| class HttpCrawlerSection(CrawlerSection): | ||||
|     def http_timeout(self) -> float: | ||||
|         return self.s.getfloat("http_timeout", fallback=20) | ||||
|  | ||||
|  | ||||
| class HttpCrawler(Crawler): | ||||
|     COOKIE_FILE = PurePath(".cookies") | ||||
|  | ||||
|     def __init__( | ||||
|             self, | ||||
|             name: str, | ||||
|             section: HttpCrawlerSection, | ||||
|             config: Config, | ||||
|             shared_auth: Optional[Authenticator] = None, | ||||
|     ) -> None: | ||||
|         super().__init__(name, section, config) | ||||
|  | ||||
|         self._authentication_id = 0 | ||||
|         self._authentication_lock = asyncio.Lock() | ||||
|         self._request_count = 0 | ||||
|         self._http_timeout = section.http_timeout() | ||||
|  | ||||
|         self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) | ||||
|         self._shared_cookie_jar_paths: Optional[List[Path]] = None | ||||
|         self._shared_auth = shared_auth | ||||
|  | ||||
|         self._output_dir.register_reserved(self.COOKIE_FILE) | ||||
|  | ||||
|     async def _current_auth_id(self) -> int: | ||||
|         """ | ||||
|         Returns the id for the current authentication, i.e. an identifier for the last | ||||
|         successful call to [authenticate]. | ||||
|  | ||||
|         This method must be called before any request that might authenticate is made, so the | ||||
|         HttpCrawler can properly track when [authenticate] can return early and when actual | ||||
|         authentication is necessary. | ||||
|         """ | ||||
|         # We acquire the lock here to ensure we wait for any concurrent authenticate to finish. | ||||
|         # This should reduce the amount of requests we make: If an authentication is in progress | ||||
|         # all future requests wait for authentication to complete. | ||||
|         async with self._authentication_lock: | ||||
|             self._request_count += 1 | ||||
|             return self._authentication_id | ||||
|  | ||||
|     async def authenticate(self, caller_auth_id: int) -> None: | ||||
|         """ | ||||
|         Starts the authentication process. The main work is offloaded to _authenticate, which | ||||
|         you should overwrite in a subclass if needed. This method should *NOT* be overwritten. | ||||
|  | ||||
|         The [caller_auth_id] should be the result of a [_current_auth_id] call made *before* | ||||
|         the request was made. This ensures that authentication is not performed needlessly. | ||||
|         """ | ||||
|         async with self._authentication_lock: | ||||
|             log.explain_topic("Authenticating") | ||||
|             # Another thread successfully called authenticate in-between | ||||
|             # We do not want to perform auth again, so we return here. We can | ||||
|             # assume the other thread suceeded as authenticate will throw an error | ||||
|             # if it failed and aborts the crawl process. | ||||
|             if caller_auth_id != self._authentication_id: | ||||
|                 log.explain( | ||||
|                     "Authentication skipped due to auth id mismatch." | ||||
|                     "A previous authentication beat us to the race." | ||||
|                 ) | ||||
|                 return | ||||
|             log.explain("Calling crawler-specific authenticate") | ||||
|             await self._authenticate() | ||||
|             self._authentication_id += 1 | ||||
|             # Saving the cookies after the first auth ensures we won't need to re-authenticate | ||||
|             # on the next run, should this one be aborted or crash | ||||
|             self._save_cookies() | ||||
|  | ||||
|     async def _authenticate(self) -> None: | ||||
|         """ | ||||
|         Performs authentication. This method must only return normally if authentication succeeded. | ||||
|         In all other cases it must either retry internally or throw a terminal exception. | ||||
|  | ||||
|         This default implementation always raises a RuntimeError. | ||||
|         """ | ||||
|         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation") | ||||
|  | ||||
|     def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None: | ||||
|         if not self._shared_auth: | ||||
|             return | ||||
|  | ||||
|         if self._shared_auth in shared: | ||||
|             self._shared_cookie_jar_paths = shared[self._shared_auth] | ||||
|         else: | ||||
|             self._shared_cookie_jar_paths = [] | ||||
|             shared[self._shared_auth] = self._shared_cookie_jar_paths | ||||
|  | ||||
|         self._shared_cookie_jar_paths.append(self._cookie_jar_path) | ||||
|  | ||||
|     def _load_cookies_from_file(self, path: Path) -> None: | ||||
|         """ | ||||
|         Read the UTF-8 encoded file at path line by line and add the cookies | ||||
|         found in its "Set-Cookie:" lines to self._cookie_jar. | ||||
|  | ||||
|         Lines that do not start with "Set-Cookie:" are skipped and mentioned | ||||
|         via log.explain. | ||||
|         """ | ||||
|         jar: Any = http.cookies.SimpleCookie() | ||||
|         with open(path, encoding="utf-8") as f: | ||||
|             # enumerate starts at 0, so the index in the message below is zero-based | ||||
|             for i, line in enumerate(f): | ||||
|                 # Names of headers are case insensitive | ||||
|                 if line[:11].lower() == "set-cookie:": | ||||
|                     jar.load(line[11:]) | ||||
|                 else: | ||||
|                     log.explain(f"Line {i} doesn't start with 'Set-Cookie:', ignoring it") | ||||
|         self._cookie_jar.update_cookies(jar) | ||||
|  | ||||
|     def _save_cookies_to_file(self, path: Path) -> None: | ||||
|         jar: Any = http.cookies.SimpleCookie() | ||||
|         for morsel in self._cookie_jar: | ||||
|             jar[morsel.key] = morsel | ||||
|         with open(path, "w", encoding="utf-8") as f: | ||||
|             f.write(jar.output(sep="\n")) | ||||
|             f.write("\n")  # A trailing newline is just common courtesy | ||||
|  | ||||
|     def _load_cookies(self) -> None: | ||||
|         log.explain_topic("Loading cookies") | ||||
|  | ||||
|         cookie_jar_path: Optional[Path] = None | ||||
|  | ||||
|         if self._shared_cookie_jar_paths is None: | ||||
|             log.explain("Not sharing any cookies") | ||||
|             cookie_jar_path = self._cookie_jar_path | ||||
|         else: | ||||
|             log.explain("Sharing cookies") | ||||
|             max_mtime: Optional[float] = None | ||||
|             for path in self._shared_cookie_jar_paths: | ||||
|                 if not path.is_file(): | ||||
|                     log.explain(f"{fmt_real_path(path)} is not a file") | ||||
|                     continue | ||||
|                 mtime = path.stat().st_mtime | ||||
|                 if max_mtime is None or mtime > max_mtime: | ||||
|                     log.explain(f"{fmt_real_path(path)} has newest mtime so far") | ||||
|                     max_mtime = mtime | ||||
|                     cookie_jar_path = path | ||||
|                 else: | ||||
|                     log.explain(f"{fmt_real_path(path)} has older mtime") | ||||
|  | ||||
|         if cookie_jar_path is None: | ||||
|             log.explain("Couldn't find a suitable cookie file") | ||||
|             return | ||||
|  | ||||
|         log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}") | ||||
|         try: | ||||
|             self._load_cookies_from_file(cookie_jar_path) | ||||
|         except Exception as e: | ||||
|             log.explain("Failed to load cookies") | ||||
|             log.explain(str(e)) | ||||
|  | ||||
|     def _save_cookies(self) -> None: | ||||
|         log.explain_topic("Saving cookies") | ||||
|  | ||||
|         try: | ||||
|             log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}") | ||||
|             self._save_cookies_to_file(self._cookie_jar_path) | ||||
|         except Exception as e: | ||||
|             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}") | ||||
|             log.warn(str(e)) | ||||
|  | ||||
|     async def run(self) -> None: | ||||
|         """ | ||||
|         Run the crawler inside a fresh aiohttp session. | ||||
|  | ||||
|         Resets the request counter, creates a new cookie jar and loads stored | ||||
|         cookies into it, then calls the parent's run() while the session is | ||||
|         available as self.session. Afterwards the number of requests is | ||||
|         logged and the cookies are saved. | ||||
|         """ | ||||
|         self._request_count = 0 | ||||
|         self._cookie_jar = aiohttp.CookieJar() | ||||
|         self._load_cookies() | ||||
|  | ||||
|         async with aiohttp.ClientSession( | ||||
|                 headers={"User-Agent": f"{NAME}/{VERSION}"}, | ||||
|                 cookie_jar=self._cookie_jar, | ||||
|                 connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())), | ||||
|                 timeout=ClientTimeout( | ||||
|                     # 30 minutes. No download in the history of downloads was longer than 30 minutes. | ||||
|                     # This is enough to transfer a 600 MB file over a 3 Mib/s connection. | ||||
|                     # Allowing an arbitrary value could be annoying for overnight batch jobs | ||||
|                     # NOTE(review): the value below is 15 minutes, not the 30 minutes | ||||
|                     # described above - confirm which of the two is intended. | ||||
|                     total=15 * 60, | ||||
|                     connect=self._http_timeout, | ||||
|                     sock_connect=self._http_timeout, | ||||
|                     sock_read=self._http_timeout, | ||||
|                 ) | ||||
|         ) as session: | ||||
|             self.session = session | ||||
|             try: | ||||
|                 await super().run() | ||||
|             finally: | ||||
|                 # Remove the attribute again so the closed session can't be used afterwards | ||||
|                 del self.session | ||||
|         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}") | ||||
|  | ||||
|         # They are saved in authenticate, but a final save won't hurt | ||||
|         self._save_cookies() | ||||
							
								
								
									
										3
									
								
								PFERD/crawl/ilias/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								PFERD/crawl/ilias/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | ||||
| from .kit_ilias_web_crawler import KitIliasWebCrawler, KitIliasWebCrawlerSection | ||||
|  | ||||
| __all__ = ["KitIliasWebCrawler", "KitIliasWebCrawlerSection"] | ||||
							
								
								
									
										132
									
								
								PFERD/crawl/ilias/file_templates.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										132
									
								
								PFERD/crawl/ilias/file_templates.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,132 @@ | ||||
| from enum import Enum | ||||
| from typing import Optional | ||||
|  | ||||
| _link_template_plain = "{{link}}" | ||||
| _link_template_fancy = """ | ||||
| <!DOCTYPE html> | ||||
| <html lang="en"> | ||||
|     <head> | ||||
|         <meta charset="UTF-8"> | ||||
|         <title>ILIAS - Link: {{name}}</title> | ||||
|         <meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" /> | ||||
|     </head> | ||||
|  | ||||
|     <style> | ||||
|     * { | ||||
|         box-sizing: border-box; | ||||
|     } | ||||
|     .center-flex { | ||||
|         display: flex; | ||||
|         align-items: center; | ||||
|         justify-content: center; | ||||
|     } | ||||
|     body { | ||||
|         padding: 0; | ||||
|         margin: 0; | ||||
|         background-color: #f0f0f0; | ||||
|         font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif; | ||||
|         height: 100vh; | ||||
|     } | ||||
|     .row { | ||||
|         background-color: white; | ||||
|         min-width: 500px; | ||||
|         max-width: 90vw; | ||||
|         display: flex; | ||||
|         padding: 1em; | ||||
|     } | ||||
|     .logo { | ||||
|         flex: 0 1; | ||||
|         margin-right: 1em; | ||||
|         fill: #009682; | ||||
|     } | ||||
|     .tile { | ||||
|         flex: 1 0; | ||||
|         display: flex; | ||||
|         flex-direction: column; | ||||
|         justify-content: center; | ||||
|     } | ||||
|     .top-row { | ||||
|         padding-bottom: 5px; | ||||
|         font-size: 15px; | ||||
|     } | ||||
|     a { | ||||
|         color: #009682; | ||||
|         text-decoration: none; | ||||
|     } | ||||
|     a:hover { | ||||
|         text-decoration: underline; | ||||
|     } | ||||
|     .bottom-row { | ||||
|         font-size: 13px; | ||||
|     } | ||||
|     .menu-button { | ||||
|         border: 1px solid black; | ||||
|         margin-left: 4em; | ||||
|         width: 25px; | ||||
|         height: 25px; | ||||
|         flex: 0 0 25px; | ||||
|         background-color: #b3e0da; | ||||
|         font-size: 13px; | ||||
|         color: #222; | ||||
|     } | ||||
|     </style> | ||||
|     <body class="center-flex"> | ||||
|         <div class="row"> | ||||
|             <div class="logo center-flex"> | ||||
|                 <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24"> | ||||
|                     <path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 
.225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/> | ||||
|                 </svg> | ||||
|             </div> | ||||
|             <div class="tile"> | ||||
|                 <div class="top-row"> | ||||
|                     <a href="{{link}}">{{name}}</a> | ||||
|                 </div> | ||||
|                 <div class="bottom-row">{{description}}</div> | ||||
|             </div> | ||||
|             <div class="menu-button center-flex"> ⯆ </div> | ||||
|         </div> | ||||
|     </body> | ||||
| </html> | ||||
| """.strip()  # noqa: E501 line too long | ||||
|  | ||||
| _link_template_internet_shortcut = """ | ||||
| [InternetShortcut] | ||||
| URL={{link}} | ||||
| """.strip() | ||||
|  | ||||
|  | ||||
| class Links(Enum): | ||||
|     IGNORE = "ignore" | ||||
|     PLAINTEXT = "plaintext" | ||||
|     FANCY = "fancy" | ||||
|     INTERNET_SHORTCUT = "internet-shortcut" | ||||
|  | ||||
|     def template(self) -> Optional[str]: | ||||
|         if self == self.FANCY: | ||||
|             return _link_template_fancy | ||||
|         elif self == self.PLAINTEXT: | ||||
|             return _link_template_plain | ||||
|         elif self == self.INTERNET_SHORTCUT: | ||||
|             return _link_template_internet_shortcut | ||||
|         elif self == self.IGNORE: | ||||
|             return None | ||||
|         raise ValueError("Missing switch case") | ||||
|  | ||||
|     def extension(self) -> Optional[str]: | ||||
|         if self == self.FANCY: | ||||
|             return ".html" | ||||
|         elif self == self.PLAINTEXT: | ||||
|             return ".txt" | ||||
|         elif self == self.INTERNET_SHORTCUT: | ||||
|             return ".url" | ||||
|         elif self == self.IGNORE: | ||||
|             return None | ||||
|         raise ValueError("Missing switch case") | ||||
|  | ||||
|     @staticmethod | ||||
|     def from_string(string: str) -> "Links": | ||||
|         try: | ||||
|             return Links(string) | ||||
|         except ValueError: | ||||
|             raise ValueError("must be one of 'ignore', 'plaintext'," | ||||
|                              " 'html', 'internet-shortcut'") | ||||
							
								
								
									
										91
									
								
								PFERD/crawl/ilias/ilias_html_cleaner.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										91
									
								
								PFERD/crawl/ilias/ilias_html_cleaner.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,91 @@ | ||||
| from bs4 import BeautifulSoup, Comment, Tag | ||||
|  | ||||
| _STYLE_TAG_CONTENT = """ | ||||
|     .ilc_text_block_Information { | ||||
|       background-color: #f5f7fa; | ||||
|     } | ||||
|     div.ilc_text_block_Standard { | ||||
|       margin-bottom: 10px; | ||||
|       margin-top: 10px; | ||||
|     } | ||||
|     span.ilc_text_inline_Strong { | ||||
|       font-weight: bold; | ||||
|     } | ||||
|  | ||||
|     .accordion-head { | ||||
|       background-color: #f5f7fa; | ||||
|       padding: 0.5rem 0; | ||||
|     } | ||||
|  | ||||
|     h3 { | ||||
|       margin-top: 0.5rem; | ||||
|       margin-bottom: 1rem; | ||||
|     } | ||||
|  | ||||
|     br.visible-break { | ||||
|       margin-bottom: 1rem; | ||||
|     } | ||||
|  | ||||
|     article { | ||||
|       margin: 0.5rem 0; | ||||
|     } | ||||
|  | ||||
|     body { | ||||
|       padding: 1em; | ||||
|       grid-template-columns: 1fr min(60rem, 90%) 1fr; | ||||
|       line-height: 1.2; | ||||
|     } | ||||
| """ | ||||
|  | ||||
| _ARTICLE_WORTHY_CLASSES = [ | ||||
|     "ilc_text_block_Information", | ||||
|     "ilc_section_Attention", | ||||
|     "ilc_section_Link", | ||||
| ] | ||||
|  | ||||
|  | ||||
| def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup: | ||||
|     head = soup.new_tag("head") | ||||
|     soup.insert(0, head) | ||||
|  | ||||
|     simplecss_link: Tag = soup.new_tag("link") | ||||
|     # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css"> | ||||
|     simplecss_link["rel"] = "stylesheet" | ||||
|     simplecss_link["href"] = "https://cdn.simplecss.org/simple.css" | ||||
|     head.append(simplecss_link) | ||||
|  | ||||
|     # Basic style tags for compat | ||||
|     style: Tag = soup.new_tag("style") | ||||
|     style.append(_STYLE_TAG_CONTENT) | ||||
|     head.append(style) | ||||
|  | ||||
|     return soup | ||||
|  | ||||
|  | ||||
| def clean(soup: BeautifulSoup) -> BeautifulSoup: | ||||
|     for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES): | ||||
|         block.name = "article" | ||||
|  | ||||
|     for block in soup.find_all("h3"): | ||||
|         block.name = "div" | ||||
|  | ||||
|     for block in soup.find_all("h1"): | ||||
|         block.name = "h3" | ||||
|  | ||||
|     for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"): | ||||
|         block.name = "h3" | ||||
|         block["class"] += ["accordion-head"] | ||||
|  | ||||
|     for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"): | ||||
|         children = list(dummy.children) | ||||
|         if not children: | ||||
|             dummy.decompose() | ||||
|         if len(children) > 1: | ||||
|             continue | ||||
|         if type(children[0]) == Comment: | ||||
|             dummy.decompose() | ||||
|  | ||||
|     for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): | ||||
|         hrule_imposter.insert(0, soup.new_tag("hr")) | ||||
|  | ||||
|     return soup | ||||
							
								
								
									
										997
									
								
								PFERD/crawl/ilias/kit_ilias_html.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										997
									
								
								PFERD/crawl/ilias/kit_ilias_html.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,997 @@ | ||||
| import json | ||||
| import re | ||||
| from dataclasses import dataclass | ||||
| from datetime import date, datetime, timedelta | ||||
| from enum import Enum | ||||
| from typing import Dict, List, Optional, Union | ||||
| from urllib.parse import urljoin, urlparse | ||||
|  | ||||
| from bs4 import BeautifulSoup, Tag | ||||
|  | ||||
| from PFERD.logging import log | ||||
| from PFERD.utils import url_set_query_params | ||||
|  | ||||
| TargetType = Union[str, int] | ||||
|  | ||||
|  | ||||
| class IliasElementType(Enum): | ||||
|     EXERCISE = "exercise" | ||||
|     EXERCISE_FILES = "exercise_files"  # own submitted files | ||||
|     TEST = "test"                      # an online test. Will be ignored currently. | ||||
|     FILE = "file" | ||||
|     FOLDER = "folder" | ||||
|     FORUM = "forum" | ||||
|     LINK = "link" | ||||
|     BOOKING = "booking" | ||||
|     MEETING = "meeting" | ||||
|     VIDEO = "video" | ||||
|     VIDEO_PLAYER = "video_player" | ||||
|     VIDEO_FOLDER = "video_folder" | ||||
|     VIDEO_FOLDER_MAYBE_PAGINATED = "video_folder_maybe_paginated" | ||||
|  | ||||
|  | ||||
| @dataclass | ||||
| class IliasPageElement: | ||||
|     type: IliasElementType | ||||
|     url: str | ||||
|     name: str | ||||
|     mtime: Optional[datetime] = None | ||||
|     description: Optional[str] = None | ||||
|  | ||||
|     def id(self) -> str: | ||||
|         """ | ||||
|         Return an identifier for this element, extracted from its URL. | ||||
|  | ||||
|         The patterns below are tried in order and the "id" group of the first | ||||
|         one that matches is returned. If none matches, a warning is logged and | ||||
|         the whole URL is returned instead. | ||||
|         """ | ||||
|         regexes = [ | ||||
|             r"eid=(?P<id>[0-9a-z\-]+)", | ||||
|             r"file_(?P<id>\d+)", | ||||
|             r"ref_id=(?P<id>\d+)", | ||||
|             r"target=[a-z]+_(?P<id>\d+)" | ||||
|         ] | ||||
|  | ||||
|         for regex in regexes: | ||||
|             if match := re.search(regex, self.url): | ||||
|                 return match.groupdict()["id"] | ||||
|  | ||||
|         # Fall back to URL | ||||
|         log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.") | ||||
|         return self.url | ||||
|  | ||||
|  | ||||
@dataclass
class IliasDownloadForumData:
    """
    URL and form fields for requesting the threads of a forum page, as built by
    IliasPage.get_download_forum_data.
    """

    # Absolute URL derived from the forum form's "action" attribute
    url: str
    # Form fields to send; "thread_ids[]" holds the list of thread ids found on the page
    form_data: Dict[str, Union[str, List[str]]]
    # True if no thread ids were found
    empty: bool
|  | ||||
|  | ||||
@dataclass
class IliasForumThread:
    """
    Plain record describing a forum thread: a title, two HTML tags and an optional timestamp.
    """

    title: str
    # presumably the tag the title was read from - confirm where instances are created
    title_tag: Tag
    # presumably the tag holding the thread's content - confirm where instances are created
    content_tag: Tag
    # Has no default, so it must be passed explicitly; may be None
    mtime: Optional[datetime]
|  | ||||
|  | ||||
| class IliasPage: | ||||
|  | ||||
|     def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): | ||||
|         self._soup = soup | ||||
|         self._page_url = _page_url | ||||
|         self._page_type = source_element.type if source_element else None | ||||
|         self._source_name = source_element.name if source_element else "" | ||||
|  | ||||
|     def get_child_elements(self) -> List[IliasPageElement]: | ||||
|         """ | ||||
|         Return all child page elements you can find here. | ||||
|         """ | ||||
|         if self._is_video_player(): | ||||
|             log.explain("Page is a video player, extracting URL") | ||||
|             return self._player_to_video() | ||||
|         if self._is_video_listing(): | ||||
|             log.explain("Page is a video listing, searching for elements") | ||||
|             return self._find_video_entries() | ||||
|         if self._is_exercise_file(): | ||||
|             log.explain("Page is an exercise, searching for elements") | ||||
|             return self._find_exercise_entries() | ||||
|         if self._is_personal_desktop(): | ||||
|             log.explain("Page is the personal desktop, searching for elements") | ||||
|             return self._find_personal_desktop_entries() | ||||
|         if self._is_content_page(): | ||||
|             log.explain("Page is a content page, searching for elements") | ||||
|             return self._find_copa_entries() | ||||
|         log.explain("Page is a normal folder, searching for elements") | ||||
|         return self._find_normal_entries() | ||||
|  | ||||
|     def get_description(self) -> Optional[BeautifulSoup]: | ||||
|         def is_interesting_class(name: str) -> bool: | ||||
|             return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] | ||||
|  | ||||
|         paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class) | ||||
|         if not paragraphs: | ||||
|             return None | ||||
|  | ||||
|         # Extract bits and pieces into a string and parse it again. | ||||
|         # This ensures we don't miss anything and weird structures are resolved | ||||
|         # somewhat gracefully. | ||||
|         raw_html = "" | ||||
|         for p in paragraphs: | ||||
|             if p.find_parent(class_=is_interesting_class): | ||||
|                 continue | ||||
|  | ||||
|             # Ignore special listings (like folder groupings) | ||||
|             if "ilc_section_Special" in p["class"]: | ||||
|                 continue | ||||
|  | ||||
|             raw_html += str(p) + "\n" | ||||
|         raw_html = f"<body>\n{raw_html}\n</body>" | ||||
|  | ||||
|         return BeautifulSoup(raw_html, "html.parser") | ||||
|  | ||||
|     def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: | ||||
|         form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) | ||||
|         if not form: | ||||
|             return None | ||||
|         post_url = self._abs_url_from_relative(form["action"]) | ||||
|  | ||||
|         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] | ||||
|  | ||||
|         form_data: Dict[str, Union[str, List[ſtr]]] = { | ||||
|             "thread_ids[]": thread_ids, | ||||
|             "selected_cmd2": "html", | ||||
|             "select_cmd2": "Ausführen", | ||||
|             "selected_cmd": "", | ||||
|         } | ||||
|  | ||||
|         return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0) | ||||
|  | ||||
|     def get_next_stage_element(self) -> Optional[IliasPageElement]: | ||||
|         if self._is_forum_page(): | ||||
|             if "trows=800" in self._page_url: | ||||
|                 return None | ||||
|             log.explain("Requesting *all* forum threads") | ||||
|             return self._get_show_max_forum_entries_per_page_url() | ||||
|         if self._is_ilias_opencast_embedding(): | ||||
|             log.explain("Unwrapping opencast embedding") | ||||
|             return self.get_child_elements()[0] | ||||
|         if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: | ||||
|             log.explain("Unwrapping video pagination") | ||||
|             return self._find_video_entries_paginated()[0] | ||||
|         if self._contains_collapsed_future_meetings(): | ||||
|             log.explain("Requesting *all* future meetings") | ||||
|             return self._uncollapse_future_meetings_url() | ||||
|         return None | ||||
|  | ||||
|     def _is_forum_page(self) -> bool: | ||||
|         read_more_btn = self._soup.find( | ||||
|             "button", | ||||
|             attrs={"onclick": lambda x: x and "cmdClass=ilobjforumgui&cmd=markAllRead" in x} | ||||
|         ) | ||||
|         return read_more_btn is not None | ||||
|  | ||||
|     def _is_video_player(self) -> bool: | ||||
|         return "paella_config_file" in str(self._soup) | ||||
|  | ||||
|     def _is_video_listing(self) -> bool: | ||||
|         if self._is_ilias_opencast_embedding(): | ||||
|             return True | ||||
|  | ||||
|         # Raw listing without ILIAS fluff | ||||
|         video_element_table: Tag = self._soup.find( | ||||
|             name="table", id=re.compile(r"tbl_xoct_.+") | ||||
|         ) | ||||
|         return video_element_table is not None | ||||
|  | ||||
|     def _is_ilias_opencast_embedding(self) -> bool: | ||||
|         # ILIAS fluff around the real opencast html | ||||
|         if self._soup.find(id="headerimage"): | ||||
|             element: Tag = self._soup.find(id="headerimage") | ||||
|             if "opencast" in element.attrs["src"].lower(): | ||||
|                 return True | ||||
|         return False | ||||
|  | ||||
|     def _is_exercise_file(self) -> bool: | ||||
|         # we know it from before | ||||
|         if self._page_type == IliasElementType.EXERCISE: | ||||
|             return True | ||||
|  | ||||
|         # We have no suitable parent - let's guesss | ||||
|         if self._soup.find(id="headerimage"): | ||||
|             element: Tag = self._soup.find(id="headerimage") | ||||
|             if "exc" in element.attrs["src"].lower(): | ||||
|                 return True | ||||
|  | ||||
|         return False | ||||
|  | ||||
|     def _is_personal_desktop(self) -> bool: | ||||
|         return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) | ||||
|  | ||||
|     def _is_content_page(self) -> bool: | ||||
|         link = self._soup.find(id="current_perma_link") | ||||
|         if not link: | ||||
|             return False | ||||
|         return "target=copa_" in link.get("value") | ||||
|  | ||||
|     def _contains_collapsed_future_meetings(self) -> bool: | ||||
|         return self._uncollapse_future_meetings_url() is not None | ||||
|  | ||||
|     def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: | ||||
|         element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x}) | ||||
|         if not element: | ||||
|             return None | ||||
|         link = self._abs_url_from_link(element) | ||||
|         return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") | ||||
|  | ||||
|     def _player_to_video(self) -> List[IliasPageElement]: | ||||
|         # Fetch the actual video page. This is a small wrapper page initializing a javscript | ||||
|         # player. Sadly we can not execute that JS. The actual video stream url is nowhere | ||||
|         # on the page, but defined in a JS object inside a script tag, passed to the player | ||||
|         # library. | ||||
|         # We do the impossible and RegEx the stream JSON object out of the page's HTML source | ||||
|         regex = re.compile( | ||||
|             r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE | ||||
|         ) | ||||
|         json_match = regex.search(str(self._soup)) | ||||
|  | ||||
|         if json_match is None: | ||||
|             log.warn("Could not find JSON stream info in video player. Ignoring video.") | ||||
|             return [] | ||||
|         json_str = json_match.group(1) | ||||
|  | ||||
|         # parse it | ||||
|         json_object = json.loads(json_str) | ||||
|         streams = [stream for stream in json_object["streams"]] | ||||
|  | ||||
|         # and just fetch the lone video url! | ||||
|         if len(streams) == 1: | ||||
|             video_url = streams[0]["sources"]["mp4"][0]["src"] | ||||
|             return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] | ||||
|  | ||||
|         log.explain(f"Found multiple videos for stream at {self._source_name}") | ||||
|         items = [] | ||||
|         for stream in sorted(streams, key=lambda stream: stream["content"]): | ||||
|             full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" | ||||
|             video_url = stream["sources"]["mp4"][0]["src"] | ||||
|             items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name)) | ||||
|  | ||||
|         return items | ||||
|  | ||||
|     def _get_show_max_forum_entries_per_page_url(self) -> Optional[IliasPageElement]: | ||||
|         correct_link = self._soup.find( | ||||
|             "a", | ||||
|             attrs={"href": lambda x: x and "trows=800" in x and "cmd=showThreads" in x} | ||||
|         ) | ||||
|  | ||||
|         if not correct_link: | ||||
|             return None | ||||
|  | ||||
|         link = self._abs_url_from_link(correct_link) | ||||
|  | ||||
|         return IliasPageElement(IliasElementType.FORUM, link, "show all forum threads") | ||||
|  | ||||
|     def _find_personal_desktop_entries(self) -> List[IliasPageElement]: | ||||
|         items: List[IliasPageElement] = [] | ||||
|  | ||||
|         titles: List[Tag] = self._soup.select(".il-item-title") | ||||
|         for title in titles: | ||||
|             link = title.find("a") | ||||
|             name = _sanitize_path_name(link.text.strip()) | ||||
|             url = self._abs_url_from_link(link) | ||||
|  | ||||
|             type = self._find_type_from_link(name, link, url) | ||||
|             if not type: | ||||
|                 _unexpected_html_warning() | ||||
|                 log.warn_contd(f"Could not extract type for {link}") | ||||
|                 continue | ||||
|  | ||||
|             log.explain(f"Found {name!r}") | ||||
|  | ||||
|             if type == IliasElementType.FILE and "_download" not in url: | ||||
|                 url = re.sub(r"(target=file_\d+)", r"\1_download", url) | ||||
|                 log.explain("Rewired file URL to include download part") | ||||
|  | ||||
|             items.append(IliasPageElement(type, url, name)) | ||||
|  | ||||
|         return items | ||||
|  | ||||
|     def _find_copa_entries(self) -> List[IliasPageElement]: | ||||
|         items: List[IliasPageElement] = [] | ||||
|         links: List[Tag] = self._soup.findAll(class_="ilc_flist_a_FileListItemLink") | ||||
|  | ||||
|         for link in links: | ||||
|             url = self._abs_url_from_link(link) | ||||
|             name = _sanitize_path_name(link.getText().strip().replace("\t", "")) | ||||
|  | ||||
|             if "file_id" not in url: | ||||
|                 _unexpected_html_warning() | ||||
|                 log.warn_contd(f"Found unknown content page item {name!r} with url {url!r}") | ||||
|                 continue | ||||
|  | ||||
|             items.append(IliasPageElement(IliasElementType.FILE, url, name)) | ||||
|  | ||||
|         return items | ||||
|  | ||||
|     def _find_video_entries(self) -> List[IliasPageElement]: | ||||
|         # ILIAS has three stages for video pages | ||||
|         # 1. The initial dummy page without any videos. This page contains the link to the listing | ||||
|         # 2. The video listing which might be paginated | ||||
|         # 3. An unpaginated video listing (or at least one that includes 800 videos) | ||||
|         # | ||||
|         # We need to figure out where we are. | ||||
|  | ||||
|         video_element_table: Tag = self._soup.find( | ||||
|             name="table", id=re.compile(r"tbl_xoct_.+") | ||||
|         ) | ||||
|  | ||||
|         if video_element_table is None: | ||||
|             # We are in stage 1 | ||||
|             # The page is actually emtpy but contains the link to stage 2 | ||||
|             content_link: Tag = self._soup.select_one("#tab_series a") | ||||
|             url: str = self._abs_url_from_link(content_link) | ||||
|             query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} | ||||
|             url = url_set_query_params(url, query_params) | ||||
|             log.explain("Found ILIAS video frame page, fetching actual content next") | ||||
|             return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] | ||||
|  | ||||
|         is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None | ||||
|  | ||||
|         if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER: | ||||
|             # We are in stage 2 - try to break pagination | ||||
|             return self._find_video_entries_paginated() | ||||
|  | ||||
|         return self._find_video_entries_no_paging() | ||||
|  | ||||
|     def _find_video_entries_paginated(self) -> List[IliasPageElement]: | ||||
|         table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) | ||||
|  | ||||
|         if table_element is None: | ||||
|             log.warn("Couldn't increase elements per page (table not found). I might miss elements.") | ||||
|             return self._find_video_entries_no_paging() | ||||
|  | ||||
|         id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) | ||||
|         if id_match is None: | ||||
|             log.warn("Couldn't increase elements per page (table id not found). I might miss elements.") | ||||
|             return self._find_video_entries_no_paging() | ||||
|  | ||||
|         table_id = id_match.group(1) | ||||
|  | ||||
|         query_params = {f"tbl_xoct_{table_id}_trows": "800", | ||||
|                         "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} | ||||
|         url = url_set_query_params(self._page_url, query_params) | ||||
|  | ||||
|         log.explain("Disabled pagination, retrying folder as a new entry") | ||||
|         return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] | ||||
|  | ||||
|     def _find_video_entries_no_paging(self) -> List[IliasPageElement]: | ||||
|         """ | ||||
|         Crawls the "second stage" video page. This page contains the actual video urls. | ||||
|         """ | ||||
|         # Video start links are marked with an "Abspielen" link | ||||
|         video_links: List[Tag] = self._soup.findAll( | ||||
|             name="a", text=re.compile(r"\s*Abspielen\s*") | ||||
|         ) | ||||
|  | ||||
|         results: List[IliasPageElement] = [] | ||||
|  | ||||
|         for link in video_links: | ||||
|             results.append(self._listed_video_to_element(link)) | ||||
|  | ||||
|         return results | ||||
|  | ||||
|     def _listed_video_to_element(self, link: Tag) -> IliasPageElement: | ||||
|         # The link is part of a table with multiple columns, describing metadata. | ||||
|         # 6th or 7th child (1 indexed) is the modification time string. Try to find it | ||||
|         # by parsing backwards from the end and finding something that looks like a date | ||||
|         modification_time = None | ||||
|         row: Tag = link.parent.parent.parent | ||||
|         column_count = len(row.select("td.std")) | ||||
|         for index in range(column_count, 0, -1): | ||||
|             modification_string = link.parent.parent.parent.select_one( | ||||
|                 f"td.std:nth-child({index})" | ||||
|             ).getText().strip() | ||||
|             if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string): | ||||
|                 modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") | ||||
|                 break | ||||
|  | ||||
|         if modification_time is None: | ||||
|             log.warn(f"Could not determine upload time for {link}") | ||||
|             modification_time = datetime.now() | ||||
|  | ||||
|         title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip() | ||||
|         title += ".mp4" | ||||
|  | ||||
|         video_name: str = _sanitize_path_name(title) | ||||
|  | ||||
|         video_url = self._abs_url_from_link(link) | ||||
|  | ||||
|         log.explain(f"Found video {video_name!r} at {video_url}") | ||||
|         return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) | ||||
|  | ||||
|     def _find_exercise_entries(self) -> List[IliasPageElement]: | ||||
|         if self._soup.find(id="tab_submission"): | ||||
|             log.explain("Found submission tab. This is an exercise detail page") | ||||
|             return self._find_exercise_entries_detail_page() | ||||
|         log.explain("Found no submission tab. This is an exercise root page") | ||||
|         return self._find_exercise_entries_root_page() | ||||
|  | ||||
|     def _find_exercise_entries_detail_page(self) -> List[IliasPageElement]: | ||||
|         results: List[IliasPageElement] = [] | ||||
|  | ||||
|         # Find all download links in the container (this will contain all the files) | ||||
|         download_links: List[Tag] = self._soup.findAll( | ||||
|             name="a", | ||||
|             # download links contain the given command class | ||||
|             attrs={"href": lambda x: x and "cmd=download" in x}, | ||||
|             text="Download" | ||||
|         ) | ||||
|  | ||||
|         for link in download_links: | ||||
|             parent_row: Tag = link.findParent("tr") | ||||
|             children: List[Tag] = parent_row.findChildren("td") | ||||
|  | ||||
|             name = _sanitize_path_name(children[1].getText().strip()) | ||||
|             log.explain(f"Found exercise detail entry {name!r}") | ||||
|  | ||||
|             for child in reversed(children): | ||||
|                 date = demangle_date(child.getText().strip(), fail_silently=True) | ||||
|                 if date is not None: | ||||
|                     break | ||||
|             if date is None: | ||||
|                 log.warn(f"Date parsing failed for exercise entry {name!r}") | ||||
|  | ||||
|             results.append(IliasPageElement( | ||||
|                 IliasElementType.FILE, | ||||
|                 self._abs_url_from_link(link), | ||||
|                 name, | ||||
|                 date | ||||
|             )) | ||||
|  | ||||
|         return results | ||||
|  | ||||
|     def _find_exercise_entries_root_page(self) -> List[IliasPageElement]: | ||||
|         results: List[IliasPageElement] = [] | ||||
|  | ||||
|         # Each assignment is in an accordion container | ||||
|         assignment_containers: List[Tag] = self._soup.select(".il_VAccordionInnerContainer") | ||||
|  | ||||
|         for container in assignment_containers: | ||||
|             # Fetch the container name out of the header to use it in the path | ||||
|             container_name = container.select_one(".ilAssignmentHeader").getText().strip() | ||||
|             log.explain(f"Found exercise container {container_name!r}") | ||||
|  | ||||
|             # Find all download links in the container (this will contain all the files) | ||||
|             files: List[Tag] = container.findAll( | ||||
|                 name="a", | ||||
|                 # download links contain the given command class | ||||
|                 attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, | ||||
|                 text="Download" | ||||
|             ) | ||||
|  | ||||
|             # Grab each file as you now have the link | ||||
|             for file_link in files: | ||||
|                 # Two divs, side by side. Left is the name, right is the link ==> get left | ||||
|                 # sibling | ||||
|                 file_name = file_link.parent.findPrevious(name="div").getText().strip() | ||||
|                 file_name = _sanitize_path_name(file_name) | ||||
|                 url = self._abs_url_from_link(file_link) | ||||
|  | ||||
|                 log.explain(f"Found exercise entry {file_name!r}") | ||||
|                 results.append(IliasPageElement( | ||||
|                     IliasElementType.FILE, | ||||
|                     url, | ||||
|                     container_name + "/" + file_name, | ||||
|                     None  # We do not have any timestamp | ||||
|                 )) | ||||
|  | ||||
|             # Find all links to file listings (e.g. "Submitted Files" for groups) | ||||
|             file_listings: List[Tag] = container.findAll( | ||||
|                 name="a", | ||||
|                 # download links contain the given command class | ||||
|                 attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x} | ||||
|             ) | ||||
|  | ||||
|             # Add each listing as a new | ||||
|             for listing in file_listings: | ||||
|                 parent_container: Tag = listing.findParent( | ||||
|                     "div", attrs={"class": lambda x: x and "form-group" in x} | ||||
|                 ) | ||||
|                 label_container: Tag = parent_container.find( | ||||
|                     attrs={"class": lambda x: x and "control-label" in x} | ||||
|                 ) | ||||
|                 file_name = _sanitize_path_name(label_container.getText().strip()) | ||||
|                 url = self._abs_url_from_link(listing) | ||||
|                 log.explain(f"Found exercise detail {file_name!r} at {url}") | ||||
|                 results.append(IliasPageElement( | ||||
|                     IliasElementType.EXERCISE_FILES, | ||||
|                     url, | ||||
|                     container_name + "/" + file_name, | ||||
|                     None  # we do not have any timestamp | ||||
|                 )) | ||||
|  | ||||
|         return results | ||||
|  | ||||
|     def _find_normal_entries(self) -> List[IliasPageElement]: | ||||
|         result: List[IliasPageElement] = [] | ||||
|  | ||||
|         # Fetch all links and throw them to the general interpreter | ||||
|         links: List[Tag] = self._soup.select("a.il_ContainerItemTitle") | ||||
|  | ||||
|         for link in links: | ||||
|             abs_url = self._abs_url_from_link(link) | ||||
|             parents = self._find_upwards_folder_hierarchy(link) | ||||
|  | ||||
|             if parents: | ||||
|                 element_name = "/".join(parents) + "/" + _sanitize_path_name(link.getText()) | ||||
|             else: | ||||
|                 element_name = _sanitize_path_name(link.getText()) | ||||
|  | ||||
|             element_type = self._find_type_from_link(element_name, link, abs_url) | ||||
|             description = self._find_link_description(link) | ||||
|  | ||||
|             # The last meeting on every page is expanded by default. | ||||
|             # Its content is then shown inline *and* in the meeting page itself. | ||||
|             # We should skip the inline content. | ||||
|             if element_type != IliasElementType.MEETING and self._is_in_expanded_meeting(link): | ||||
|                 continue | ||||
|  | ||||
|             if not element_type: | ||||
|                 continue | ||||
|             if element_type == IliasElementType.MEETING: | ||||
|                 normalized = _sanitize_path_name(self._normalize_meeting_name(element_name)) | ||||
|                 log.explain(f"Normalized meeting name from {element_name!r} to {normalized!r}") | ||||
|                 element_name = normalized | ||||
|             elif element_type == IliasElementType.FILE: | ||||
|                 result.append(self._file_to_element(element_name, abs_url, link)) | ||||
|                 continue | ||||
|  | ||||
|             log.explain(f"Found {element_name!r}") | ||||
|             result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) | ||||
|  | ||||
|         result += self._find_cards() | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     def _is_in_expanded_meeting(self, tag: Tag) -> bool: | ||||
|         """ | ||||
|         Returns whether a file is part of an expanded meeting. | ||||
|         Has false positives for meetings themselves as their title is also "in the expanded meeting content". | ||||
|         It is in the same general div and this whole thing is guesswork. | ||||
|         Therefore, you should check for meetings before passing them in this function. | ||||
|         """ | ||||
|         parents: List[Tag] = list(tag.parents) | ||||
|         for parent in parents: | ||||
|             if not parent.get("class"): | ||||
|                 continue | ||||
|  | ||||
|             # We should not crawl files under meetings | ||||
|             if "ilContainerListItemContentCB" in parent.get("class"): | ||||
|                 link: Tag = parent.parent.find("a") | ||||
|                 type = IliasPage._find_type_from_folder_like(link, self._page_url) | ||||
|                 return type == IliasElementType.MEETING | ||||
|  | ||||
|         return False | ||||
|  | ||||
|     def _find_upwards_folder_hierarchy(self, tag: Tag) -> List[str]: | ||||
|         """ | ||||
|         Interprets accordions and expandable blocks as virtual folders and returns them | ||||
|         in order. This allows us to find a file named "Test" in an accordion "Acc" as "Acc/Test" | ||||
|         """ | ||||
|         found_titles = [] | ||||
|  | ||||
|         outer_accordion_content: Optional[Tag] = None | ||||
|  | ||||
|         parents: List[Tag] = list(tag.parents) | ||||
|         for parent in parents: | ||||
|             if not parent.get("class"): | ||||
|                 continue | ||||
|  | ||||
|             # ILIAS has proper accordions and weird blocks that look like normal headings, | ||||
|             # but some JS later transforms them into an accordion. | ||||
|  | ||||
|             # This is for these weird JS-y blocks | ||||
|             if "ilContainerItemsContainer" in parent.get("class"): | ||||
|                 # I am currently under the impression that *only* those JS blocks have an | ||||
|                 # ilNoDisplay class. | ||||
|                 if "ilNoDisplay" not in parent.get("class"): | ||||
|                     continue | ||||
|                 prev: Tag = parent.findPreviousSibling("div") | ||||
|                 if "ilContainerBlockHeader" in prev.get("class"): | ||||
|                     if prev.find("h3"): | ||||
|                         found_titles.append(prev.find("h3").getText().strip()) | ||||
|                     else: | ||||
|                         found_titles.append(prev.find("h2").getText().strip()) | ||||
|  | ||||
|             # And this for real accordions | ||||
|             if "il_VAccordionContentDef" in parent.get("class"): | ||||
|                 outer_accordion_content = parent | ||||
|                 break | ||||
|  | ||||
|         if outer_accordion_content: | ||||
|             accordion_tag: Tag = outer_accordion_content.parent | ||||
|             head_tag: Tag = accordion_tag.find(attrs={ | ||||
|                 "class": lambda x: x and "ilc_va_ihead_VAccordIHead" in x | ||||
|             }) | ||||
|             found_titles.append(head_tag.getText().strip()) | ||||
|  | ||||
|         return [_sanitize_path_name(x) for x in reversed(found_titles)] | ||||
|  | ||||
|     def _find_link_description(self, link: Tag) -> Optional[str]: | ||||
|         tile: Tag = link.findParent("div", {"class": lambda x: x and "il_ContainerListItem" in x}) | ||||
|         if not tile: | ||||
|             return None | ||||
|         description_element: Tag = tile.find("div", {"class": lambda x: x and "il_Description" in x}) | ||||
|         if not description_element: | ||||
|             return None | ||||
|         return description_element.getText().strip() | ||||
|  | ||||
|     def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement: | ||||
|         # Files have a list of properties (type, modification date, size, etc.) | ||||
|         # In a series of divs. | ||||
|         # Find the parent containing all those divs, so we can filter our what we need | ||||
|         properties_parent: Tag = link_element.findParent( | ||||
|             "div", {"class": lambda x: "il_ContainerListItem" in x} | ||||
|         ).select_one(".il_ItemProperties") | ||||
|         # The first one is always the filetype | ||||
|         file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() | ||||
|  | ||||
|         # The rest does not have a stable order. Grab the whole text and reg-ex the date | ||||
|         # out of it | ||||
|         all_properties_text = properties_parent.getText().strip() | ||||
|         modification_date_match = re.search( | ||||
|             r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)", | ||||
|             all_properties_text | ||||
|         ) | ||||
|         if modification_date_match is None: | ||||
|             modification_date = None | ||||
|             log.explain(f"Element {name} at {url} has no date.") | ||||
|         else: | ||||
|             modification_date_str = modification_date_match.group(1) | ||||
|             modification_date = demangle_date(modification_date_str) | ||||
|  | ||||
|         # Grab the name from the link text | ||||
|         full_path = name + "." + file_type | ||||
|  | ||||
|         log.explain(f"Found file {full_path!r}") | ||||
|         return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date) | ||||
|  | ||||
|     def _find_cards(self) -> List[IliasPageElement]: | ||||
|         result: List[IliasPageElement] = [] | ||||
|  | ||||
|         card_titles: List[Tag] = self._soup.select(".card-title a") | ||||
|  | ||||
|         for title in card_titles: | ||||
|             url = self._abs_url_from_link(title) | ||||
|             name = _sanitize_path_name(title.getText().strip()) | ||||
|             type = self._find_type_from_card(title) | ||||
|  | ||||
|             if not type: | ||||
|                 _unexpected_html_warning() | ||||
|                 log.warn_contd(f"Could not extract type for {title}") | ||||
|                 continue | ||||
|  | ||||
|             result.append(IliasPageElement(type, url, name)) | ||||
|  | ||||
|         card_button_tiles: List[Tag] = self._soup.select(".card-title button") | ||||
|  | ||||
|         for button in card_button_tiles: | ||||
|             regex = re.compile(button["id"] + r".*window.open\(['\"](.+?)['\"]") | ||||
|             res = regex.search(str(self._soup)) | ||||
|             if not res: | ||||
|                 _unexpected_html_warning() | ||||
|                 log.warn_contd(f"Could not find click handler target for {button}") | ||||
|                 continue | ||||
|             url = self._abs_url_from_relative(res.group(1)) | ||||
|             name = _sanitize_path_name(button.getText().strip()) | ||||
|             type = self._find_type_from_card(button) | ||||
|             caption_parent = button.findParent( | ||||
|                 "div", | ||||
|                 attrs={"class": lambda x: x and "caption" in x}, | ||||
|             ) | ||||
|             description = caption_parent.find_next_sibling("div").getText().strip() | ||||
|  | ||||
|             if not type: | ||||
|                 _unexpected_html_warning() | ||||
|                 log.warn_contd(f"Could not extract type for {button}") | ||||
|                 continue | ||||
|  | ||||
|             result.append(IliasPageElement(type, url, name, description=description)) | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     def _find_type_from_card(self, card_title: Tag) -> Optional[IliasElementType]: | ||||
|         def is_card_root(element: Tag) -> bool: | ||||
|             return "il-card" in element["class"] and "thumbnail" in element["class"] | ||||
|  | ||||
|         card_root: Optional[Tag] = None | ||||
|  | ||||
|         # We look for the card root | ||||
|         for parent in card_title.parents: | ||||
|             if is_card_root(parent): | ||||
|                 card_root = parent | ||||
|                 break | ||||
|  | ||||
|         if card_root is None: | ||||
|             _unexpected_html_warning() | ||||
|             log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}") | ||||
|             return None | ||||
|  | ||||
|         icon: Tag = card_root.select_one(".il-card-repository-head .icon") | ||||
|  | ||||
|         if "opencast" in icon["class"]: | ||||
|             return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED | ||||
|         if "exc" in icon["class"]: | ||||
|             return IliasElementType.EXERCISE | ||||
|         if "webr" in icon["class"]: | ||||
|             return IliasElementType.LINK | ||||
|         if "book" in icon["class"]: | ||||
|             return IliasElementType.BOOKING | ||||
|         if "frm" in icon["class"]: | ||||
|             return IliasElementType.FORUM | ||||
|         if "sess" in icon["class"]: | ||||
|             return IliasElementType.MEETING | ||||
|         if "tst" in icon["class"]: | ||||
|             return IliasElementType.TEST | ||||
|         if "fold" in icon["class"]: | ||||
|             return IliasElementType.FOLDER | ||||
|  | ||||
|         _unexpected_html_warning() | ||||
|         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") | ||||
|         return None | ||||
|  | ||||
|     @staticmethod | ||||
|     def _find_type_from_link( | ||||
|             element_name: str, | ||||
|             link_element: Tag, | ||||
|             url: str | ||||
|     ) -> Optional[IliasElementType]: | ||||
|         """ | ||||
|         Decides which sub crawler to use for a given top level element. | ||||
|         """ | ||||
|         parsed_url = urlparse(url) | ||||
|  | ||||
|         # file URLs contain "target=file" | ||||
|         if "target=file_" in parsed_url.query: | ||||
|             return IliasElementType.FILE | ||||
|  | ||||
|         if "target=grp_" in parsed_url.query: | ||||
|             return IliasElementType.FOLDER | ||||
|  | ||||
|         if "target=crs_" in parsed_url.query: | ||||
|             return IliasElementType.FOLDER | ||||
|  | ||||
|         if "baseClass=ilExerciseHandlerGUI" in parsed_url.query: | ||||
|             return IliasElementType.EXERCISE | ||||
|  | ||||
|         if "baseClass=ilLinkResourceHandlerGUI" in parsed_url.query and "calldirectlink" in parsed_url.query: | ||||
|             return IliasElementType.LINK | ||||
|  | ||||
|         if "cmd=showThreads" in parsed_url.query or "target=frm_" in parsed_url.query: | ||||
|             return IliasElementType.FORUM | ||||
|  | ||||
|         if "cmdClass=ilobjtestgui" in parsed_url.query: | ||||
|             return IliasElementType.TEST | ||||
|  | ||||
|         # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so | ||||
|         # try to guess it from the image. | ||||
|  | ||||
|         # Everything with a ref_id can *probably* be opened to reveal nested things | ||||
|         # video groups, directories, exercises, etc | ||||
|         if "ref_id=" in parsed_url.query or "goto.php" in parsed_url.path: | ||||
|             return IliasPage._find_type_from_folder_like(link_element, url) | ||||
|  | ||||
|         _unexpected_html_warning() | ||||
|         log.warn_contd( | ||||
|             f"Tried to figure out element type, but failed for {element_name!r} / {link_element!r})" | ||||
|         ) | ||||
|         return None | ||||
|  | ||||
|     @staticmethod | ||||
|     def _find_type_from_folder_like(link_element: Tag, url: str) -> Optional[IliasElementType]: | ||||
|         """ | ||||
|         Try crawling something that looks like a folder. | ||||
|         """ | ||||
|         # pylint: disable=too-many-return-statements | ||||
|  | ||||
|         found_parent: Optional[Tag] = None | ||||
|  | ||||
|         # We look for the outer div of our inner link, to find information around it | ||||
|         # (mostly the icon) | ||||
|         for parent in link_element.parents: | ||||
|             if "ilContainerListItemOuter" in parent["class"] or "il-std-item" in parent["class"]: | ||||
|                 found_parent = parent | ||||
|                 break | ||||
|  | ||||
|         if found_parent is None: | ||||
|             _unexpected_html_warning() | ||||
|             log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url}") | ||||
|             return None | ||||
|  | ||||
|         # Find the small descriptive icon to figure out the type | ||||
|         img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon") | ||||
|  | ||||
|         if img_tag is None: | ||||
|             img_tag = found_parent.select_one("img.icon") | ||||
|  | ||||
|         if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}): | ||||
|             log.explain("Found session expansion button, skipping it as it has no content") | ||||
|             return None | ||||
|  | ||||
|         if img_tag is None: | ||||
|             _unexpected_html_warning() | ||||
|             log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") | ||||
|             return None | ||||
|  | ||||
|         if "opencast" in str(img_tag["alt"]).lower(): | ||||
|             return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("icon_exc.svg"): | ||||
|             return IliasElementType.EXERCISE | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("icon_webr.svg"): | ||||
|             return IliasElementType.LINK | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("icon_book.svg"): | ||||
|             return IliasElementType.BOOKING | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("frm.svg"): | ||||
|             return IliasElementType.FORUM | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("sess.svg"): | ||||
|             return IliasElementType.MEETING | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("icon_tst.svg"): | ||||
|             return IliasElementType.TEST | ||||
|  | ||||
|         return IliasElementType.FOLDER | ||||
|  | ||||
|     @staticmethod | ||||
|     def _normalize_meeting_name(meeting_name: str) -> str: | ||||
|         """ | ||||
|         Normalizes meeting names, which have a relative time as their first part, | ||||
|         to their date in ISO format. | ||||
|         """ | ||||
|  | ||||
|         # This checks whether we can reach a `:` without passing a `-` | ||||
|         if re.search(r"^[^-]+: ", meeting_name): | ||||
|             # Meeting name only contains date: "05. Jan 2000:" | ||||
|             split_delimiter = ":" | ||||
|         else: | ||||
|             # Meeting name contains date and start/end times: "05. Jan 2000, 16:00 - 17:30:" | ||||
|             split_delimiter = ", " | ||||
|  | ||||
|         # We have a meeting day without time | ||||
|         date_portion_str = meeting_name.split(split_delimiter)[0] | ||||
|         date_portion = demangle_date(date_portion_str) | ||||
|  | ||||
|         # We failed to parse the date, bail out | ||||
|         if not date_portion: | ||||
|             return meeting_name | ||||
|  | ||||
|         # Replace the first section with the absolute date | ||||
|         rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) | ||||
|         return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name | ||||
|  | ||||
|     def _abs_url_from_link(self, link_tag: Tag) -> str: | ||||
|         """ | ||||
|         Create an absolute url from an <a> tag. | ||||
|         """ | ||||
|         return self._abs_url_from_relative(link_tag.get("href")) | ||||
|  | ||||
|     def _abs_url_from_relative(self, relative_url: str) -> str: | ||||
|         """ | ||||
|         Create an absolute url from a relative URL. | ||||
|         """ | ||||
|         return urljoin(self._page_url, relative_url) | ||||
|  | ||||
|  | ||||
def _unexpected_html_warning() -> None:
    """
    Log the generic warning for an element that is ignored due to unexpected HTML.
    """
    message = "Encountered unexpected HTML structure, ignoring element."
    log.warn(message)
|  | ||||
|  | ||||
# Month abbreviations in calendar order. Both lists are index-aligned, so a German
# abbreviation maps to the English one at the same position.
german_months = [
    "Jan", "Feb", "Mär", "Apr", "Mai", "Jun",
    "Jul", "Aug", "Sep", "Okt", "Nov", "Dez",
]
english_months = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
|  | ||||
|  | ||||
def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[datetime]:
    """
    Demangle a given date in one of the following formats (hour/minute part is optional):
    "Gestern, HH:MM"
    "Heute, HH:MM"
    "Morgen, HH:MM"
    "dd. mon yyyy, HH:MM"

    The relative day names are also understood in English ("Yesterday", "Today",
    "Tomorrow") and are matched regardless of case. Month abbreviations may be German
    or English and may carry a trailing dot.

    Returns the parsed datetime or None if the string could not be parsed. A failure
    is logged as a warning unless fail_silently is set.
    """
    try:
        # Normalize whitespace because users
        date_str = re.sub(r"\s+", " ", date_str)

        # The flags have to be passed by keyword: the fourth positional parameter of
        # re.sub is the replacement count, not the flags.
        date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, flags=re.I)
        date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, flags=re.I)
        date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, flags=re.I)
        date_str = date_str.strip()
        for german, english in zip(german_months, english_months):
            date_str = date_str.replace(german, english)
            # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020"
            date_str = date_str.replace(english + ".", english)

        # We now have a nice english String in the format: "dd. mmm yyyy, hh:mm" or "dd. mmm yyyy"

        # Check if we have a time as well
        if ", " in date_str:
            day_part, time_part = date_str.split(",")
        else:
            day_part = date_str.split(",")[0]
            time_part = None

        day_str, month_str, year_str = day_part.split(" ")

        day = int(day_str.strip().replace(".", ""))
        month = english_months.index(month_str.strip()) + 1
        year = int(year_str.strip())

        if time_part:
            # int() tolerates the space that is left in front of the hour
            hour_str, minute_str = time_part.split(":")
            hour = int(hour_str)
            minute = int(minute_str)
            return datetime(year, month, day, hour, minute)

        return datetime(year, month, day)
    except Exception:
        # Deliberately broad: any malformed input just means "no date"
        if not fail_silently:
            log.warn(f"Date parsing failed for {date_str!r}")
        return None
|  | ||||
|  | ||||
def _format_date_english(date_to_format: date) -> str:
    """
    Format a date as "dd. mon yyyy", using the English month abbreviation.
    """
    day = date_to_format.day
    year = date_to_format.year
    month_name = english_months[date_to_format.month - 1]
    return f"{day:02d}. {month_name} {year:04d}"
|  | ||||
|  | ||||
| def _yesterday() -> date: | ||||
|     return date.today() - timedelta(days=1) | ||||
|  | ||||
|  | ||||
| def _tomorrow() -> date: | ||||
|     return date.today() + timedelta(days=1) | ||||
|  | ||||
|  | ||||
| def _sanitize_path_name(name: str) -> str: | ||||
|     return name.replace("/", "-").replace("\\", "-").strip() | ||||
|  | ||||
|  | ||||
def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThread]:
    """
    Split an exported forum page into its threads.

    Every "body > p" element is taken as the title of a thread, the next <ul> sibling
    as its content. Titles without such a sibling are skipped.
    """
    threads = []
    for title_tag in forum_export.select("body > p"):
        content_tag = title_tag.find_next_sibling("ul")

        if not content_tag:
            # ILIAS allows users to delete the initial post while keeping the thread open
            # This produces empty threads without *any* content.
            # I am not sure why you would want this, but ILIAS makes it easy to do.
            continue

        raw_title = title_tag.find("b").text
        # Only the text after the first colon is kept, if there is a colon at all
        if ":" in raw_title:
            raw_title = raw_title.partition(":")[2]
        title = raw_title.strip()

        mtime = _guess_timestamp_from_forum_post_content(content_tag)
        threads.append(IliasForumThread(title, title_tag, content_tag, mtime))

    return threads
|  | ||||
|  | ||||
def _guess_timestamp_from_forum_post_content(content: Tag) -> Optional[datetime]:
    """
    Return the newest date found in the post headers of a thread, or None if there is none.

    Each ".ilFrmPostHeader > span.small" element is inspected; the text after its last
    "|" is parsed as a date. Headers without a parseable date are ignored.
    """
    post_headers: List[Tag] = content.select(".ilFrmPostHeader > span.small")

    def date_of(header: Tag) -> Optional[datetime]:
        header_text = header.text.strip()
        return demangle_date(header_text[header_text.rfind("|") + 1:], fail_silently=True)

    parsed_dates = (date_of(header) for header in post_headers)
    return max((post_date for post_date in parsed_dates if post_date), default=None)
							
								
								
									
										969
									
								
								PFERD/crawl/ilias/kit_ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										969
									
								
								PFERD/crawl/ilias/kit_ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,969 @@ | ||||
| import asyncio | ||||
| import re | ||||
| from collections.abc import Awaitable, Coroutine | ||||
| from pathlib import PurePath | ||||
| from typing import Any, Callable, Dict, List, Optional, Set, Union, cast | ||||
|  | ||||
| import aiohttp | ||||
| import yarl | ||||
| from aiohttp import hdrs | ||||
| from bs4 import BeautifulSoup, Tag | ||||
|  | ||||
| from ...auth import Authenticator, TfaAuthenticator | ||||
| from ...config import Config | ||||
| from ...logging import ProgressBar, log | ||||
| from ...output_dir import FileSink, Redownload | ||||
| from ...utils import fmt_path, soupify, url_set_query_param | ||||
| from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical | ||||
| from ..http_crawler import HttpCrawler, HttpCrawlerSection | ||||
| from .file_templates import Links | ||||
| from .ilias_html_cleaner import clean, insert_base_markup | ||||
| from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, | ||||
|                              _sanitize_path_name, parse_ilias_forum_export) | ||||
|  | ||||
# What KitIliasWebCrawlerSection.target() returns: a course id (int), or the literal
# "desktop" respectively an ILIAS URL (str).
TargetType = Union[str, int]

# Base URL of the KIT ILIAS instance. URL targets have to start with it.
_ILIAS_URL = "https://ilias.studium.kit.edu"
|  | ||||
|  | ||||
class KitShibbolethBackgroundLoginSuccessful:
    """
    Empty marker class without any members.

    NOTE(review): presumably signals that a Shibboleth login performed in the background
    succeeded - confirm against the code that uses it.
    """
|  | ||||
|  | ||||
class KitIliasWebCrawlerSection(HttpCrawlerSection):
    """
    Accessors for the options of a KIT ILIAS web crawler config section.
    """

    def target(self) -> TargetType:
        """
        Read the "target" option: a course id, "desktop" or a URL of the KIT ILIAS.

        A value made up of digits only is returned as int, the other two as str.
        A missing or unrecognized value is reported via missing_value / invalid_value.
        """
        target = self.s.get("target")
        if not target:
            self.missing_value("target")

        # Course id
        if re.fullmatch(r"\d+", target):
            return int(target)

        # Full personal desktop or ILIAS URL, both are passed on unchanged
        if target == "desktop" or target.startswith(_ILIAS_URL):
            return target

        self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")

    def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
        """
        Look up the authenticator named by the "tfa_auth" option, None if the option is unset.

        A name without a matching entry in authenticators is reported via invalid_value.
        """
        auth_name: Optional[str] = self.s.get("tfa_auth")
        if auth_name is None:
            return None

        if (auth := authenticators.get(auth_name)) is None:
            self.invalid_value("tfa_auth", auth_name, "No such auth section exists")
        return auth

    def links(self) -> Links:
        """
        Read the "links" option, defaulting to Links.FANCY if it is unset.

        A value that Links.from_string rejects is reported via invalid_value.
        """
        links_str: Optional[str] = self.s.get("links")
        if links_str is None:
            return Links.FANCY

        try:
            return Links.from_string(links_str)
        except ValueError as error:
            self.invalid_value("links", links_str, str(error).capitalize())

    def link_redirect_delay(self) -> int:
        """
        Read the "link_redirect_delay" option as int, -1 if it is unset.
        """
        delay = self.s.getint("link_redirect_delay", fallback=-1)
        return delay

    def videos(self) -> bool:
        """
        Read the "videos" option as bool, False if it is unset.
        """
        enabled = self.s.getboolean("videos", fallback=False)
        return enabled

    def forums(self) -> bool:
        """
        Read the "forums" option as bool, False if it is unset.
        """
        enabled = self.s.getboolean("forums", fallback=False)
        return enabled
|  | ||||
|  | ||||
# Element types that are crawled as pages containing further elements
_DIRECTORY_PAGES: Set[IliasElementType] = {
    IliasElementType.EXERCISE,
    IliasElementType.EXERCISE_FILES,
    IliasElementType.FOLDER,
    IliasElementType.MEETING,
    IliasElementType.VIDEO_FOLDER,
    IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
}

# Element types that are only crawled if the "videos" option is enabled
_VIDEO_ELEMENTS: Set[IliasElementType] = {
    IliasElementType.VIDEO,
    IliasElementType.VIDEO_PLAYER,
    IliasElementType.VIDEO_FOLDER,
    IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
}
|  | ||||
|  | ||||
def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]:
    """
    Build a decorator that runs an async I/O operation up to `attempts` times.

    Payload errors, connection errors and timeouts lead to another attempt. Once all
    attempts are used up, the last of these exceptions is re-raised as a CrawlError if
    failure_is_error is set, otherwise as a CrawlWarning. An invalid content type and
    a redirect loop are not retried but raised as a CrawlWarning right away.

    `name` is only used to describe the operation in the log.
    """
    def decorator(f: AWrapped) -> AWrapped:
        async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
            last_exception: Optional[BaseException] = None
            for attempt in range(attempts):
                try:
                    return await f(*args, **kwargs)
                except aiohttp.ContentTypeError:  # invalid content type
                    raise CrawlWarning("ILIAS returned an invalid content type")
                except aiohttp.TooManyRedirects:
                    raise CrawlWarning("Got stuck in a redirect loop")
                except (
                    aiohttp.ClientPayloadError,  # encoding or not enough bytes
                    aiohttp.ClientConnectionError,  # e.g. timeout, disconnect, resolve failed, etc.
                    asyncio.exceptions.TimeoutError,  # explicit http timeouts in HttpCrawler
                ) as e:
                    last_exception = e
                retries_left = attempts - 1 - attempt
                log.explain_topic(f"Retrying operation {name}. Retries left: {retries_left}")

            # Only reachable without an exception if the loop above never ran
            if not last_exception:
                raise CrawlError("Impossible return in ilias _iorepeat")

            message = f"Error in I/O Operation: {last_exception}"
            failure_type = CrawlError if failure_is_error else CrawlWarning
            raise failure_type(message) from last_exception

        return wrapper  # type: ignore
    return decorator
|  | ||||
|  | ||||
def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]:
    """
    Build a decorator that turns any I/O exception into a CrawlWarning, without retrying.
    """
    return _iorepeat(attempts=1, name=name)
|  | ||||
|  | ||||
| # Crawler control flow: | ||||
| # | ||||
| #     crawl_desktop -+ | ||||
| #                    | | ||||
| #     crawl_course --+ | ||||
| #                    | | ||||
| #     @_io_repeat    |        # retries internally (before the bar) | ||||
| #  +- crawl_url    <-+ | ||||
| #  | | ||||
| #  | | ||||
| #  |  @_wrap_io_exception     # does not need to retry as children acquire bars | ||||
| #  +> crawl_ilias_element -+ | ||||
| #  ^                       | | ||||
| #  |  @_io_repeat          |  # retries internally (before the bar) | ||||
| #  +- crawl_ilias_page <---+ | ||||
| #  |                       | | ||||
| #  +> get_page             |  # Handles and retries authentication | ||||
| #                          | | ||||
| #     @_io_repeat          |  # retries internally (before the bar) | ||||
| #  +- download_link    <---+ | ||||
| #  |                       | | ||||
| #  +> resolve_target       |  # Handles and retries authentication | ||||
| #                          | | ||||
| #     @_io_repeat          |  # retries internally (before the bar) | ||||
| #  +- download_video   <---+ | ||||
| #  |                       | | ||||
| #  |  @_io_repeat          |  # retries internally (before the bar) | ||||
| #  +- download_file    <---+ | ||||
| #  | | ||||
| #  +> stream_from_url         # Handles and retries authentication | ||||
|  | ||||
| class KitIliasWebCrawler(HttpCrawler): | ||||
|     def __init__( | ||||
|             self, | ||||
|             name: str, | ||||
|             section: KitIliasWebCrawlerSection, | ||||
|             config: Config, | ||||
|             authenticators: Dict[str, Authenticator] | ||||
|     ): | ||||
|         # Setting a main authenticator for cookie sharing | ||||
|         auth = section.auth(authenticators) | ||||
|         super().__init__(name, section, config, shared_auth=auth) | ||||
|  | ||||
|         if section.tasks() > 1: | ||||
|             log.warn(""" | ||||
| Please avoid using too many parallel requests as these are the KIT ILIAS | ||||
| instance's greatest bottleneck. | ||||
|             """.strip()) | ||||
|  | ||||
|         self._shibboleth_login = KitShibbolethLogin( | ||||
|             auth, | ||||
|             section.tfa_auth(authenticators), | ||||
|         ) | ||||
|  | ||||
|         self._base_url = _ILIAS_URL | ||||
|  | ||||
|         self._target = section.target() | ||||
|         self._link_file_redirect_delay = section.link_redirect_delay() | ||||
|         self._links = section.links() | ||||
|         self._videos = section.videos() | ||||
|         self._forums = section.forums() | ||||
|         self._visited_urls: Set[str] = set() | ||||
|  | ||||
|     async def _run(self) -> None: | ||||
|         if isinstance(self._target, int): | ||||
|             log.explain_topic(f"Inferred crawl target: Course with id {self._target}") | ||||
|             await self._crawl_course(self._target) | ||||
|         elif self._target == "desktop": | ||||
|             log.explain_topic("Inferred crawl target: Personal desktop") | ||||
|             await self._crawl_desktop() | ||||
|         else: | ||||
|             log.explain_topic(f"Inferred crawl target: URL {self._target}") | ||||
|             await self._crawl_url(self._target) | ||||
|  | ||||
|     async def _crawl_course(self, course_id: int) -> None: | ||||
|         # Start crawling at the given course | ||||
|         root_url = url_set_query_param( | ||||
|             self._base_url + "/goto.php", "target", f"crs_{course_id}" | ||||
|         ) | ||||
|  | ||||
|         await self._crawl_url(root_url, expected_id=course_id) | ||||
|  | ||||
|     async def _crawl_desktop(self) -> None: | ||||
|         appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items" | ||||
|         appendix = appendix.encode("ASCII").hex() | ||||
|         await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix) | ||||
|  | ||||
|     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: | ||||
|         maybe_cl = await self.crawl(PurePath(".")) | ||||
|         if not maybe_cl: | ||||
|             return | ||||
|         cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 | ||||
|  | ||||
|         elements: List[IliasPageElement] = [] | ||||
|         # A list as variable redefinitions are not propagated to outer scopes | ||||
|         description: List[BeautifulSoup] = [] | ||||
|  | ||||
|         @_iorepeat(3, "crawling url") | ||||
|         async def gather_elements() -> None: | ||||
|             elements.clear() | ||||
|             async with cl: | ||||
|                 next_stage_url: Optional[str] = url | ||||
|                 current_parent = None | ||||
|  | ||||
|                 # Duplicated code, but the root page is special - we want to avoid fetching it twice! | ||||
|                 while next_stage_url: | ||||
|                     soup = await self._get_page(next_stage_url) | ||||
|  | ||||
|                     if current_parent is None and expected_id is not None: | ||||
|                         perma_link_element: Tag = soup.find(id="current_perma_link") | ||||
|                         if not perma_link_element or "crs_" not in perma_link_element.get("value"): | ||||
|                             raise CrawlError("Invalid course id? Didn't find anything looking like a course") | ||||
|  | ||||
|                     log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") | ||||
|                     log.explain(f"URL: {next_stage_url}") | ||||
|                     page = IliasPage(soup, next_stage_url, current_parent) | ||||
|                     if next_element := page.get_next_stage_element(): | ||||
|                         current_parent = next_element | ||||
|                         next_stage_url = next_element.url | ||||
|                     else: | ||||
|                         next_stage_url = None | ||||
|  | ||||
|                 elements.extend(page.get_child_elements()) | ||||
|                 if description_string := page.get_description(): | ||||
|                     description.append(description_string) | ||||
|  | ||||
|         # Fill up our task list with the found elements | ||||
|         await gather_elements() | ||||
|  | ||||
|         if description: | ||||
|             await self._download_description(PurePath("."), description[0]) | ||||
|  | ||||
|         elements.sort(key=lambda e: e.id()) | ||||
|  | ||||
|         tasks: List[Awaitable[None]] = [] | ||||
|         for element in elements: | ||||
|             if handle := await self._handle_ilias_element(PurePath("."), element): | ||||
|                 tasks.append(asyncio.create_task(handle)) | ||||
|  | ||||
|         # And execute them | ||||
|         await self.gather(tasks) | ||||
|  | ||||
|     async def _handle_ilias_page( | ||||
|         self, | ||||
|         url: str, | ||||
|         parent: IliasPageElement, | ||||
|         path: PurePath, | ||||
|     ) -> Optional[Coroutine[Any, Any, None]]: | ||||
|         maybe_cl = await self.crawl(path) | ||||
|         if not maybe_cl: | ||||
|             return None | ||||
|         return self._crawl_ilias_page(url, parent, maybe_cl) | ||||
|  | ||||
|     @anoncritical | ||||
|     async def _crawl_ilias_page( | ||||
|         self, | ||||
|         url: str, | ||||
|         parent: IliasPageElement, | ||||
|         cl: CrawlToken, | ||||
|     ) -> None: | ||||
|         elements: List[IliasPageElement] = [] | ||||
|         # A list as variable redefinitions are not propagated to outer scopes | ||||
|         description: List[BeautifulSoup] = [] | ||||
|  | ||||
|         @_iorepeat(3, "crawling folder") | ||||
|         async def gather_elements() -> None: | ||||
|             elements.clear() | ||||
|             async with cl: | ||||
|                 next_stage_url: Optional[str] = url | ||||
|                 current_parent = parent | ||||
|  | ||||
|                 while next_stage_url: | ||||
|                     soup = await self._get_page(next_stage_url) | ||||
|                     log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") | ||||
|                     log.explain(f"URL: {next_stage_url}") | ||||
|                     page = IliasPage(soup, next_stage_url, current_parent) | ||||
|                     if next_element := page.get_next_stage_element(): | ||||
|                         current_parent = next_element | ||||
|                         next_stage_url = next_element.url | ||||
|                     else: | ||||
|                         next_stage_url = None | ||||
|  | ||||
|                 elements.extend(page.get_child_elements()) | ||||
|                 if description_string := page.get_description(): | ||||
|                     description.append(description_string) | ||||
|  | ||||
|         # Fill up our task list with the found elements | ||||
|         await gather_elements() | ||||
|  | ||||
|         if description: | ||||
|             await self._download_description(cl.path, description[0]) | ||||
|  | ||||
|         elements.sort(key=lambda e: e.id()) | ||||
|  | ||||
|         tasks: List[Awaitable[None]] = [] | ||||
|         for element in elements: | ||||
|             if handle := await self._handle_ilias_element(cl.path, element): | ||||
|                 tasks.append(asyncio.create_task(handle)) | ||||
|  | ||||
|         # And execute them | ||||
|         await self.gather(tasks) | ||||
|  | ||||
|     # These decorators only apply *to this method* and *NOT* to the returned | ||||
|     # awaitables! | ||||
|     # This method does not await the handlers but returns them instead. | ||||
|     # This ensures one level is handled at a time and name deduplication | ||||
|     # works correctly. | ||||
@anoncritical
async def _handle_ilias_element(
    self,
    parent_path: PurePath,
    element: IliasPageElement,
) -> Optional[Coroutine[Any, Any, None]]:
    """
    Dispatch a single ILIAS element to the handler matching its type.

    The matching ``_handle_*`` method is awaited here; whatever it returns
    (a not-yet-awaited coroutine or ``None``) is handed back to the caller.

    Raises a CrawlWarning if the element's URL was already visited or if the
    element type is unknown.
    """
    url = element.url
    if url in self._visited_urls:
        raise CrawlWarning(
            f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
        )
    self._visited_urls.add(url)

    element_path = PurePath(parent_path, element.name)
    kind = element.type

    # Video-like elements are skipped entirely unless the option is enabled
    if kind in _VIDEO_ELEMENTS and not self._videos:
        log.status(
            "[bold bright_black]",
            "Ignored",
            fmt_path(element_path),
            "[bright_black](enable with option 'videos')"
        )
        return None

    # Plain files and direct videos share the same download path
    if kind == IliasElementType.FILE or kind == IliasElementType.VIDEO:
        return await self._handle_file(element, element_path)

    if kind == IliasElementType.FORUM:
        if self._forums:
            return await self._handle_forum(element, element_path)
        log.status(
            "[bold bright_black]",
            "Ignored",
            fmt_path(element_path),
            "[bright_black](enable with option 'forums')"
        )
        return None

    if kind == IliasElementType.TEST:
        log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
        log.explain("Tests contain no relevant files")
        log.explain("Answer: No")
        return None

    if kind == IliasElementType.LINK:
        return await self._handle_link(element, element_path)

    if kind == IliasElementType.BOOKING:
        return await self._handle_booking(element, element_path)

    if kind == IliasElementType.VIDEO_PLAYER:
        return await self._handle_video(element, element_path)

    if kind in _DIRECTORY_PAGES:
        return await self._handle_ilias_page(element.url, element, element_path)

    # Nothing matched. No network request is made on this path.
    raise CrawlWarning(f"Unknown element type: {element.type!r}")
|  | ||||
async def _handle_link(
    self,
    element: IliasPageElement,
    element_path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]:
    """
    Decide whether a link element is stored as a link file.

    Returns the not-yet-awaited download coroutine, or ``None`` if the links
    setting provides no template/extension or the download was declined.
    """
    log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
    log.explain(f"Links type is {self._links}")

    template = self._links.template()
    extension = self._links.extension()
    if not (template and extension):
        log.explain("Answer: No")
        return None
    log.explain("Answer: Yes")

    # The link file carries the extension dictated by the links setting
    link_path = element_path.with_name(element_path.name + extension)

    token = await self.download(link_path, mtime=element.mtime)
    if not token:
        return None

    return self._download_link(element, template, token)
|  | ||||
@anoncritical
@_iorepeat(3, "resolving link")
async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
    """Resolve the real target of a link element and write the link file."""
    async with dl as (_bar, sink):
        # The target is resolved through the link's HTML export
        export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
        target_url = await self._resolve_link_target(export_url)
        self._write_link_content(
            link_template,
            target_url,
            element.name,
            element.description,
            sink,
        )
|  | ||||
|     def _write_link_content( | ||||
|         self, | ||||
|         link_template: str, | ||||
|         url: str, | ||||
|         name: str, | ||||
|         description: Optional[str], | ||||
|         sink: FileSink, | ||||
|     ) -> None: | ||||
|         content = link_template | ||||
|         content = content.replace("{{link}}", url) | ||||
|         content = content.replace("{{name}}", name) | ||||
|         content = content.replace("{{description}}", str(description)) | ||||
|         content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) | ||||
|         sink.file.write(content.encode("utf-8")) | ||||
|         sink.done() | ||||
|  | ||||
async def _handle_booking(
    self,
    element: IliasPageElement,
    element_path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]:
    """
    Decide whether a booking element is stored as a link file.

    Returns the not-yet-awaited download coroutine, or ``None`` if the links
    setting provides no template/extension or the download was declined.
    """
    log.explain_topic(f"Decision: Crawl Booking Link {fmt_path(element_path)}")
    log.explain(f"Links type is {self._links}")

    template = self._links.template()
    extension = self._links.extension()
    if not (template and extension):
        log.explain("Answer: No")
        return None
    log.explain("Answer: Yes")

    # The link file carries the extension dictated by the links setting
    booking_path = element_path.with_name(element_path.name + extension)

    token = await self.download(booking_path, mtime=element.mtime)
    if not token:
        return None

    return self._download_booking(element, template, token)
|  | ||||
@anoncritical
@_iorepeat(1, "downloading description")
async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None:
    """Write the given description markup to ``Description.html`` below ``parent_path``."""
    target = parent_path / "Description.html"
    token = await self.download(target, redownload=Redownload.ALWAYS)
    if not token:
        return

    async with token as (_bar, sink):
        # Wrap the fragment in the base markup, then clean it up
        cleaned = clean(insert_base_markup(description))
        rendered = cleaned.prettify()
        sink.file.write(rendered.encode("utf-8"))
        sink.done()
|  | ||||
@anoncritical
@_iorepeat(3, "resolving booking")
async def _download_booking(
    self,
    element: IliasPageElement,
    link_template: str,
    dl: DownloadToken,
) -> None:
    """Write a link file that points directly at the booking element's URL."""
    async with dl as (_bar, sink):
        self._write_link_content(
            link_template=link_template,
            url=element.url,
            name=element.name,
            description=element.description,
            sink=sink,
        )
|  | ||||
async def _resolve_link_target(self, export_url: str) -> str:
    """
    Resolve the real target URL of a link from its HTML export page.

    The export page is requested without following redirects. A response
    carrying a ``Location`` header is treated as "not authenticated". In that
    case we authenticate and try exactly once more.

    Authentication goes through ``_current_auth_id()`` / ``authenticate()``,
    the same way the other request helpers of this crawler do it.

    Raises a CrawlError if the export page contains no link or if the request
    is still redirected after authenticating.
    """
    async def try_resolve() -> Optional[str]:
        async with self.session.get(export_url, allow_redirects=False) as resp:
            # A redirect means we weren't authenticated
            if hdrs.LOCATION in resp.headers:
                return None

            anchor = soupify(await resp.read()).select_one("a")
            href = anchor.get("href") if anchor is not None else None
            if href is None:
                raise CrawlError(f"Could not find a link target in the export at {export_url!r}")
            return href.strip()

    auth_id = await self._current_auth_id()

    target = await try_resolve()
    if target is not None:
        return target

    await self.authenticate(auth_id)

    # Retry once after authenticating
    target = await try_resolve()
    if target is not None:
        return target

    raise CrawlError("resolve_link_target failed even after authenticating")
|  | ||||
async def _handle_video(
    self,
    element: IliasPageElement,
    element_path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]:
    """
    Decide whether a video player element needs to be crawled.

    Returns the not-yet-awaited download coroutine, or ``None`` if the
    download was declined or all previously recorded videos exist locally.
    """
    report_key = str(element_path)

    # Carry the old mapping over, it is likely still relevant
    if self.prev_report:
        self.report.add_custom_value(report_key, self.prev_report.get_custom_value(report_key))

    # A video might contain other videos, so let's "crawl" the video first
    # to ensure rate limits apply. This must be a download as *this token*
    # is re-used if the video consists of a single stream. In that case the
    # file name is used and *not* the stream name the ilias html parser reported
    # to ensure backwards compatibility.
    token = await self.download(element_path, mtime=element.mtime, redownload=Redownload.ALWAYS)

    if token and not self._all_videos_locally_present(element_path):
        return self._download_video(element_path, element, token)

    # Either the user filtered this element out or every file from the cached
    # mapping is already present. Mark all existing videos as known so they do
    # not get deleted during cleanup. We "downloaded" them, just without
    # actually making a network request as we assumed they did not change.
    for known_video in self._previous_contained_videos(element_path):
        await self.download(known_video)

    return None
|  | ||||
|     def _previous_contained_videos(self, video_path: PurePath) -> List[PurePath]: | ||||
|         if not self.prev_report: | ||||
|             return [] | ||||
|         custom_value = self.prev_report.get_custom_value(str(video_path)) | ||||
|         if not custom_value: | ||||
|             return [] | ||||
|         names = cast(List[str], custom_value) | ||||
|         folder = video_path.parent | ||||
|         return [PurePath(folder, name) for name in names] | ||||
|  | ||||
def _all_videos_locally_present(self, video_path: PurePath) -> bool:
    """
    Check whether every previously recorded video of ``video_path`` exists locally.

    Returns ``False`` if nothing was recorded. Recorded videos whose path is
    dropped by the transformer are not taken into account.
    """
    recorded_videos = self._previous_contained_videos(video_path)
    if not recorded_videos:
        return False

    log.explain_topic(f"Checking local cache for video {video_path.name}")

    any_missing = False
    for recorded in recorded_videos:
        local_path = self._to_local_video_path(recorded)
        if not local_path:
            continue
        if not self._output_dir.resolve(local_path).exists():
            any_missing = True

    if any_missing:
        log.explain("Missing at least one video, continuing with requests!")
        return False

    log.explain("Found all videos locally, skipping enumeration request")
    return True
|  | ||||
|     def _to_local_video_path(self, path: PurePath) -> Optional[PurePath]: | ||||
|         if transformed := self._transformer.transform(path): | ||||
|             return self._deduplicator.fixup_path(transformed) | ||||
|         return None | ||||
|  | ||||
@anoncritical
@_iorepeat(3, "downloading video")
async def _download_video(
    self,
    original_path: PurePath,
    element: IliasPageElement,
    dl: DownloadToken
) -> None:
    """
    Download all streams of a video player element.

    With a single stream, the passed download token (and therefore the
    element's own file name) is used. With multiple streams, each stream is
    downloaded next to ``original_path`` under the stream's name.

    In both cases the names of the contained videos are recorded in the report
    under ``str(original_path)``. The names are stored relative to the parent
    folder, because ``_previous_contained_videos`` joins them onto that folder
    again when reading the record.

    Raises a CrawlWarning if the page lists no stream at all.
    """
    stream_elements: List[IliasPageElement] = []
    async with dl as (bar, sink):
        page = IliasPage(await self._get_page(element.url), element.url, element)
        stream_elements = page.get_child_elements()

        if not stream_elements:
            raise CrawlWarning(f"Found no video streams for {element.name!r}")

        if len(stream_elements) > 1:
            log.explain(f"Found multiple video streams for {element.name}")
        else:
            log.explain(f"Using single video mode for {element.name}")
            stream_element = stream_elements[0]

            transformed_path = self._to_local_video_path(original_path)
            if not transformed_path:
                raise CrawlError(f"Download returned a path but transform did not for {original_path}")

            # We do not have a local cache yet
            if self._output_dir.resolve(transformed_path).exists():
                log.explain(f"Video for {element.name} existed locally")
            else:
                await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
            self.report.add_custom_value(str(original_path), [original_path.name])
            return

    contained_video_names: List[str] = []

    for stream_element in stream_elements:
        video_path = original_path.parent / stream_element.name
        # Record only the name: the parent folder is added back when the record is read
        contained_video_names.append(stream_element.name)

        maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER)
        if not maybe_dl:
            continue
        async with maybe_dl as (bar, sink):
            log.explain(f"Streaming video from real url {stream_element.url}")
            await self._stream_from_url(stream_element.url, sink, bar, is_video=True)

    self.report.add_custom_value(str(original_path), contained_video_names)
|  | ||||
|     async def _handle_file( | ||||
|         self, | ||||
|         element: IliasPageElement, | ||||
|         element_path: PurePath, | ||||
|     ) -> Optional[Coroutine[Any, Any, None]]: | ||||
|         maybe_dl = await self.download(element_path, mtime=element.mtime) | ||||
|         if not maybe_dl: | ||||
|             return None | ||||
|         return self._download_file(element, maybe_dl) | ||||
|  | ||||
@anoncritical
@_iorepeat(3, "downloading file")
async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None:
    """Stream the file behind ``element.url`` into the sink of the download token."""
    assert dl  # The function is only reached when dl is not None
    async with dl as (progress, target):
        await self._stream_from_url(element.url, target, progress, is_video=False)
|  | ||||
|     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: | ||||
|         async def try_stream() -> bool: | ||||
|             async with self.session.get(url, allow_redirects=is_video) as resp: | ||||
|                 if not is_video: | ||||
|                     # Redirect means we weren't authenticated | ||||
|                     if hdrs.LOCATION in resp.headers: | ||||
|                         return False | ||||
|                 # we wanted a video but got HTML | ||||
|                 if is_video and "html" in resp.content_type: | ||||
|                     return False | ||||
|  | ||||
|                 if resp.content_length: | ||||
|                     bar.set_total(resp.content_length) | ||||
|  | ||||
|                 async for data in resp.content.iter_chunked(1024): | ||||
|                     sink.file.write(data) | ||||
|                     bar.advance(len(data)) | ||||
|  | ||||
|                 sink.done() | ||||
|             return True | ||||
|  | ||||
|         auth_id = await self._current_auth_id() | ||||
|         if await try_stream(): | ||||
|             return | ||||
|  | ||||
|         await self.authenticate(auth_id) | ||||
|  | ||||
|         if not await try_stream(): | ||||
|             raise CrawlError("File streaming failed after authenticate()") | ||||
|  | ||||
|     async def _handle_forum( | ||||
|         self, | ||||
|         element: IliasPageElement, | ||||
|         element_path: PurePath, | ||||
|     ) -> Optional[Coroutine[Any, Any, None]]: | ||||
|         maybe_cl = await self.crawl(element_path) | ||||
|         if not maybe_cl: | ||||
|             return None | ||||
|         return self._crawl_forum(element, maybe_cl) | ||||
|  | ||||
@_iorepeat(3, "crawling forum")
@anoncritical
async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None:
    """
    Export all threads of a forum and store each thread as its own file.

    The forum page is followed through its "next stage" links first. The
    export form found on the final page is then posted and the result is
    parsed into threads.

    Raises a CrawlWarning if no export data can be extracted from the page.
    """
    threads: List[IliasForumThread] = []

    async with cl:
        stage_url = element.url
        while stage_url:
            log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
            log.explain(f"URL: {stage_url}")

            soup = await self._get_page(stage_url)
            page = IliasPage(soup, stage_url, None)

            following_stage = page.get_next_stage_element()
            if not following_stage:
                break
            stage_url = following_stage.url

        download_data = page.get_download_forum_data()
        if not download_data:
            raise CrawlWarning("Failed to extract forum data")
        if download_data.empty:
            log.explain("Forum had no threads")
            return
        html = await self._post_authenticated(download_data.url, download_data.form_data)
        threads = parse_ilias_forum_export(soupify(html))

    threads.sort(key=lambda thread: thread.title)

    # Spawn one task per thread, in title order, and wait for all of them
    tasks: List[Awaitable[None]] = [
        asyncio.create_task(self._download_forum_thread(cl.path, thread))
        for thread in threads
    ]
    await self.gather(tasks)
|  | ||||
@anoncritical
@_iorepeat(3, "saving forum thread")
async def _download_forum_thread(
    self,
    parent_path: PurePath,
    element: IliasForumThread,
) -> None:
    """Store a forum thread as ``<sanitized title>.html`` below ``parent_path``."""
    file_name = _sanitize_path_name(element.title) + ".html"
    token = await self.download(parent_path / file_name, mtime=element.mtime)
    if not token:
        return

    async with token as (_bar, sink):
        # The title markup comes first, followed by the thread content
        rendered = element.title_tag.prettify() + element.content_tag.prettify()
        sink.file.write(rendered.encode("utf-8"))
        sink.done()
|  | ||||
async def _get_page(self, url: str) -> BeautifulSoup:
    """
    Fetch ``url`` and return the parsed page.

    If the page does not look logged in, we authenticate and fetch it exactly
    once more.

    Raises a CrawlError if the second page does not look logged in either.
    """
    async def fetch_logged_in_page() -> Optional[BeautifulSoup]:
        async with self.session.get(url) as request:
            soup = soupify(await request.read())
            if self._is_logged_in(soup):
                return soup
        return None

    auth_id = await self._current_auth_id()

    page = await fetch_logged_in_page()
    if page is not None:
        return page

    # We weren't authenticated, so try to do that
    await self.authenticate(auth_id)

    # Retry once after authenticating. If this fails, we will die.
    page = await fetch_logged_in_page()
    if page is not None:
        return page
    raise CrawlError("get_page failed even after authenticating")
|  | ||||
async def _post_authenticated(
    self,
    url: str,
    data: dict[str, Union[str, List[str]]]
) -> bytes:
    """
    POST ``data`` as form data to ``url`` and return the raw response body.

    Redirects are not followed. Any status other than 200 is treated as "not
    authenticated", in which case we authenticate and post exactly once more.

    Both attempts send the same kind of payload: a freshly built
    ``aiohttp.FormData`` instance.

    Raises a CrawlError if the second attempt does not return status 200.
    """
    async def try_post() -> Optional[bytes]:
        # Build a new form per attempt instead of re-using an already consumed one
        form_data = aiohttp.FormData()
        for key, val in data.items():
            form_data.add_field(key, val)

        async with self.session.post(url, data=form_data(), allow_redirects=False) as request:
            if request.status == 200:
                return await request.read()
        return None

    auth_id = await self._current_auth_id()

    body = await try_post()
    if body is not None:
        return body

    # We weren't authenticated, so try to do that
    await self.authenticate(auth_id)

    # Retry once after authenticating. If this fails, we will die.
    body = await try_post()
    if body is not None:
        return body
    raise CrawlError("post_authenticated failed even after authenticating")
|  | ||||
|     # We repeat this as the login method in shibboleth doesn't handle I/O errors. | ||||
|     # Shibboleth is quite reliable as well, the repeat is likely not critical here. | ||||
@_iorepeat(3, "Login", failure_is_error=True)
async def _authenticate(self) -> None:
    """Log in through the Shibboleth login helper, using this crawler's session."""
    shibboleth = self._shibboleth_login
    await shibboleth.login(self.session)
|  | ||||
|     @ staticmethod | ||||
|     def _is_logged_in(soup: BeautifulSoup) -> bool: | ||||
|         # Normal ILIAS pages | ||||
|         mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") | ||||
|         if mainbar is not None: | ||||
|             login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) | ||||
|             shib_login = soup.find(id="button_shib_login") | ||||
|             return not login_button and not shib_login | ||||
|  | ||||
|         # Personal Desktop | ||||
|         if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): | ||||
|             return True | ||||
|  | ||||
|         # Video listing embeds do not have complete ILIAS html. Try to match them by | ||||
|         # their video listing table | ||||
|         video_table = soup.find( | ||||
|             recursive=True, | ||||
|             name="table", | ||||
|             attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} | ||||
|         ) | ||||
|         if video_table is not None: | ||||
|             return True | ||||
|         # The individual video player wrapper page has nothing of the above. | ||||
|         # Match it by its playerContainer. | ||||
|         if soup.select_one("#playerContainer") is not None: | ||||
|             return True | ||||
|         return False | ||||
|  | ||||
|  | ||||
class KitShibbolethLogin:
    """
    Login via KIT's shibboleth system.
    """

    def __init__(self, authenticator: Authenticator, tfa_authenticator: Optional[Authenticator]) -> None:
        self._auth = authenticator
        self._tfa_auth = tfa_authenticator

    async def login(self, sess: aiohttp.ClientSession) -> None:
        """
        Performs the ILIAS Shibboleth authentication dance and saves the login
        cookies it receives.

        This function should only be called whenever it is detected that you're
        not logged in. The cookies obtained should be good for a few minutes,
        maybe even an hour or two.

        Raises a CrawlError if the login form cannot be found or if the
        Shibboleth entitlements have to be reviewed in a browser first.
        """

        # Equivalent: Click on "Mit KIT-Account anmelden" button in
        # https://ilias.studium.kit.edu/login.php
        url = f"{_ILIAS_URL}/shib_login.php"
        data = {
            "sendLogin": "1",
            "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
            "il_target": "",
            "home_organization_selection": "Weiter",
        }
        soup: Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful] = await _shib_post(sess, url, data)

        if isinstance(soup, KitShibbolethBackgroundLoginSuccessful):
            return

        # Attempt to login using credentials, if necessary
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
            form = soup.find("form", {"class": "full content", "method": "post"})
            if form is None:
                raise CrawlError("Could not find the login form on the Shibboleth page")
            action = form["action"]

            csrf_token = form.find("input", {"name": "csrf_token"})["value"]

            # Equivalent: Enter credentials in
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
            url = "https://idp.scc.kit.edu" + action
            username, password = await self._auth.credentials()
            data = {
                "_eventId_proceed": "",
                "j_username": username,
                "j_password": password,
                "csrf_token": csrf_token
            }
            soup = await _post(sess, url, data)

            if soup.find(id="attributeRelease"):
                raise CrawlError(
                    "ILIAS Shibboleth entitlements changed! "
                    "Please log in once in your browser and review them"
                )

            if self._tfa_required(soup):
                soup = await self._authenticate_tfa(sess, soup)

            if not self._login_successful(soup):
                self._auth.invalidate_credentials()

        # Equivalent: Being redirected via JS automatically
        # (or clicking "Continue" if you have JS disabled)
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        url = f"{_ILIAS_URL}/Shibboleth.sso/SAML2/POST"
        data = {  # using the info obtained in the while loop above
            "RelayState": relay_state["value"],
            "SAMLResponse": saml_response["value"],
        }
        # Only the cookies matter here. The context manager makes sure the
        # response is released instead of being left open.
        async with sess.post(url, data=data):
            pass

    async def _authenticate_tfa(
            self,
            session: aiohttp.ClientSession,
            soup: BeautifulSoup
    ) -> BeautifulSoup:
        """
        Submit a two-factor token to the form found in ``soup``.

        Returns the page sent in response. Raises a CrawlError if ``soup``
        contains no form to submit the token to.
        """
        # Searching the form here so that this fails before asking for
        # credentials rather than after asking.
        form = soup.find("form", {"method": "post"})
        if form is None:
            raise CrawlError("Could not find the two-factor form on the Shibboleth page")
        action = form["action"]
        csrf_token = form.find("input", {"name": "csrf_token"})["value"]

        if not self._tfa_auth:
            self._tfa_auth = TfaAuthenticator("ilias-anon-tfa")

        tfa_token = await self._tfa_auth.password()

        # Equivalent: Enter token in
        # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
        url = "https://idp.scc.kit.edu" + action
        data = {
            "_eventId_proceed": "",
            "j_tokenNumber": tfa_token,
            "csrf_token": csrf_token
        }
        return await _post(session, url, data)

    @staticmethod
    def _login_successful(soup: BeautifulSoup) -> bool:
        """Return whether the page carries both the RelayState and SAMLResponse inputs."""
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        return relay_state is not None and saml_response is not None

    @staticmethod
    def _tfa_required(soup: BeautifulSoup) -> bool:
        """Return whether the page contains the two-factor token input."""
        return soup.find(id="j_tokenNumber") is not None
|  | ||||
|  | ||||
async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
    """POST ``data`` to ``url`` and return the parsed response body."""
    async with session.post(url, data=data) as response:
        body = await response.read()
        return soupify(body)
|  | ||||
|  | ||||
async def _shib_post(
    session: aiohttp.ClientSession,
    url: str,
    data: Any
) -> Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful]:
    """
    aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected
    by Shibboleth. Thanks a lot. So now we unroll the requests manually, parse location headers and
    build encoded URL objects ourselves... Who thought mangling location header was a good idea??

    Returns the parsed page of the last request made, or a
    KitShibbolethBackgroundLoginSuccessful marker if the first redirect
    already points back to ILIAS.
    """
    log.explain_topic("Shib login POST")
    async with session.post(url, data=data, allow_redirects=False) as post_response:
        first_location = post_response.headers.get("location")
        log.explain(f"Got location {first_location!r}")
        if not first_location:
            raise CrawlWarning(f"Login failed (1), no location header present at {url}")
        first_url = yarl.URL(first_location, encoded=True)
        log.explain(f"Corrected location to {first_url!r}")

        if str(first_url).startswith(_ILIAS_URL):
            log.explain("ILIAS recognized our shib token and logged us in in the background, returning")
            return KitShibbolethBackgroundLoginSuccessful()

        async with session.get(first_url, allow_redirects=False) as first_response:
            second_location = first_response.headers.get("location")
            log.explain(f"Redirected to {second_location!r} with status {first_response.status}")
            # If shib still has a valid session, it will directly respond to the request
            if second_location is None:
                log.explain("Shib recognized us, returning its response directly")
                return soupify(await first_response.read())

            response_url = yarl.URL(first_response.url)
            # Probably not needed anymore, but might catch a few weird situations with a nicer message
            if not second_location or not response_url.host:
                raise CrawlWarning(f"Login failed (2), no location header present at {first_url}")

            # The location is used as the path on the host that answered the previous request
            second_url = yarl.URL.build(
                scheme=response_url.scheme,
                host=response_url.host,
                path=second_location,
                encoded=True
            )
            log.explain(f"Corrected location to {second_url!r}")

            async with session.get(second_url, allow_redirects=False) as second_response:
                return soupify(await second_response.read())
							
								
								
									
										170
									
								
								PFERD/crawl/kit_ipd_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										170
									
								
								PFERD/crawl/kit_ipd_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,170 @@ | ||||
| import os | ||||
| import re | ||||
| from dataclasses import dataclass | ||||
| from pathlib import PurePath | ||||
| from typing import Awaitable, List, Optional, Pattern, Set, Union | ||||
| from urllib.parse import urljoin | ||||
|  | ||||
| from bs4 import BeautifulSoup, Tag | ||||
|  | ||||
| from ..config import Config | ||||
| from ..logging import ProgressBar, log | ||||
| from ..output_dir import FileSink | ||||
| from ..utils import soupify | ||||
| from .crawler import CrawlError | ||||
| from .http_crawler import HttpCrawler, HttpCrawlerSection | ||||
|  | ||||
|  | ||||
class KitIpdCrawlerSection(HttpCrawlerSection):
    """Config section of the KIT IPD crawler."""

    def target(self) -> str:
        """Return the ``target`` option, which must be set and start with ``https://``."""
        url = self.s.get("target")
        if not url:
            self.missing_value("target")

        if not url.startswith("https://"):
            self.invalid_value("target", url, "Should be a URL")

        return url

    def link_regex(self) -> Pattern[str]:
        """Return the compiled ``link_regex`` option, falling back to the default pattern."""
        default_pattern = r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$"
        return re.compile(self.s.get("link_regex", default_pattern))
|  | ||||
|  | ||||
@dataclass(frozen=True)
class KitIpdFile:
    """
    A downloadable file found on the crawled page.

    Instances are stored in sets, so the class is frozen: equality and hash are
    both derived from the fields and cannot go stale through later mutation.
    """

    # File name used in the output directory.
    name: str
    # Absolute URL the file is downloaded from.
    url: str
|  | ||||
|  | ||||
@dataclass
class KitIpdFolder:
    """A named group of files found on the crawled page."""

    # Folder name, also the only input to the hash.
    name: str
    # Files that belong to this folder.
    files: List[KitIpdFile]

    def explain(self) -> None:
        """Log the folder name as a topic, followed by one line per file."""
        log.explain_topic(f"Folder {self.name!r}")
        for entry in self.files:
            log.explain(f"File {entry.name!r} (href={entry.url!r})")

    def __hash__(self) -> int:
        # Hash by name only; the file list is mutable and therefore unhashable.
        return hash(self.name)
|  | ||||
|  | ||||
class KitIpdCrawler(HttpCrawler):
    """
    Crawler that downloads the files linked from a single web page.

    Every ``<a>`` tag whose ``href`` matches the configured link regex counts
    as a file. A link inside a ``<table>`` that has a preceding heading sibling
    (``h1``-``h6``) is grouped into a folder named after that heading. All
    other links are treated as orphans and placed in the root folder.
    """

    def __init__(
            self,
            name: str,
            section: KitIpdCrawlerSection,
            config: Config,
    ):
        super().__init__(name, section, config)
        self._url = section.target()
        self._file_regex = section.link_regex()

    async def _run(self) -> None:
        """Fetch the page and schedule one task per folder or orphan file."""
        maybe_cl = await self.crawl(PurePath("."))
        if not maybe_cl:
            return

        tasks: List[Awaitable[None]] = []

        async with maybe_cl:
            for item in await self._fetch_items():
                if isinstance(item, KitIpdFolder):
                    tasks.append(self._crawl_folder(item))
                else:
                    # Orphan files are placed in the root folder
                    tasks.append(self._download_file(PurePath("."), item))

        await self.gather(tasks)

    async def _crawl_folder(self, folder: KitIpdFolder) -> None:
        """Download all files of ``folder`` into a directory named after it."""
        path = PurePath(folder.name)
        if not await self.crawl(path):
            return

        tasks = [self._download_file(path, file) for file in folder.files]

        await self.gather(tasks)

    async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
        """Download ``file`` to ``parent / file.name`` unless ``download`` declines."""
        element_path = parent / file.name
        maybe_dl = await self.download(element_path)
        if not maybe_dl:
            return

        async with maybe_dl as (bar, sink):
            await self._stream_from_url(file.url, sink, bar)

    async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
        """
        Parse the target page into a set of folders and orphan files.

        The set de-duplicates folders: every link inside the same labelled
        table resolves to the same folder, which is only added once.
        """
        page = await self.get_page()
        elements: List[Tag] = self._find_file_links(page)
        items: Set[Union[KitIpdFile, KitIpdFolder]] = set()

        for element in elements:
            folder_label = self._find_folder_label(element)
            if folder_label:
                folder = self._extract_folder(folder_label)
                if folder not in items:
                    items.add(folder)
                    folder.explain()
            else:
                file = self._extract_file(element)
                items.add(file)
                log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                log.explain("Attributing it to root folder")

        return items

    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
        """
        Build a folder from a heading tag.

        The folder name is the stripped heading text; its files are the
        matching links inside the next ``<table>`` sibling of the heading.
        """
        name = folder_tag.getText().strip()

        container: Tag = folder_tag.findNextSibling(name="table")
        files = [self._extract_file(link) for link in self._find_file_links(container)]

        return KitIpdFolder(name, files)

    @staticmethod
    def _find_folder_label(file_link: Tag) -> Optional[Tag]:
        """
        Return the heading that labels the table containing ``file_link``.

        Returns None if the link is not inside a table or the table has no
        preceding ``h1``-``h6`` sibling.
        """
        enclosing_table: Optional[Tag] = file_link.findParent(name="table")
        if enclosing_table is None:
            return None
        return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))

    def _extract_file(self, link: Tag) -> KitIpdFile:
        """Build a file from a link; its name is the last segment of the URL."""
        url = self._abs_url_from_link(link)
        name = os.path.basename(url)
        return KitIpdFile(name, url)

    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
        """Return all ``<a>`` tags below ``tag`` whose href matches the link regex."""
        return tag.findAll(name="a", attrs={"href": self._file_regex})

    def _abs_url_from_link(self, link_tag: Tag) -> str:
        """Resolve the link's href against the crawled page URL."""
        return urljoin(self._url, link_tag.get("href"))

    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
        """
        Stream the response body of ``url`` into ``sink`` while updating ``bar``.

        Raises a CrawlError for HTTP error responses so that an error page is
        never stored as if it were the requested file.
        """
        async with self.session.get(url, allow_redirects=False) as resp:
            if resp.status == 403:
                raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
            if resp.status >= 400:
                # Without this check the body of e.g. a 404 page would be
                # written to disk and reported as a successful download.
                raise CrawlError(f"Received HTTP status {resp.status} for {url!r}")
            if resp.content_length:
                bar.set_total(resp.content_length)

            async for data in resp.content.iter_chunked(1024):
                sink.file.write(data)
                bar.advance(len(data))

            sink.done()

    async def get_page(self) -> BeautifulSoup:
        """Fetch the target page and parse it after removing HTML comments."""
        async with self.session.get(self._url) as request:
            # The web page for Algorithmen für Routenplanung contains some
            # weird comments that beautifulsoup doesn't parse correctly. This
            # hack enables those pages to be crawled, and should hopefully not
            # cause issues on other pages.
            # NOTE(review): "." does not match newlines here, so only comments
            # contained in a single line are removed - confirm this is intended.
            content = (await request.read()).decode("utf-8")
            content = re.sub(r"<!--.*?-->", "", content)
            return soupify(content.encode("utf-8"))
							
								
								
									
										117
									
								
								PFERD/crawl/local_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										117
									
								
								PFERD/crawl/local_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,117 @@ | ||||
| import asyncio | ||||
| import datetime | ||||
| import random | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Optional | ||||
|  | ||||
| from ..config import Config | ||||
| from .crawler import Crawler, CrawlerSection, anoncritical | ||||
|  | ||||
|  | ||||
class LocalCrawlerSection(CrawlerSection):
    """Typed accessors for the config options read by the local crawler."""

    def target(self) -> Path:
        """Return the ``target`` option as a path with ``~`` expanded."""
        raw = self.s.get("target")
        if raw is None:
            self.missing_value("target")
        return Path(raw).expanduser()

    def _non_negative_delay(self, key: str) -> float:
        # Shared lookup for the delay options: defaults to 0.0 and reports
        # negative values as invalid.
        delay = self.s.getfloat(key, fallback=0.0)
        if delay < 0:
            self.invalid_value(key, delay,
                               "Must not be negative")
        return delay

    def crawl_delay(self) -> float:
        """Return the ``crawl_delay`` option (default 0.0, never negative)."""
        return self._non_negative_delay("crawl_delay")

    def download_delay(self) -> float:
        """Return the ``download_delay`` option (default 0.0, never negative)."""
        return self._non_negative_delay("download_delay")

    def download_speed(self) -> Optional[int]:
        """Return the ``download_speed`` option, which must be positive if set."""
        speed = self.s.getint("download_speed")
        if speed is None:
            return speed
        if speed <= 0:
            self.invalid_value("download_speed", speed,
                               "Must be greater than 0")
        return speed
|  | ||||
|  | ||||
class LocalCrawler(Crawler):
    """
    Crawler that copies a local file or directory tree into the output dir.

    Optional random delays before listing a directory and before copying a
    file, plus an optional speed limit, are applied via ``asyncio.sleep``.
    """

    def __init__(
            self,
            name: str,
            section: LocalCrawlerSection,
            config: Config,
    ):
        super().__init__(name, section, config)

        self._target = config.default_section.working_dir() / section.target()
        self._crawl_delay = section.crawl_delay()
        self._download_delay = section.download_delay()
        self._download_speed = section.download_speed()

        if self._download_speed:
            # Copy in blocks of a tenth of the configured speed. Speeds below
            # 10 would floor to a block size of 0, and reading 0 bytes returns
            # b"", which ends the copy loop at once and leaves an empty file.
            self._block_size = max(1, self._download_speed // 10)
        else:
            self._block_size = 1024**2  # 1 MiB

    async def _run(self) -> None:
        await self._crawl_path(self._target, PurePath())

    @anoncritical
    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
        """Dispatch on the kind of ``path``; other kinds of entries are ignored."""
        if path.is_dir():
            await self._crawl_dir(path, pure)
        elif path.is_file():
            await self._crawl_file(path, pure)

    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
        """List ``path`` and crawl every child, unless ``crawl`` declines."""
        cl = await self.crawl(pure)
        if not cl:
            return

        tasks = []

        async with cl:
            # Sleep for a random duration between half and the full delay.
            await asyncio.sleep(random.uniform(
                0.5 * self._crawl_delay,
                self._crawl_delay,
            ))

            for child in path.iterdir():
                pure_child = cl.path / child.name
                tasks.append(self._crawl_path(child, pure_child))

        await self.gather(tasks)

    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        """Copy ``path`` block by block into the sink, unless ``download`` declines."""
        stat = path.stat()
        mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
        dl = await self.download(pure, mtime=mtime)
        if not dl:
            return

        async with dl as (bar, sink):
            # Sleep for a random duration between half and the full delay.
            await asyncio.sleep(random.uniform(
                0.5 * self._download_delay,
                self._download_delay,
            ))

            bar.set_total(stat.st_size)

            with open(path, "rb") as f:
                while True:
                    data = f.read(self._block_size)
                    if len(data) == 0:
                        break

                    sink.file.write(data)
                    bar.advance(len(data))

                    if self._download_speed:
                        # Time one block should take at the configured speed,
                        # jittered by +-20%.
                        delay = self._block_size / self._download_speed
                        delay = random.uniform(0.8 * delay, 1.2 * delay)
                        await asyncio.sleep(delay)

                sink.done()
							
								
								
									
										85
									
								
								PFERD/deduplicator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										85
									
								
								PFERD/deduplicator.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,85 @@ | ||||
| from pathlib import PurePath | ||||
| from typing import Iterator, Set | ||||
|  | ||||
| from .logging import log | ||||
| from .utils import fmt_path | ||||
|  | ||||
|  | ||||
def name_variants(path: PurePath) -> Iterator[PurePath]:
    """
    Yield an endless series of alternative names for ``path``.

    A counter starting at 1 is inserted between stem and suffix. It is
    separated by a space if the stem already contains one, and by an
    underscore otherwise.
    """
    parent, stem, suffix = path.parent, path.stem, path.suffix

    if " " in stem:
        separator = " "
    else:
        separator = "_"

    counter = 0
    while True:
        counter += 1
        yield parent / f"{stem}{separator}{counter}{suffix}"
|  | ||||
|  | ||||
class Deduplicator:
    """
    Hands out unique paths, optionally made safe for Windows file systems.

    Every path returned by ``mark`` is remembered together with its parent
    directories. Marking a path that is already known yields a numbered
    variant of it instead.
    """

    FORBIDDEN_CHARS = '<>:"/\\|?*'
    FORBIDDEN_NAMES = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
    }

    # Maps every forbidden character to "_" in a single pass.
    _FORBIDDEN_TABLE = str.maketrans(FORBIDDEN_CHARS, "_" * len(FORBIDDEN_CHARS))

    def __init__(self, windows_paths: bool) -> None:
        self._windows_paths = windows_paths

        self._known: Set[PurePath] = set()

    def _add(self, path: PurePath) -> None:
        """Remember ``path`` and all of its parents as taken."""
        self._known.add(path)

        # The last parent is just "."
        for parent in list(path.parents)[:-1]:
            self._known.add(parent)

    def _fixup_element(self, name: str) -> str:
        """Return ``name`` altered so it is a valid Windows path element."""
        # For historical reasons, windows paths have some odd restrictions that
        # we're trying to avoid. See:
        # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file

        name = name.translate(self._FORBIDDEN_TABLE)

        # Reserved device names are case-insensitive and stay reserved when
        # followed by any extension ("con.txt" and "NUL.tar.gz" are both
        # reserved), so compare everything before the first dot.
        base, dot, rest = name.partition(".")
        if base.upper() in self.FORBIDDEN_NAMES:
            name = f"{base}_{dot}{rest}"

        # Names must not end in a space or a period.
        if name.endswith((" ", ".")):
            name += "_"

        return name

    def _fixup_for_windows(self, path: PurePath) -> PurePath:
        """Fix up every element of ``path`` and log if anything changed."""
        new_path = PurePath(*[self._fixup_element(elem) for elem in path.parts])
        if new_path != path:
            log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility")
        return new_path

    def fixup_path(self, path: PurePath) -> PurePath:
        """Fixes up the path for windows, if enabled. Returns the path unchanged otherwise."""
        if self._windows_paths:
            return self._fixup_for_windows(path)
        return path

    def mark(self, path: PurePath) -> PurePath:
        """
        Claim ``path`` (after the optional Windows fixup) and return it.

        If the path is already taken, the first unused numbered variant is
        claimed and returned instead.
        """
        if self._windows_paths:
            path = self._fixup_for_windows(path)

        if path not in self._known:
            self._add(path)
            return path

        log.explain(f"Path {fmt_path(path)} is already taken, finding a new name")

        for variant in name_variants(path):
            if variant in self._known:
                log.explain(f"Path {fmt_path(variant)} is taken as well")
                continue

            log.explain(f"Found unused path {fmt_path(variant)}")
            self._add(variant)
            return variant

        # The "name_variants" iterator returns infinitely many paths
        raise RuntimeError("Unreachable")
							
								
								
									
										169
									
								
								PFERD/diva.py
									
									
									
									
									
								
							
							
						
						
									
										169
									
								
								PFERD/diva.py
									
									
									
									
									
								
							| @@ -1,169 +0,0 @@ | ||||
| """ | ||||
| Utility functions and a scraper/downloader for the KIT DIVA portal. | ||||
| """ | ||||
| import logging | ||||
| import re | ||||
| from dataclasses import dataclass | ||||
| from pathlib import Path | ||||
| from typing import Any, Callable, List, Optional | ||||
|  | ||||
| import requests | ||||
|  | ||||
| from .errors import FatalException | ||||
| from .logging import PrettyLogger | ||||
| from .organizer import Organizer | ||||
| from .tmp_dir import TmpDir | ||||
| from .transform import Transformable | ||||
| from .utils import stream_to_path | ||||
|  | ||||
| LOGGER = logging.getLogger(__name__) | ||||
| PRETTY = PrettyLogger(LOGGER) | ||||
|  | ||||
|  | ||||
@dataclass
class DivaDownloadInfo(Transformable):
    """
    Describes one DIVA video to download.
    """
    # Location the video is fetched from.
    url: str
|  | ||||
|  | ||||
| DivaDownloadStrategy = Callable[[Organizer, DivaDownloadInfo], bool] | ||||
|  | ||||
|  | ||||
def diva_download_new(organizer: Organizer, info: DivaDownloadInfo) -> bool:
    """
    Download strategy that accepts a video only if no local file exists yet.

    Returns True if the resolved target path does not exist. Otherwise the
    file is reported as ignored and False is returned.
    """
    already_present = organizer.resolve(info.path).exists()
    if already_present:
        PRETTY.ignored_file(info.path, "local file exists")
        return False
    return True
|  | ||||
|  | ||||
class DivaPlaylistCrawler:
    # pylint: disable=too-few-public-methods
    """
    A crawler for DIVA playlists.
    """

    _PLAYLIST_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/detail/"
    _COLLECTION_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json"

    def __init__(self, playlist_id: str):
        self._id = playlist_id

    @classmethod
    def fetch_id(cls, playlist_link: str) -> str:
        """
        Fetches the ID for a playlist, given the base link
        (e.g. https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271).

        Raises a FatalException if the id can not be resolved.
        """
        match = re.match(r".+#/details/(.+)", playlist_link)
        if match is None:
            raise FatalException(
                "DIVA: Invalid playlist link format, could not extract details."
            )
        base_name = match.group(1)

        response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json")

        if response.status_code != 200:
            raise FatalException(
                f"DIVA: Got non-200 status code ({response.status_code}) "
                f"when requesting {response.url!r}!"
            )

        body = response.json()

        if body["error"]:
            raise FatalException(f"DIVA: Server returned error {body['error']!r}.")

        return body["result"]["collection"]["id"]

    def crawl(self) -> List[DivaDownloadInfo]:
        """
        Crawls the playlist given in the constructor.

        Returns one download info per video that has a title, a collection
        title and an mp4 url. Incomplete videos are skipped with a warning.
        Raises a FatalException if the server answers with a non-200 status or
        reports an error.
        """
        response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id})
        if response.status_code != 200:
            raise FatalException(f"Server returned status {response.status_code}.")

        body = response.json()

        if body["error"]:
            raise FatalException(f"Server returned error {body['error']!r}.")

        result = body["result"]

        # Only a single page is requested, so anything beyond it is missing.
        if result["resultCount"] > result["pageSize"]:
            PRETTY.warning("Did not receive all results, some will be missing")

        download_infos: List[DivaDownloadInfo] = []

        for video in result["resultList"]:
            title = video["title"]
            collection_title = self._follow_path(["collection", "title"], video)
            url = self._follow_path(
                ["resourceList", "derivateList", "mp4", "url"],
                video
            )

            if url and collection_title and title:
                path = Path(collection_title, title + ".mp4")
                download_infos.append(DivaDownloadInfo(path, url))
            else:
                PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}")

        return download_infos

    @staticmethod
    def _follow_path(path: List[str], obj: Any) -> Optional[Any]:
        """
        Follows a property path through an object, bailing at the first None
        or missing key.

        Returns the value at the end of the path, or None if it is unreachable.
        """
        current = obj
        for path_step in path:
            # An intermediate None (e.g. a JSON null) must end the lookup
            # instead of raising a TypeError on the membership test.
            if current is None or path_step not in current:
                return None
            current = current[path_step]
        return current
|  | ||||
|  | ||||
class DivaDownloader:
    """
    Downloads DIVA videos through a shared requests session.
    """

    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy):
        self._session = requests.session()
        self._strategy = strategy
        self._organizer = organizer
        self._tmp_dir = tmp_dir

    def download_all(self, infos: List[DivaDownloadInfo]) -> None:
        """
        Download the given files sequentially, in list order.
        """
        for entry in infos:
            self.download(entry)

    def download(self, info: DivaDownloadInfo) -> None:
        """
        Download one file if the strategy accepts it.

        A rejected file is only marked in the organizer. A non-200 response is
        reported as a warning and nothing is stored.
        """
        accepted = self._strategy(self._organizer, info)
        if not accepted:
            self._organizer.mark(info.path)
            return

        with self._session.get(info.url, stream=True) as response:
            if response.status_code != 200:
                PRETTY.warning(f"Could not download file, got response {response.status_code}")
                return

            # Stream into a temporary file first, then hand it to the organizer.
            tmp_file = self._tmp_dir.new_path()
            stream_to_path(response, tmp_file, info.path.name)
            self._organizer.accept_file(tmp_file, info.path)
| @@ -1,69 +0,0 @@ | ||||
| """ | ||||
| Provides a summary that keeps track of new modified or deleted files. | ||||
| """ | ||||
| from pathlib import Path | ||||
| from typing import List | ||||
|  | ||||
|  | ||||
class DownloadSummary:
    """
    Keeps track of all new, modified or deleted files and provides a summary.
    """

    def __init__(self) -> None:
        self._new_files: List[Path] = []
        self._modified_files: List[Path] = []
        self._deleted_files: List[Path] = []

    @property
    def new_files(self) -> List[Path]:
        """
        Returns a copy of the list of new files.
        """
        return self._new_files.copy()

    @property
    def modified_files(self) -> List[Path]:
        """
        Returns a copy of the list of modified files.
        """
        return self._modified_files.copy()

    @property
    def deleted_files(self) -> List[Path]:
        """
        Returns a copy of the list of deleted files.
        """
        return self._deleted_files.copy()

    @staticmethod
    def _merged(ours: List[Path], theirs: List[Path]) -> List[Path]:
        # Concatenate and drop duplicates while keeping first-seen order, so
        # the merged summary is deterministic (a set would scramble the order).
        return list(dict.fromkeys(ours + theirs))

    def merge(self, summary: 'DownloadSummary') -> None:
        """
        Merges ourselves with the passed summary. Modifies this object, but not the passed one.

        Duplicate paths are removed; the remaining paths keep the order in
        which they were first registered (ours first, then the passed ones).
        """
        self._new_files = self._merged(self._new_files, summary.new_files)
        self._modified_files = self._merged(self._modified_files, summary.modified_files)
        self._deleted_files = self._merged(self._deleted_files, summary.deleted_files)

    def add_deleted_file(self, path: Path) -> None:
        """
        Registers a file as deleted.
        """
        self._deleted_files.append(path)

    def add_modified_file(self, path: Path) -> None:
        """
        Registers a file as changed.
        """
        self._modified_files.append(path)

    def add_new_file(self, path: Path) -> None:
        """
        Registers a file as new.
        """
        self._new_files.append(path)

    def has_updates(self) -> bool:
        """
        Returns whether this summary has any updates.
        """
        return bool(self._new_files or self._modified_files or self._deleted_files)
| @@ -1,72 +0,0 @@ | ||||
| """ | ||||
| General downloaders useful in many situations | ||||
| """ | ||||
|  | ||||
| from dataclasses import dataclass, field | ||||
| from typing import Any, Dict, List, Optional | ||||
|  | ||||
| import requests | ||||
| import requests.auth | ||||
|  | ||||
| from .organizer import Organizer | ||||
| from .tmp_dir import TmpDir | ||||
| from .transform import Transformable | ||||
| from .utils import stream_to_path | ||||
|  | ||||
|  | ||||
@dataclass
class HttpDownloadInfo(Transformable):
    """
    Describes one file to fetch over HTTP.
    """

    # Location the file is fetched from.
    url: str
    # Query parameters sent along with the request; empty by default.
    parameters: Dict[str, Any] = field(default_factory=dict)
|  | ||||
|  | ||||
class HttpDownloader:
    """Downloads files over HTTP, optionally using HTTP basic auth."""

    def __init__(
            self,
            tmp_dir: TmpDir,
            organizer: Organizer,
            username: Optional[str],
            password: Optional[str],
    ):
        """Store the collaborators and credentials, then build the session."""
        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._username = username
        self._password = password
        self._session = self._build_session()

    def _build_session(self) -> requests.Session:
        # Basic auth is only configured when both credentials are non-empty.
        new_session = requests.Session()
        have_credentials = self._username and self._password
        if have_credentials:
            new_session.auth = requests.auth.HTTPBasicAuth(
                self._username, self._password
            )
        return new_session

    def download_all(self, infos: List[HttpDownloadInfo]) -> None:
        """
        Download the given files sequentially, in list order.
        """

        for entry in infos:
            self.download(entry)

    def download(self, info: HttpDownloadInfo) -> None:
        """
        Download one file and hand it to the organizer.

        Raises an Exception if the response status is not 200.
        """

        with self._session.get(info.url, params=info.parameters, stream=True) as response:
            if response.status_code != 200:
                # TODO use proper exception
                raise Exception(f"Could not download file, got response {response.status_code}")

            # Stream into a temporary file first, then hand it to the organizer.
            tmp_file = self._tmp_dir.new_path()
            stream_to_path(response, tmp_file, info.path.name)
            self._organizer.accept_file(tmp_file, info.path)
| @@ -1,39 +0,0 @@ | ||||
| """ | ||||
| An error logging decorator. | ||||
| """ | ||||
|  | ||||
| import logging | ||||
| from typing import Any, Callable, TypeVar, cast | ||||
|  | ||||
| from rich.console import Console | ||||
|  | ||||
| from .logging import PrettyLogger | ||||
|  | ||||
| LOGGER = logging.getLogger(__name__) | ||||
| PRETTY = PrettyLogger(LOGGER) | ||||
|  | ||||
|  | ||||
class FatalException(Exception):
    """
    Signals an error that cannot be recovered from.
    """
|  | ||||
|  | ||||
TFun = TypeVar('TFun', bound=Callable[..., Any])


def swallow_and_print_errors(function: TFun) -> TFun:
    """
    Decorates a function, swallows all errors, logs them and returns none if one occurred.

    A FatalException is logged as a plain error message; any other exception
    is printed with its traceback. The wrapper keeps the name and docstring of
    the decorated function.
    """
    # Imported here so this block needs no new module-level import.
    from functools import wraps

    @wraps(function)
    def inner(*args: Any, **kwargs: Any) -> Any:
        # pylint: disable=broad-except
        try:
            return function(*args, **kwargs)
        except FatalException as error:
            PRETTY.error(str(error))
            return None
        except Exception:
            Console().print_exception()
            return None
    return cast(TFun, inner)
| @@ -1,10 +0,0 @@ | ||||
| """ | ||||
| Synchronizing files from ILIAS instances (https://www.ilias.de/). | ||||
| """ | ||||
|  | ||||
| from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator | ||||
| from .crawler import (IliasCrawler, IliasCrawlerEntry, IliasDirectoryFilter, | ||||
|                       IliasElementType) | ||||
| from .downloader import (IliasDownloader, IliasDownloadInfo, | ||||
|                          IliasDownloadStrategy, download_everything, | ||||
|                          download_modified_or_new) | ||||
| @@ -1,131 +0,0 @@ | ||||
| """ | ||||
| Authenticators that can obtain proper ILIAS session cookies. | ||||
| """ | ||||
|  | ||||
| import abc | ||||
| import logging | ||||
| from typing import Optional | ||||
|  | ||||
| import bs4 | ||||
| import requests | ||||
|  | ||||
| from ..authenticators import TfaAuthenticator, UserPassAuthenticator | ||||
| from ..utils import soupify | ||||
|  | ||||
| LOGGER = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
class IliasAuthenticator(abc.ABC):
    # pylint: disable=too-few-public-methods

    """
    Interface for objects that log an existing requests session into an ILIAS
    account.
    """

    @abc.abstractmethod
    def authenticate(self, sess: requests.Session) -> None:
        """
        Log the given requests session into this authenticator's ILIAS account.
        """
|  | ||||
|  | ||||
class KitShibbolethAuthenticator(IliasAuthenticator):
    # pylint: disable=too-few-public-methods

    """
    Authenticate via KIT's shibboleth system.
    """

    def __init__(self, username: Optional[str] = None, password: Optional[str] = None) -> None:
        """
        Create the credential and TFA token providers used during login.

        Arguments:
            username {Optional[str]} -- handed to the UserPassAuthenticator as-is
            password {Optional[str]} -- handed to the UserPassAuthenticator as-is
        """
        self._auth = UserPassAuthenticator("KIT ILIAS Shibboleth", username, password)
        self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth")

    def authenticate(self, sess: requests.Session) -> None:
        """
        Performs the ILIAS Shibboleth authentication dance and saves the login
        cookies it receives.

        This function should only be called whenever it is detected that you're
        not logged in. The cookies obtained should be good for a few minutes,
        maybe even an hour or two.

        Raises:
            RuntimeError: if a page of the identity provider does not contain
                the login or token form this method needs to continue
        """

        # Equivalent: Click on "Mit KIT-Account anmelden" button in
        # https://ilias.studium.kit.edu/login.php
        LOGGER.debug("Begin authentication process with ILIAS")
        url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
        data = {
            "sendLogin": "1",
            "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
            "target": "/shib_login.php",
            "home_organization_selection": "Mit KIT-Account anmelden",
        }
        soup = soupify(sess.post(url, data=data))

        # Attempt to login using credentials, if necessary
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
            action = self._find_form_action(
                soup, {"class": "full content", "method": "post"}
            )

            # Equivalent: Enter credentials in
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
            LOGGER.debug("Attempt to log in to Shibboleth using credentials")
            url = "https://idp.scc.kit.edu" + action
            data = {
                "_eventId_proceed": "",
                "j_username": self._auth.username,
                "j_password": self._auth.password,
            }
            soup = soupify(sess.post(url, data=data))

            if self._tfa_required(soup):
                soup = self._authenticate_tfa(sess, soup)

            if not self._login_successful(soup):
                print("Incorrect credentials.")
                self._auth.invalidate_credentials()

        # Equivalent: Being redirected via JS automatically
        # (or clicking "Continue" if you have JS disabled)
        LOGGER.debug("Redirect back to ILIAS with login information")
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
        data = {  # using the info obtained in the while loop above
            "RelayState": relay_state["value"],
            "SAMLResponse": saml_response["value"],
        }
        sess.post(url, data=data)

    def _authenticate_tfa(
            self,
            session: requests.Session,
            soup: bs4.BeautifulSoup
    ) -> bs4.BeautifulSoup:
        """
        Submit a TFA token to the form found in the given page and return the
        page the identity provider answers with.

        Raises:
            RuntimeError: if the page does not contain a token form
        """
        # Searching the form here so that this fails before asking for
        # the token rather than after asking.
        action = self._find_form_action(soup, {"method": "post"})

        # Equivalent: Enter token in
        # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
        LOGGER.debug("Attempt to log in to Shibboleth with TFA token")
        url = "https://idp.scc.kit.edu" + action
        data = {
            "_eventId_proceed": "",
            "j_tokenNumber": self._tfa_auth.get_token()
        }
        return soupify(session.post(url, data=data))

    @staticmethod
    def _find_form_action(soup: bs4.BeautifulSoup, attrs: dict) -> str:
        """
        Return the "action" of the first form matching the given attributes.

        Raises:
            RuntimeError: if no such form exists or it has no action. Without
                this check the caller would fail with an opaque TypeError
                while subscripting None.
        """
        form = soup.find("form", attrs)
        if form is None or form.get("action") is None:
            raise RuntimeError(
                "Could not find the expected form on the Shibboleth page"
            )
        return form["action"]

    @staticmethod
    def _login_successful(soup: bs4.BeautifulSoup) -> bool:
        """
        A login counts as successful once the page carries both the RelayState
        and the SAMLResponse input that are posted back to ILIAS.
        """
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        return relay_state is not None and saml_response is not None

    @staticmethod
    def _tfa_required(soup: bs4.BeautifulSoup) -> bool:
        """
        Returns whether the page contains the TFA token input field.
        """
        return soup.find(id="j_tokenNumber") is not None
| @@ -1,651 +0,0 @@ | ||||
| """ | ||||
| Contains an ILIAS crawler alongside helper functions. | ||||
| """ | ||||
|  | ||||
| import datetime | ||||
| import json | ||||
| import logging | ||||
| import re | ||||
| from enum import Enum | ||||
| from pathlib import Path | ||||
| from typing import Any, Callable, Dict, List, Optional, Union | ||||
| from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit, | ||||
|                           urlunsplit) | ||||
|  | ||||
| import bs4 | ||||
| import requests | ||||
|  | ||||
| from ..errors import FatalException | ||||
| from ..logging import PrettyLogger | ||||
| from ..utils import soupify | ||||
| from .authenticators import IliasAuthenticator | ||||
| from .date_demangler import demangle_date | ||||
| from .downloader import IliasDownloadInfo | ||||
|  | ||||
| LOGGER = logging.getLogger(__name__) | ||||
| PRETTY = PrettyLogger(LOGGER) | ||||
|  | ||||
|  | ||||
class IliasElementType(Enum):
    """
    Enumerates the kinds of elements the crawler tells apart on ILIAS pages.
    """
    REGULAR_FOLDER = "REGULAR_FOLDER"
    VIDEO_FOLDER = "VIDEO_FOLDER"
    EXERCISE_FOLDER = "EXERCISE_FOLDER"
    REGULAR_FILE = "REGULAR_FILE"
    VIDEO_FILE = "VIDEO_FILE"
    FORUM = "FORUM"
    EXTERNAL_LINK = "EXTERNAL_LINK"

    def is_folder(self) -> bool:
        """
        Tells whether this type is one of the folder kinds. The decision is
        made purely on the member name.
        """
        member_name: str = self.name
        return member_name.endswith("_FOLDER")
|  | ||||
|  | ||||
| IliasDirectoryFilter = Callable[[Path, IliasElementType], bool] | ||||
|  | ||||
|  | ||||
class IliasCrawlerEntry:
    # pylint: disable=too-few-public-methods
    """
    Internal record of one element found while crawling: where it belongs
    locally, how to obtain its url, what kind of element it is and when it was
    last modified (if known).
    """

    def __init__(
            self,
            path: Path,
            url: Union[str, Callable[[], Optional[str]]],
            entry_type: IliasElementType,
            modification_date: Optional[datetime.datetime]
    ):
        self.path = path
        self.entry_type = entry_type
        self.modification_date = modification_date

        # The url is always stored as a zero-argument callable, so a plain
        # string is wrapped while a callable is taken over unchanged.
        if not isinstance(url, str):
            self.url: Callable[[], Optional[str]] = url
        else:
            plain_url: str = url

            def constant_url() -> Optional[str]:
                return plain_url

            self.url = constant_url

    def to_download_info(self) -> Optional[IliasDownloadInfo]:
        """
        Turns this entry into an IliasDownloadInfo.
        Only the *File* types can be converted, everything else yields None.
        """
        downloadable_types = [IliasElementType.REGULAR_FILE, IliasElementType.VIDEO_FILE]
        if self.entry_type not in downloadable_types:
            return None
        return IliasDownloadInfo(self.path, self.url, self.modification_date)
|  | ||||
|  | ||||
| class IliasCrawler: | ||||
|     # pylint: disable=too-few-public-methods | ||||
|  | ||||
|     """ | ||||
|     A crawler for ILIAS. | ||||
|     """ | ||||
|  | ||||
|     # pylint: disable=too-many-arguments | ||||
|     def __init__( | ||||
|             self, | ||||
|             base_url: str, | ||||
|             session: requests.Session, | ||||
|             authenticator: IliasAuthenticator, | ||||
|             dir_filter: IliasDirectoryFilter | ||||
|     ): | ||||
|         """ | ||||
|         Create a new ILIAS crawler. | ||||
|         """ | ||||
|  | ||||
|         self._base_url = base_url | ||||
|         self._session = session | ||||
|         self._authenticator = authenticator | ||||
|         self.dir_filter = dir_filter | ||||
|  | ||||
|     @staticmethod | ||||
|     def _url_set_query_param(url: str, param: str, value: str) -> str: | ||||
|         """ | ||||
|         Set a query parameter in an url, overwriting existing ones with the same name. | ||||
|         """ | ||||
|         scheme, netloc, path, query, fragment = urlsplit(url) | ||||
|         query_parameters = parse_qs(query) | ||||
|         query_parameters[param] = [value] | ||||
|         new_query_string = urlencode(query_parameters, doseq=True) | ||||
|  | ||||
|         return urlunsplit((scheme, netloc, path, new_query_string, fragment)) | ||||
|  | ||||
|     def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]: | ||||
|         """ | ||||
|         Crawls a given url *and all reachable elements in it*. | ||||
|  | ||||
|         Args: | ||||
|             url {str} -- the *full* url to crawl | ||||
|         """ | ||||
|         start_entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), url) | ||||
|         return self._iterate_entries_to_download_infos(start_entries) | ||||
|  | ||||
|     def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]: | ||||
|         """ | ||||
|         Starts the crawl process for a course, yielding a list of elements to (potentially) | ||||
|         download. | ||||
|  | ||||
|         Arguments: | ||||
|             course_id {str} -- the course id | ||||
|  | ||||
|         Raises: | ||||
|             FatalException: if an unrecoverable error occurs or the course id is not valid | ||||
|         """ | ||||
|         # Start crawling at the given course | ||||
|         root_url = self._url_set_query_param( | ||||
|             self._base_url + "/goto.php", "target", f"crs_{course_id}" | ||||
|         ) | ||||
|  | ||||
|         if not self._is_course_id_valid(root_url, course_id): | ||||
|             raise FatalException( | ||||
|                 "Invalid course id? I didn't find anything looking like a course!" | ||||
|             ) | ||||
|  | ||||
|         # And treat it as a folder | ||||
|         entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), root_url) | ||||
|         return self._iterate_entries_to_download_infos(entries) | ||||
|  | ||||
|     def _is_course_id_valid(self, root_url: str, course_id: str) -> bool: | ||||
|         response: requests.Response = self._session.get(root_url) | ||||
|         # We were redirected ==> Non-existant ID | ||||
|         if course_id not in response.url: | ||||
|             return False | ||||
|  | ||||
|         link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link") | ||||
|         if not link_element: | ||||
|             return False | ||||
|         # It wasn't a course but a category list, forum, etc. | ||||
|         return "crs_" in link_element.get("value") | ||||
|  | ||||
|     def find_course_name(self, course_id: str) -> Optional[str]: | ||||
|         """ | ||||
|         Returns the name of a given course. None if it is not a valid course | ||||
|         or it could not be found. | ||||
|         """ | ||||
|         course_url = self._url_set_query_param( | ||||
|             self._base_url + "/goto.php", "target", f"crs_{course_id}" | ||||
|         ) | ||||
|         return self.find_element_name(course_url) | ||||
|  | ||||
|     def find_element_name(self, url: str) -> Optional[str]: | ||||
|         """ | ||||
|         Returns the name of the element at the given URL, if it can find one. | ||||
|         """ | ||||
|         focus_element: bs4.Tag = self._get_page(url, {}).find(id="il_mhead_t_focus") | ||||
|         if not focus_element: | ||||
|             return None | ||||
|         return focus_element.text | ||||
|  | ||||
|     def crawl_personal_desktop(self) -> List[IliasDownloadInfo]: | ||||
|         """ | ||||
|         Crawls the ILIAS personal desktop (and every subelements that can be reached from there). | ||||
|  | ||||
|         Raises: | ||||
|             FatalException: if an unrecoverable error occurs | ||||
|         """ | ||||
|         entries: List[IliasCrawlerEntry] = self._crawl_folder( | ||||
|             Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI" | ||||
|         ) | ||||
|         return self._iterate_entries_to_download_infos(entries) | ||||
|  | ||||
|     def _iterate_entries_to_download_infos( | ||||
|             self, | ||||
|             entries: List[IliasCrawlerEntry] | ||||
|     ) -> List[IliasDownloadInfo]: | ||||
|         result: List[IliasDownloadInfo] = [] | ||||
|         entries_to_process: List[IliasCrawlerEntry] = entries.copy() | ||||
|         while len(entries_to_process) > 0: | ||||
|             entry = entries_to_process.pop() | ||||
|  | ||||
|             if entry.entry_type == IliasElementType.EXTERNAL_LINK: | ||||
|                 PRETTY.not_searching(entry.path, "external link") | ||||
|                 continue | ||||
|             if entry.entry_type == IliasElementType.FORUM: | ||||
|                 PRETTY.not_searching(entry.path, "forum") | ||||
|                 continue | ||||
|  | ||||
|             if entry.entry_type.is_folder() and not self.dir_filter(entry.path, entry.entry_type): | ||||
|                 PRETTY.not_searching(entry.path, "user filter") | ||||
|                 continue | ||||
|  | ||||
|             download_info = entry.to_download_info() | ||||
|             if download_info is not None: | ||||
|                 result.append(download_info) | ||||
|                 continue | ||||
|  | ||||
|             url = entry.url() | ||||
|  | ||||
|             if url is None: | ||||
|                 PRETTY.warning(f"Could not find url for {str(entry.path)!r}, skipping it") | ||||
|                 continue | ||||
|  | ||||
|             PRETTY.searching(entry.path) | ||||
|  | ||||
|             if entry.entry_type == IliasElementType.EXERCISE_FOLDER: | ||||
|                 entries_to_process += self._crawl_exercises(entry.path, url) | ||||
|                 continue | ||||
|             if entry.entry_type == IliasElementType.REGULAR_FOLDER: | ||||
|                 entries_to_process += self._crawl_folder(entry.path, url) | ||||
|                 continue | ||||
|             if entry.entry_type == IliasElementType.VIDEO_FOLDER: | ||||
|                 entries_to_process += self._crawl_video_directory(entry.path, url) | ||||
|                 continue | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]: | ||||
|         """ | ||||
|         Crawl all files in a folder-like element. | ||||
|         """ | ||||
|         soup = self._get_page(url, {}) | ||||
|  | ||||
|         if soup.find(id="headerimage"): | ||||
|             element: bs4.Tag = soup.find(id="headerimage") | ||||
|             if "opencast" in element.attrs["src"].lower(): | ||||
|                 PRETTY.warning(f"Switched to crawling a video at {folder_path}") | ||||
|                 if not self.dir_filter(folder_path, IliasElementType.VIDEO_FOLDER): | ||||
|                     PRETTY.not_searching(folder_path, "user filter") | ||||
|                     return [] | ||||
|                 return self._crawl_video_directory(folder_path, url) | ||||
|  | ||||
|         result: List[IliasCrawlerEntry] = [] | ||||
|  | ||||
|         # Fetch all links and throw them to the general interpreter | ||||
|         links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle") | ||||
|         for link in links: | ||||
|             abs_url = self._abs_url_from_link(link) | ||||
|             element_path = Path(folder_path, link.getText().strip()) | ||||
|             element_type = self._find_type_from_link(element_path, link, abs_url) | ||||
|  | ||||
|             if element_type == IliasElementType.REGULAR_FILE: | ||||
|                 result += self._crawl_file(folder_path, link, abs_url) | ||||
|             elif element_type is not None: | ||||
|                 result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)] | ||||
|             else: | ||||
|                 PRETTY.warning(f"Found element without a type at {str(element_path)!r}") | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     def _abs_url_from_link(self, link_tag: bs4.Tag) -> str: | ||||
|         """ | ||||
|         Create an absolute url from an <a> tag. | ||||
|         """ | ||||
|         return urljoin(self._base_url, link_tag.get("href")) | ||||
|  | ||||
|     @staticmethod | ||||
|     def _find_type_from_link( | ||||
|             path: Path, | ||||
|             link_element: bs4.Tag, | ||||
|             url: str | ||||
|     ) -> Optional[IliasElementType]: | ||||
|         """ | ||||
|         Decides which sub crawler to use for a given top level element. | ||||
|         """ | ||||
|         parsed_url = urlparse(url) | ||||
|         LOGGER.debug("Parsed url: %r", parsed_url) | ||||
|  | ||||
|         # file URLs contain "target=file" | ||||
|         if "target=file_" in parsed_url.query: | ||||
|             return IliasElementType.REGULAR_FILE | ||||
|  | ||||
|         # Skip forums | ||||
|         if "cmd=showThreads" in parsed_url.query: | ||||
|             return IliasElementType.FORUM | ||||
|  | ||||
|         # Everything with a ref_id can *probably* be opened to reveal nested things | ||||
|         # video groups, directories, exercises, etc | ||||
|         if "ref_id=" in parsed_url.query: | ||||
|             return IliasCrawler._find_type_from_folder_like(link_element, url) | ||||
|  | ||||
|         PRETTY.warning( | ||||
|             "Got unknown element type in switch. I am not sure what horror I found on the" | ||||
|             f" ILIAS page. The element was at {str(path)!r} and it is {link_element!r})" | ||||
|         ) | ||||
|         return None | ||||
|  | ||||
|     @staticmethod | ||||
|     def _find_type_from_folder_like(link_element: bs4.Tag, url: str) -> Optional[IliasElementType]: | ||||
|         """ | ||||
|         Try crawling something that looks like a folder. | ||||
|         """ | ||||
|         # pylint: disable=too-many-return-statements | ||||
|  | ||||
|         # We look for the outer div of our inner link, to find information around it | ||||
|         # (mostly the icon) | ||||
|         for parent in link_element.parents: | ||||
|             if "ilContainerListItemOuter" in parent["class"]: | ||||
|                 found_parent = parent | ||||
|                 break | ||||
|  | ||||
|         if found_parent is None: | ||||
|             PRETTY.warning(f"Could not find element icon for {url!r}") | ||||
|             return None | ||||
|  | ||||
|         # Find the small descriptive icon to figure out the type | ||||
|         img_tag: Optional[bs4.Tag] = found_parent.select_one("img.ilListItemIcon") | ||||
|  | ||||
|         if img_tag is None: | ||||
|             PRETTY.warning(f"Could not find image tag for {url!r}") | ||||
|             return None | ||||
|  | ||||
|         if "opencast" in str(img_tag["alt"]).lower(): | ||||
|             return IliasElementType.VIDEO_FOLDER | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("icon_exc.svg"): | ||||
|             return IliasElementType.EXERCISE_FOLDER | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("icon_webr.svg"): | ||||
|             return IliasElementType.EXTERNAL_LINK | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("frm.svg"): | ||||
|             return IliasElementType.FORUM | ||||
|  | ||||
|         return IliasElementType.REGULAR_FOLDER | ||||
|  | ||||
|     @staticmethod | ||||
|     def _crawl_file(path: Path, link_element: bs4.Tag, url: str) -> List[IliasCrawlerEntry]: | ||||
|         """ | ||||
|         Crawls a file. | ||||
|         """ | ||||
|         # Files have a list of properties (type, modification date, size, etc.) | ||||
|         # In a series of divs. | ||||
|         # Find the parent containing all those divs, so we can filter our what we need | ||||
|         properties_parent: bs4.Tag = link_element.findParent( | ||||
|             "div", {"class": lambda x: "il_ContainerListItem" in x} | ||||
|         ).select_one(".il_ItemProperties") | ||||
|         # The first one is always the filetype | ||||
|         file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() | ||||
|  | ||||
|         # The rest does not have a stable order. Grab the whole text and reg-ex the date | ||||
|         # out of it | ||||
|         all_properties_text = properties_parent.getText().strip() | ||||
|         modification_date_match = re.search( | ||||
|             r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)", | ||||
|             all_properties_text | ||||
|         ) | ||||
|         if modification_date_match is None: | ||||
|             modification_date = None | ||||
|             PRETTY.warning(f"Could not extract start date from {all_properties_text!r}") | ||||
|         else: | ||||
|             modification_date_str = modification_date_match.group(1) | ||||
|             modification_date = demangle_date(modification_date_str) | ||||
|  | ||||
|         # Grab the name from the link text | ||||
|         name = link_element.getText() | ||||
|         full_path = Path(path, name + "." + file_type) | ||||
|  | ||||
|         return [ | ||||
|             IliasCrawlerEntry(full_path, url, IliasElementType.REGULAR_FILE, modification_date) | ||||
|         ] | ||||
|  | ||||
|     def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasCrawlerEntry]: | ||||
|         """ | ||||
|         Crawl the video overview site. | ||||
|         """ | ||||
|         initial_soup = self._get_page(url, {}) | ||||
|  | ||||
|         # The page is actually emtpy but contains a much needed token in the link below. | ||||
|         # That token can be used to fetch the *actual* video listing | ||||
|         content_link: bs4.Tag = initial_soup.select_one("#tab_series a") | ||||
|         # Fetch the actual video listing. The given parameters return all videos (max 800) | ||||
|         # in a standalone html page | ||||
|         video_list_soup = self._get_page( | ||||
|             self._abs_url_from_link(content_link), | ||||
|             {"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} | ||||
|         ) | ||||
|  | ||||
|         # If we find a page selected, we probably need to respect pagination | ||||
|         if self._is_paginated_video_page(video_list_soup): | ||||
|             second_stage_url = self._abs_url_from_link(content_link) | ||||
|  | ||||
|             return self._crawl_paginated_video_directory( | ||||
|                 video_dir_path, video_list_soup, second_stage_url | ||||
|             ) | ||||
|  | ||||
|         return self._crawl_video_directory_second_stage(video_dir_path, video_list_soup) | ||||
|  | ||||
|     @staticmethod | ||||
|     def _is_paginated_video_page(soup: bs4.BeautifulSoup) -> bool: | ||||
|         return soup.find(id=re.compile(r"tab_page_sel.+")) is not None | ||||
|  | ||||
|     def _crawl_paginated_video_directory( | ||||
|             self, | ||||
|             video_dir_path: Path, | ||||
|             paged_video_list_soup: bs4.BeautifulSoup, | ||||
|             second_stage_url: str | ||||
|     ) -> List[IliasCrawlerEntry]: | ||||
|         LOGGER.info("Found paginated video page, trying 800 elements") | ||||
|  | ||||
|         # Try to find the table id. This can be used to build the query parameter indicating | ||||
|         # you want 800 elements | ||||
|  | ||||
|         table_element: bs4.Tag = paged_video_list_soup.find( | ||||
|             name="table", id=re.compile(r"tbl_xoct_.+") | ||||
|         ) | ||||
|         if table_element is None: | ||||
|             PRETTY.warning( | ||||
|                 "Could not increase elements per page (table not found)." | ||||
|                 " Some might not be crawled!" | ||||
|             ) | ||||
|             return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup) | ||||
|  | ||||
|         match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) | ||||
|         if match is None: | ||||
|             PRETTY.warning( | ||||
|                 "Could not increase elements per page (table id not found)." | ||||
|                 " Some might not be crawled!" | ||||
|             ) | ||||
|             return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup) | ||||
|         table_id = match.group(1) | ||||
|  | ||||
|         extended_video_page = self._get_page( | ||||
|             second_stage_url, | ||||
|             {f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} | ||||
|         ) | ||||
|  | ||||
|         if self._is_paginated_video_page(extended_video_page): | ||||
|             PRETTY.warning( | ||||
|                 "800 elements do not seem to be enough (or I failed to fetch that many)." | ||||
|                 " I will miss elements." | ||||
|             ) | ||||
|  | ||||
|         return self._crawl_video_directory_second_stage(video_dir_path, extended_video_page) | ||||
|  | ||||
|     def _crawl_video_directory_second_stage( | ||||
|             self, | ||||
|             video_dir_path: Path, | ||||
|             video_list_soup: bs4.BeautifulSoup | ||||
|     ) -> List[IliasCrawlerEntry]: | ||||
|         """ | ||||
|         Crawls the "second stage" video page. This page contains the actual video urls. | ||||
|         """ | ||||
|         direct_download_links: List[bs4.Tag] = video_list_soup.findAll( | ||||
|             name="a", text=re.compile(r"\s*Download\s*") | ||||
|         ) | ||||
|  | ||||
|         # Video start links are marked with an "Abspielen" link | ||||
|         video_links: List[bs4.Tag] = video_list_soup.findAll( | ||||
|             name="a", text=re.compile(r"\s*Abspielen\s*") | ||||
|         ) | ||||
|  | ||||
|         results: List[IliasCrawlerEntry] = [] | ||||
|  | ||||
|         # We can download everything directly! | ||||
|         # FIXME: Sadly the download button is currently broken, so never do that | ||||
|         if False and len(direct_download_links) == len(video_links): | ||||
|             for link in direct_download_links: | ||||
|                 results += self._crawl_single_video(video_dir_path, link, True) | ||||
|         else: | ||||
|             for link in video_links: | ||||
|                 results += self._crawl_single_video(video_dir_path, link, False) | ||||
|  | ||||
|         return results | ||||
|  | ||||
|     def _crawl_single_video( | ||||
|             self, | ||||
|             parent_path: Path, | ||||
|             link: bs4.Tag, | ||||
|             direct_download: bool | ||||
|     ) -> List[IliasCrawlerEntry]: | ||||
|         """ | ||||
|         Crawl a single video based on its "Abspielen" link from the video listing. | ||||
|         """ | ||||
|         # The link is part of a table with multiple columns, describing metadata. | ||||
|         # 6th child (1 indexed) is the modification time string | ||||
|         modification_string = link.parent.parent.parent.select_one( | ||||
|             "td.std:nth-child(6)" | ||||
|         ).getText().strip() | ||||
|         modification_time = datetime.datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") | ||||
|  | ||||
|         title = link.parent.parent.parent.select_one( | ||||
|             "td.std:nth-child(3)" | ||||
|         ).getText().strip() | ||||
|         title += ".mp4" | ||||
|  | ||||
|         video_path: Path = Path(parent_path, title) | ||||
|  | ||||
|         video_url = self._abs_url_from_link(link) | ||||
|  | ||||
|         # The video had a direct download button we can use instead | ||||
|         if direct_download: | ||||
|             LOGGER.debug("Using direct download for video %r", str(video_path)) | ||||
|             return [IliasCrawlerEntry( | ||||
|                 video_path, video_url, IliasElementType.VIDEO_FILE, modification_time | ||||
|             )] | ||||
|  | ||||
|         return [IliasCrawlerEntry( | ||||
|             video_path, | ||||
|             self._crawl_video_url_from_play_link(video_url), | ||||
|             IliasElementType.VIDEO_FILE, | ||||
|             modification_time | ||||
|         )] | ||||
|  | ||||
|     def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Optional[str]]: | ||||
|         def inner() -> Optional[str]: | ||||
|             # Fetch the actual video page. This is a small wrapper page initializing a javscript | ||||
|             # player. Sadly we can not execute that JS. The actual video stream url is nowhere | ||||
|             # on the page, but defined in a JS object inside a script tag, passed to the player | ||||
|             # library. | ||||
|             # We do the impossible and RegEx the stream JSON object out of the page's HTML source | ||||
|             video_page_soup = soupify(self._session.get(play_url)) | ||||
|             regex: re.Pattern = re.compile( | ||||
|                 r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE | ||||
|             ) | ||||
|             json_match = regex.search(str(video_page_soup)) | ||||
|  | ||||
|             if json_match is None: | ||||
|                 PRETTY.warning(f"Could not find json stream info for {play_url!r}") | ||||
|                 return None | ||||
|             json_str = json_match.group(1) | ||||
|  | ||||
|             # parse it | ||||
|             json_object = json.loads(json_str) | ||||
|             # and fetch the video url! | ||||
|             video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"] | ||||
|             return video_url | ||||
|         return inner | ||||
|  | ||||
|     def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasCrawlerEntry]: | ||||
|         """ | ||||
|         Crawl files offered for download in exercises. | ||||
|         """ | ||||
|         soup = self._get_page(url, {}) | ||||
|  | ||||
|         results: List[IliasCrawlerEntry] = [] | ||||
|  | ||||
|         # Each assignment is in an accordion container | ||||
|         assignment_containers: List[bs4.Tag] = soup.select(".il_VAccordionInnerContainer") | ||||
|  | ||||
|         for container in assignment_containers: | ||||
|             # Fetch the container name out of the header to use it in the path | ||||
|             container_name = container.select_one(".ilAssignmentHeader").getText().strip() | ||||
|             # Find all download links in the container (this will contain all the files) | ||||
|             files: List[bs4.Tag] = container.findAll( | ||||
|                 name="a", | ||||
|                 # download links contain the given command class | ||||
|                 attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, | ||||
|                 text="Download" | ||||
|             ) | ||||
|  | ||||
|             LOGGER.debug("Found exercise container %r", container_name) | ||||
|  | ||||
|             # Grab each file as you now have the link | ||||
|             for file_link in files: | ||||
|                 # Two divs, side by side. Left is the name, right is the link ==> get left | ||||
|                 # sibling | ||||
|                 file_name = file_link.parent.findPrevious(name="div").getText().strip() | ||||
|                 url = self._abs_url_from_link(file_link) | ||||
|  | ||||
|                 LOGGER.debug("Found file %r at %r", file_name, url) | ||||
|  | ||||
|                 results.append(IliasCrawlerEntry( | ||||
|                     Path(element_path, container_name, file_name), | ||||
|                     url, | ||||
|                     IliasElementType.REGULAR_FILE, | ||||
|                     None  # We do not have any timestamp | ||||
|                 )) | ||||
|  | ||||
|         return results | ||||
|  | ||||
|     def _get_page(self, url: str, params: Dict[str, Any], | ||||
|                   retry_count: int = 0) -> bs4.BeautifulSoup: | ||||
|         """ | ||||
|         Fetches a page from ILIAS, authenticating when needed. | ||||
|         """ | ||||
|  | ||||
|         if retry_count >= 4: | ||||
|             raise FatalException("Could not get a proper page after 4 tries. " | ||||
|                                  "Maybe your URL is wrong, authentication fails continuously, " | ||||
|                                  "your ILIAS connection is spotty or ILIAS is not well.") | ||||
|  | ||||
|         LOGGER.debug("Fetching %r", url) | ||||
|  | ||||
|         response = self._session.get(url, params=params) | ||||
|         content_type = response.headers["content-type"] | ||||
|  | ||||
|         if not content_type.startswith("text/html"): | ||||
|             raise FatalException( | ||||
|                 f"Invalid content type {content_type} when crawling ilias page" | ||||
|                 " {url!r} with {params!r}" | ||||
|             ) | ||||
|  | ||||
|         soup = soupify(response) | ||||
|  | ||||
|         if self._is_logged_in(soup): | ||||
|             return soup | ||||
|  | ||||
|         LOGGER.info("Not authenticated, changing that...") | ||||
|  | ||||
|         self._authenticator.authenticate(self._session) | ||||
|  | ||||
|         return self._get_page(url, params, retry_count + 1) | ||||
|  | ||||
|     @staticmethod | ||||
|     def _is_logged_in(soup: bs4.BeautifulSoup) -> bool: | ||||
|         # Normal ILIAS pages | ||||
|         userlog = soup.find("li", {"id": "userlog"}) | ||||
|         if userlog is not None: | ||||
|             LOGGER.debug("Auth: Found #userlog") | ||||
|             return True | ||||
|         # Video listing embeds do not have complete ILIAS html. Try to match them by | ||||
|         # their video listing table | ||||
|         video_table = soup.find( | ||||
|             recursive=True, | ||||
|             name="table", | ||||
|             attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} | ||||
|         ) | ||||
|         if video_table is not None: | ||||
|             LOGGER.debug("Auth: Found #tbl_xoct.+") | ||||
|             return True | ||||
|         # The individual video player wrapper page has nothing of the above. | ||||
|         # Match it by its playerContainer. | ||||
|         if soup.select_one("#playerContainer") is not None: | ||||
|             LOGGER.debug("Auth: Found #playerContainer") | ||||
|             return True | ||||
|         return False | ||||
| @@ -1,51 +0,0 @@ | ||||
| """ | ||||
| Helper methods to demangle an ILIAS date. | ||||
| """ | ||||
|  | ||||
| import datetime | ||||
| import locale | ||||
| import logging | ||||
| import re | ||||
| from typing import Optional | ||||
|  | ||||
| from ..logging import PrettyLogger | ||||
|  | ||||
| LOGGER = logging.getLogger(__name__) | ||||
| PRETTY = PrettyLogger(LOGGER) | ||||
|  | ||||
|  | ||||
def demangle_date(date: str) -> Optional[datetime.datetime]:
    """
    Demangle a given date in one of the following formats:
    "Gestern, HH:MM"
    "Heute, HH:MM"
    "Morgen, HH:MM"
    "dd. mon yyyy, HH:MM"

    The English words "Yesterday", "Today" and "Tomorrow" are understood as well. The
    relative day names are matched case-insensitively and runs of whitespace are
    collapsed before parsing.

    The process locale is switched to 'de_DE.UTF-8' while parsing and restored
    afterwards. If that locale is unavailable a warning is printed and parsing continues
    with the current locale.

    Returns the parsed date, or None (after printing a warning) if it could not be parsed.
    """
    saved = locale.setlocale(locale.LC_ALL)
    try:
        try:
            locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
        except locale.Error:
            PRETTY.warning(
                "Could not set language to german. Assuming you use english everywhere."
            )

        date = re.sub(r"\s+", " ", date)
        # The flags have to be passed by keyword: the fourth positional parameter of
        # re.sub is "count", so a positional re.I silently turns into count=2
        date = re.sub("Gestern|Yesterday", _yesterday().strftime("%d. %b %Y"), date, flags=re.I)
        date = re.sub("Heute|Today", datetime.date.today().strftime("%d. %b %Y"), date, flags=re.I)
        date = re.sub("Morgen|Tomorrow", _tomorrow().strftime("%d. %b %Y"), date, flags=re.I)
        return datetime.datetime.strptime(date, "%d. %b %Y, %H:%M")
    except ValueError:
        PRETTY.warning(f"Could not parse date {date!r}")
        return None
    finally:
        # Always undo the locale change, no matter how parsing went
        locale.setlocale(locale.LC_ALL, saved)
|  | ||||
|  | ||||
| def _yesterday() -> datetime.date: | ||||
|     return datetime.date.today() - datetime.timedelta(days=1) | ||||
|  | ||||
|  | ||||
| def _tomorrow() -> datetime.date: | ||||
|     return datetime.date.today() + datetime.timedelta(days=1) | ||||
| @@ -1,162 +0,0 @@ | ||||
| """Contains a downloader for ILIAS.""" | ||||
|  | ||||
| import datetime | ||||
| import logging | ||||
| import math | ||||
| import os | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Callable, List, Optional, Union | ||||
|  | ||||
| import bs4 | ||||
| import requests | ||||
|  | ||||
| from ..logging import PrettyLogger | ||||
| from ..organizer import Organizer | ||||
| from ..tmp_dir import TmpDir | ||||
| from ..transform import Transformable | ||||
| from ..utils import soupify, stream_to_path | ||||
| from .authenticators import IliasAuthenticator | ||||
|  | ||||
| LOGGER = logging.getLogger(__name__) | ||||
| PRETTY = PrettyLogger(LOGGER) | ||||
|  | ||||
|  | ||||
class ContentTypeException(Exception):
    """
    Raised when an ILIAS element has a content type that can not be handled.
    """
|  | ||||
|  | ||||
class IliasDownloadInfo(Transformable):
    """
    Describes one file that is to be downloaded.
    """

    def __init__(
            self,
            path: PurePath,
            url: Union[str, Callable[[], Optional[str]]],
            modifcation_date: Optional[datetime.datetime]
    ):
        """
        Create a new download info.

        Arguments:
            path: the path of the file, handed on to Transformable
            url: either the url itself or a function that returns it (or None) when called
            modifcation_date: the modification date of the file, if one is known
        """
        super().__init__(path)
        self.modification_date = modifcation_date

        # The url is always stored as a zero-argument function, so a plain string is
        # wrapped into one
        self.url: Callable[[], Optional[str]]
        if not isinstance(url, str):
            self.url = url
        else:
            fixed_url: str = url
            self.url = lambda: fixed_url
|  | ||||
|  | ||||
| IliasDownloadStrategy = Callable[[Organizer, IliasDownloadInfo], bool] | ||||
|  | ||||
|  | ||||
def download_everything(organizer: Organizer, info: IliasDownloadInfo) -> bool:
    # pylint: disable=unused-argument
    """
    Download strategy that accepts every file. Both arguments are ignored.
    """
    return True
|  | ||||
|  | ||||
def download_modified_or_new(organizer: Organizer, info: IliasDownloadInfo) -> bool:
    """
    Download strategy that accepts files which do not exist locally yet, files without
    a known modification date and files whose modification date is more recent than
    that of the local copy.
    """
    local_file = organizer.resolve(info.path)
    remote_date = info.modification_date

    # Without a local copy or a remote date there is nothing to compare
    if not (local_file.exists() and remote_date is not None):
        return True

    local_mtime_seconds = local_file.stat().st_mtime

    if remote_date.timestamp() <= local_mtime_seconds:
        PRETTY.ignored_file(info.path, "local file has newer or equal modification time")
        return False

    # The remote file is newer
    return True
|  | ||||
|  | ||||
class IliasDownloader:
    # pylint: disable=too-many-arguments
    """
    A downloader for ILIAS.

    Files are streamed into a temporary file first and then handed to the organizer.
    A download strategy decides for every file whether it is fetched at all.
    """

    def __init__(
            self,
            tmp_dir: TmpDir,
            organizer: Organizer,
            session: requests.Session,
            authenticator: IliasAuthenticator,
            strategy: IliasDownloadStrategy,
            timeout: int = 5
    ):
        """
        Create a new IliasDownloader.

        The timeout applies to the download request only, as bwcloud uses IPv6
        and requests has a problem with that: https://github.com/psf/requests/issues/5522
        """

        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._session = session
        self._authenticator = authenticator
        self._strategy = strategy
        self._timeout = timeout

    def download_all(self, infos: List[IliasDownloadInfo]) -> None:
        """
        Download multiple files one after the other.
        """

        for info in infos:
            self.download(info)

    def download(self, info: IliasDownloadInfo) -> None:
        """
        Download a file from ILIAS.

        Files rejected by the strategy are only marked in the organizer. For accepted
        files the modification time of the resulting file is set to the (rounded up)
        modification date of the info, if both are available.

        Retries authentication until eternity if it could not fetch the file.
        """

        LOGGER.debug("Downloading %r", info)
        if not self._strategy(self._organizer, info):
            self._organizer.mark(info.path)
            return

        tmp_file = self._tmp_dir.new_path()

        # A failed attempt means we were served a login page, so authenticate and retry
        while not self._try_download(info, tmp_file):
            LOGGER.info("Retrying download: %r", info)
            self._authenticator.authenticate(self._session)

        dst_path = self._organizer.accept_file(tmp_file, info.path)
        if dst_path and info.modification_date:
            mtime = math.ceil(info.modification_date.timestamp())
            os.utime(dst_path, times=(mtime, mtime))

    def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool:
        """
        Try to stream the file behind the info into target.

        Returns False if a web page without content disposition was received while not
        being logged in, which tells the caller to authenticate and try again. Returns
        True otherwise.

        Raises a ContentTypeException if such a web page was received although we are
        logged in.
        """
        url = info.url()
        if url is None:
            PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
            # NOTE(review): returning True ends the retry loop in download(), which then
            # hands a temporary file that was never written to the organizer - confirm
            # that the organizer copes with that
            return True

        with self._session.get(url, stream=True, timeout=self._timeout) as response:
            # A response without a content-type header must not fail with a KeyError,
            # it is treated as a file
            content_type = response.headers.get("content-type", "")
            has_content_disposition = "content-disposition" in response.headers

            if content_type.startswith("text/html") and not has_content_disposition:
                if self._is_logged_in(soupify(response)):
                    raise ContentTypeException("Attempting to download a web page, not a file")

                return False

            # Yay, we got the file :)
            stream_to_path(response, target, info.path.name)
            return True

    @staticmethod
    def _is_logged_in(soup: bs4.BeautifulSoup) -> bool:
        """
        Checks whether the page contains the "userlog" list item.
        """
        userlog = soup.find("li", {"id": "userlog"})
        return userlog is not None
							
								
								
									
										151
									
								
								PFERD/ipd.py
									
									
									
									
									
								
							
							
						
						
									
										151
									
								
								PFERD/ipd.py
									
									
									
									
									
								
							| @@ -1,151 +0,0 @@ | ||||
| """ | ||||
| Utility functions and a scraper/downloader for the IPD pages. | ||||
| """ | ||||
| import datetime | ||||
| import logging | ||||
| import math | ||||
| import os | ||||
| from dataclasses import dataclass | ||||
| from pathlib import Path | ||||
| from typing import Callable, List, Optional | ||||
| from urllib.parse import urljoin | ||||
|  | ||||
| import bs4 | ||||
| import requests | ||||
|  | ||||
| from PFERD.errors import FatalException | ||||
| from PFERD.utils import soupify | ||||
|  | ||||
| from .logging import PrettyLogger | ||||
| from .organizer import Organizer | ||||
| from .tmp_dir import TmpDir | ||||
| from .transform import Transformable | ||||
| from .utils import stream_to_path | ||||
|  | ||||
| LOGGER = logging.getLogger(__name__) | ||||
| PRETTY = PrettyLogger(LOGGER) | ||||
|  | ||||
|  | ||||
@dataclass
class IpdDownloadInfo(Transformable):
    """
    Describes a single file found on an IPD page.
    """
    # The address the file can be downloaded from
    url: str
    # The modification date of the file, or None if it is not known
    modification_date: Optional[datetime.datetime]
|  | ||||
|  | ||||
| IpdDownloadStrategy = Callable[[Organizer, IpdDownloadInfo], bool] | ||||
|  | ||||
|  | ||||
def ipd_download_new_or_modified(organizer: Organizer, info: IpdDownloadInfo) -> bool:
    """
    Download strategy that accepts files which do not exist locally yet and files whose
    modification date is more recent than that of the local copy.

    An existing file without a known modification date is not downloaded again.
    """
    local_file = organizer.resolve(info.path)
    if not local_file.exists():
        return True

    remote_date = info.modification_date
    if remote_date:
        local_mtime_seconds = local_file.stat().st_mtime
        # Download if the info is newer
        if remote_date.timestamp() > local_mtime_seconds:
            return True
        reason = "local file has newer or equal modification time"
    else:
        reason = "could not find modification time, file exists"

    PRETTY.ignored_file(info.path, reason)
    return False
|  | ||||
|  | ||||
class IpdCrawler:
    # pylint: disable=too-few-public-methods
    """
    A crawler for IPD pages.

    It collects all links on a page that point to pdf files.
    """

    def __init__(self, base_url: str):
        """
        Create a new crawler for the page at base_url.
        """
        self._base_url = base_url

    def _abs_url_from_link(self, link_tag: bs4.Tag) -> str:
        """
        Create an absolute url from an <a> tag.
        """
        return urljoin(self._base_url, link_tag.get("href"))

    @staticmethod
    def _modification_date_of(link: bs4.Tag) -> Optional[datetime.datetime]:
        """
        Read the modification date from the first cell of the table row enclosing the link.

        Returns None if the link is not inside a table row, if the row has no cell or if
        the text of the cell is not a date of the form "dd.mm.yyyy".
        """
        enclosing_row = link.findParent(name="tr")
        if enclosing_row is None:
            return None

        date_cell = enclosing_row.find(name="td")
        if date_cell is None:
            return None

        try:
            # strptime rejects surrounding whitespace, so it is removed first
            return datetime.datetime.strptime(date_cell.text.strip(), "%d.%m.%Y")
        except ValueError:
            return None

    def crawl(self) -> List[IpdDownloadInfo]:
        """
        Crawls the page given in the constructor.

        Returns one download info per link whose href ends with "pdf". The path of each
        info is the last segment of the href.
        """
        page = soupify(requests.get(self._base_url))

        items: List[IpdDownloadInfo] = []

        for link in page.findAll(name="a", attrs={"href": lambda x: x and x.endswith("pdf")}):
            href: str = link.attrs.get("href")
            name = href.split("/")[-1]

            items.append(IpdDownloadInfo(
                Path(name),
                url=self._abs_url_from_link(link),
                modification_date=self._modification_date_of(link)
            ))

        return items
|  | ||||
|  | ||||
class IpdDownloader:
    """
    Downloads the files described by ipd download infos.
    """

    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: IpdDownloadStrategy):
        """
        Create a new downloader that uses its own requests session.
        """
        self._session = requests.session()
        self._strategy = strategy
        self._organizer = organizer
        self._tmp_dir = tmp_dir

    def download_all(self, infos: List[IpdDownloadInfo]) -> None:
        """
        Download the given files in order, one at a time.
        """
        for entry in infos:
            self.download(entry)

    def download(self, info: IpdDownloadInfo) -> None:
        """
        Download a single file.

        Files rejected by the strategy are only marked in the organizer. A response
        status other than 200 or 403 only produces a warning.

        Raises a FatalException if the server answers with status 403.
        """
        if not self._strategy(self._organizer, info):
            self._organizer.mark(info.path)
            return

        with self._session.get(info.url, stream=True) as response:
            status = response.status_code

            if status == 403:
                raise FatalException("Received 403. Are you not using the KIT VPN?")

            if status != 200:
                PRETTY.warning(f"Could not download file, got response {status}")
                return

            tmp_file = self._tmp_dir.new_path()
            stream_to_path(response, tmp_file, info.path.name)
            dst_path = self._organizer.accept_file(tmp_file, info.path)

            if not (dst_path and info.modification_date):
                return

            # Carry the remote modification date over to the local file
            mtime = math.ceil(info.modification_date.timestamp())
            os.utime(dst_path, times=(mtime, mtime))
							
								
								
									
										97
									
								
								PFERD/limiter.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								PFERD/limiter.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,97 @@ | ||||
| import asyncio | ||||
| import time | ||||
| from contextlib import asynccontextmanager | ||||
| from dataclasses import dataclass | ||||
| from typing import AsyncIterator, Optional | ||||
|  | ||||
|  | ||||
@dataclass
class Slot:
    """
    One of the places a task can occupy in a Limiter.
    """
    # Whether a task currently occupies this slot
    active: bool = False
    # time.monotonic() value of the moment the slot was last released, None if it
    # has never been released
    last_left: Optional[float] = None


class Limiter:
    """
    Limits the number of crawl and download tasks that run at the same time.

    Every task occupies one slot while it runs. Download tasks additionally count
    against a separate download limit. Before the body of a task runs in a slot, at
    least task_delay seconds must have passed since that slot was last released.
    """

    def __init__(
            self,
            task_limit: int,
            download_limit: int,
            task_delay: float
    ):
        """
        Create a new limiter.

        Arguments:
            task_limit: maximum number of tasks (crawls and downloads) running at once
            download_limit: maximum number of downloads running at once
            task_delay: minimum number of seconds between two uses of the same slot

        Raises a ValueError if a limit is smaller than 1, if the download limit exceeds
        the task limit or if the delay is negative.
        """
        if task_limit <= 0:
            raise ValueError("task limit must be at least 1")
        if download_limit <= 0:
            raise ValueError("download limit must be at least 1")
        if download_limit > task_limit:
            raise ValueError("download limit can't be greater than task limit")
        if task_delay < 0:
            raise ValueError("Task delay must not be negative")

        self._slots = [Slot() for _ in range(task_limit)]
        self._downloads = download_limit
        self._delay = task_delay

        self._condition = asyncio.Condition()

    def _acquire_slot(self) -> Optional[Slot]:
        """
        Mark the first free slot as active and return it, or return None if all slots
        are taken.
        """
        for slot in self._slots:
            if not slot.active:
                slot.active = True
                return slot

        return None

    async def _wait_for_slot_delay(self, slot: Slot) -> None:
        """
        Sleep until the task delay since the slot's last release has passed.
        """
        if slot.last_left is not None:
            # The monotonic clock is not affected by adjustments of the system time
            delay = slot.last_left + self._delay - time.monotonic()
            if delay > 0:
                await asyncio.sleep(delay)

    def _release_slot(self, slot: Slot) -> None:
        """
        Record the release time of the slot and mark it as free.
        """
        slot.last_left = time.monotonic()
        slot.active = False

    async def _claim_slot(self, download: bool) -> Slot:
        """
        Wait until a slot (and, for downloads, download capacity) is free and claim it.
        """
        async with self._condition:
            while True:
                if not download or self._downloads > 0:
                    slot = self._acquire_slot()
                    if slot is not None:
                        if download:
                            self._downloads -= 1
                        return slot

                await self._condition.wait()

    @asynccontextmanager
    async def _limit(self, download: bool) -> AsyncIterator[None]:
        """
        Shared implementation of limit_crawl and limit_download.
        """
        slot = await self._claim_slot(download)

        try:
            # The delay is awaited inside the try block so that the slot is handed back
            # even if the task is cancelled while it is still sleeping
            await self._wait_for_slot_delay(slot)
            yield
        finally:
            async with self._condition:
                self._release_slot(slot)
                if download:
                    self._downloads += 1
                self._condition.notify_all()

    @asynccontextmanager
    async def limit_crawl(self) -> AsyncIterator[None]:
        """
        Occupy a task slot for the duration of the "async with" block.
        """
        async with self._limit(download=False):
            yield

    @asynccontextmanager
    async def limit_download(self) -> AsyncIterator[None]:
        """
        Occupy a task slot and count against the download limit for the duration of the
        "async with" block.
        """
        async with self._limit(download=True):
            yield
| @@ -1,41 +0,0 @@ | ||||
| """ | ||||
| Contains a Location class for objects with an inherent path. | ||||
| """ | ||||
|  | ||||
| from pathlib import Path, PurePath | ||||
|  | ||||
|  | ||||
class ResolveException(Exception):
    """
    Raised when a file can not be resolved.
    """
    # TODO take care of this when doing exception handling
|  | ||||
|  | ||||
class Location:
    """
    Wraps a path and resolves other paths relative to it.
    """

    def __init__(self, path: Path):
        """
        Create a new location. The given path is resolved once and stored.
        """
        self._path = path.resolve()

    @property
    def path(self) -> Path:
        """
        The resolved path of this location.
        """

        return self._path

    def resolve(self, target: PurePath) -> Path:
        """
        Resolve a file relative to the path of this location.

        Raises a [ResolveException] if the resolved file is not below the path of this
        location.
        """
        base = self.path
        candidate = (base / target).resolve()

        # TODO Make this less inefficient
        if base in candidate.parents:
            return candidate

        raise ResolveException(f"Path {target} is not inside directory {base}")
							
								
								
									
										360
									
								
								PFERD/logging.py
									
									
									
									
									
								
							
							
						
						
									
										360
									
								
								PFERD/logging.py
									
									
									
									
									
								
							| @@ -1,187 +1,271 @@ | ||||
| """ | ||||
| Contains a few logger utility functions and implementations. | ||||
| """ | ||||
| import asyncio | ||||
| import sys | ||||
| import traceback | ||||
| from contextlib import asynccontextmanager, contextmanager | ||||
| # TODO In Python 3.9 and above, ContextManager is deprecated | ||||
| from typing import AsyncIterator, ContextManager, Iterator, List, Optional | ||||
|  | ||||
| import logging | ||||
| from pathlib import Path | ||||
| from typing import List, Optional | ||||
|  | ||||
| from rich import print as rich_print | ||||
| from rich._log_render import LogRender | ||||
| from rich.console import Console | ||||
| from rich.console import Console, Group | ||||
| from rich.live import Live | ||||
| from rich.markup import escape | ||||
| from rich.panel import Panel | ||||
| from rich.style import Style | ||||
| from rich.text import Text | ||||
| from rich.theme import Theme | ||||
|  | ||||
| from .download_summary import DownloadSummary | ||||
| from .utils import PathLike, to_path | ||||
|  | ||||
| STYLE = "{" | ||||
| FORMAT = "[{levelname:<7}] {message}" | ||||
| DATE_FORMAT = "%F %T" | ||||
| from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID, TextColumn, TimeRemainingColumn, | ||||
|                            TransferSpeedColumn) | ||||
| from rich.table import Column | ||||
|  | ||||
|  | ||||
| def enable_logging(name: str = "PFERD", level: int = logging.INFO) -> None: | ||||
|     """ | ||||
|     Enable and configure logging via the logging module. | ||||
|     """ | ||||
| class ProgressBar: | ||||
|     def __init__(self, progress: Progress, taskid: TaskID): | ||||
|         self._progress = progress | ||||
|         self._taskid = taskid | ||||
|  | ||||
|     logger = logging.getLogger(name) | ||||
|     logger.setLevel(level) | ||||
|     logger.addHandler(RichLoggingHandler(level=level)) | ||||
|     def advance(self, amount: float = 1) -> None: | ||||
|         self._progress.advance(self._taskid, advance=amount) | ||||
|  | ||||
|     # This should be logged by our own handler, and not the root logger's | ||||
|     # default handler, so we don't pass it on to the root logger. | ||||
|     logger.propagate = False | ||||
|     def set_total(self, total: float) -> None: | ||||
|         self._progress.update(self._taskid, total=total) | ||||
|         self._progress.start_task(self._taskid) | ||||
|  | ||||
|  | ||||
| class RichLoggingHandler(logging.Handler): | ||||
|     """ | ||||
|     A logging handler that uses rich for highlighting | ||||
|     """ | ||||
| class Log: | ||||
|     STATUS_WIDTH = 11 | ||||
|  | ||||
|     def __init__(self, level: int) -> None: | ||||
|         super().__init__(level=level) | ||||
|         self.console = Console(theme=Theme({ | ||||
|             "logging.level.warning": Style(color="yellow") | ||||
|         })) | ||||
|         self._log_render = LogRender(show_level=True, show_time=False, show_path=False) | ||||
|     def __init__(self) -> None: | ||||
|         self.console = Console(highlight=False) | ||||
|  | ||||
|     def emit(self, record: logging.LogRecord) -> None: | ||||
|         """ | ||||
|         Invoked by logging. | ||||
|         """ | ||||
|         log_style = f"logging.level.{record.levelname.lower()}" | ||||
|         message = self.format(record) | ||||
|  | ||||
|         level = Text() | ||||
|         level.append(record.levelname, log_style) | ||||
|         message_text = Text.from_markup(message) | ||||
|  | ||||
|         self.console.print( | ||||
|             self._log_render( | ||||
|                 self.console, | ||||
|                 [message_text], | ||||
|                 level=level, | ||||
|             ) | ||||
|         self._crawl_progress = Progress( | ||||
|             TextColumn("{task.description}", table_column=Column(ratio=1)), | ||||
|             BarColumn(), | ||||
|             TimeRemainingColumn(), | ||||
|             expand=True, | ||||
|         ) | ||||
|         self._download_progress = Progress( | ||||
|             TextColumn("{task.description}", table_column=Column(ratio=1)), | ||||
|             TransferSpeedColumn(), | ||||
|             DownloadColumn(), | ||||
|             BarColumn(), | ||||
|             TimeRemainingColumn(), | ||||
|             expand=True, | ||||
|         ) | ||||
|  | ||||
|         self._live = Live(console=self.console, transient=True) | ||||
|         self._update_live() | ||||
|  | ||||
| class PrettyLogger: | ||||
|     """ | ||||
|     A logger that prints some specially formatted log messages in color. | ||||
|     """ | ||||
|         self._showing_progress = False | ||||
|         self._progress_suspended = False | ||||
|         self._lock = asyncio.Lock() | ||||
|         self._lines: List[str] = [] | ||||
|  | ||||
|     def __init__(self, logger: logging.Logger) -> None: | ||||
|         self.logger = logger | ||||
|         # Whether different parts of the output are enabled or disabled | ||||
|         self.output_explain = False | ||||
|         self.output_status = True | ||||
|         self.output_report = True | ||||
|  | ||||
|     @staticmethod | ||||
|     def _format_path(path: PathLike) -> str: | ||||
|         return repr(str(to_path(path))) | ||||
|     def _update_live(self) -> None: | ||||
|         elements = [] | ||||
|         if self._crawl_progress.task_ids: | ||||
|             elements.append(self._crawl_progress) | ||||
|         if self._download_progress.task_ids: | ||||
|             elements.append(self._download_progress) | ||||
|  | ||||
|     def error(self, message: str) -> None: | ||||
|         group = Group(*elements) | ||||
|         self._live.update(group) | ||||
|  | ||||
|     @contextmanager | ||||
|     def show_progress(self) -> Iterator[None]: | ||||
|         if self._showing_progress: | ||||
|             raise RuntimeError("Calling 'show_progress' while already showing progress") | ||||
|  | ||||
|         self._showing_progress = True | ||||
|         try: | ||||
|             with self._live: | ||||
|                 yield | ||||
|         finally: | ||||
|             self._showing_progress = False | ||||
|  | ||||
|     @asynccontextmanager | ||||
|     async def exclusive_output(self) -> AsyncIterator[None]: | ||||
|         if not self._showing_progress: | ||||
|             raise RuntimeError("Calling 'exclusive_output' while not showing progress") | ||||
|  | ||||
|         async with self._lock: | ||||
|             self._progress_suspended = True | ||||
|             self._live.stop() | ||||
|             try: | ||||
|                 yield | ||||
|             finally: | ||||
|                 self._live.start() | ||||
|                 self._progress_suspended = False | ||||
|                 for line in self._lines: | ||||
|                     self.print(line) | ||||
|                 self._lines = [] | ||||
|  | ||||
|     def unlock(self) -> None: | ||||
|         """ | ||||
|         Print an error message indicating some operation fatally failed. | ||||
|         """ | ||||
|         self.logger.error( | ||||
|             f"[bold red]{message}[/bold red]" | ||||
|         ) | ||||
|         Get rid of an exclusive output state. | ||||
|  | ||||
|     def warning(self, message: str) -> None: | ||||
|         """ | ||||
|         Print a warning message indicating some operation failed, but the error can be recovered | ||||
|         or ignored. | ||||
|         """ | ||||
|         self.logger.warning( | ||||
|             f"[bold yellow]{message}[/bold yellow]" | ||||
|         ) | ||||
|  | ||||
|     def modified_file(self, path: PathLike) -> None: | ||||
|         """ | ||||
|         An existing file has changed. | ||||
|         This function is meant to let PFERD print log messages after the event | ||||
|         loop was forcibly stopped and if it will not be started up again. After | ||||
|         this is called, it is not safe to use any functions except the logging | ||||
|         functions (print, warn, ...). | ||||
|         """ | ||||
|  | ||||
|         self.logger.info( | ||||
|             f"[bold magenta]Modified {self._format_path(path)}.[/bold magenta]" | ||||
|         ) | ||||
|         self._progress_suspended = False | ||||
|         for line in self._lines: | ||||
|             self.print(line) | ||||
|  | ||||
|     def new_file(self, path: PathLike) -> None: | ||||
|     def print(self, text: str) -> None: | ||||
|         """ | ||||
|         A new file has been downloaded. | ||||
|         Print a normal message. Allows markup. | ||||
|         """ | ||||
|  | ||||
|         self.logger.info( | ||||
|             f"[bold green]Created {self._format_path(path)}.[/bold green]" | ||||
|         ) | ||||
|         if self._progress_suspended: | ||||
|             self._lines.append(text) | ||||
|         else: | ||||
|             self.console.print(text) | ||||
|  | ||||
|     def deleted_file(self, path: PathLike) -> None: | ||||
|     # TODO Print errors (and warnings?) to stderr | ||||
|  | ||||
|     def warn(self, text: str) -> None: | ||||
|         """ | ||||
|         A file has been deleted. | ||||
|         Print a warning message. Allows no markup. | ||||
|         """ | ||||
|  | ||||
|         self.logger.info( | ||||
|             f"[bold red]Deleted {self._format_path(path)}.[/bold red]" | ||||
|         ) | ||||
|         self.print(f"[bold bright_red]Warning[/] {escape(text)}") | ||||
|  | ||||
|     def ignored_file(self, path: PathLike, reason: str) -> None: | ||||
|     def warn_contd(self, text: str) -> None: | ||||
|         """ | ||||
|         File was not downloaded or modified. | ||||
|         Print further lines of a warning message. Allows no markup. | ||||
|         """ | ||||
|  | ||||
|         self.logger.info( | ||||
|             f"[dim]Ignored {self._format_path(path)} " | ||||
|             f"([/dim]{reason}[dim]).[/dim]" | ||||
|         ) | ||||
|         self.print(f"{escape(text)}") | ||||
|  | ||||
|     def searching(self, path: PathLike) -> None: | ||||
|     def error(self, text: str) -> None: | ||||
|         """ | ||||
|         A crawler searches a particular object. | ||||
|         Print an error message. Allows no markup. | ||||
|         """ | ||||
|  | ||||
|         self.logger.info(f"Searching {self._format_path(path)}") | ||||
|         self.print(f"[bold bright_red]Error[/] [red]{escape(text)}") | ||||
|  | ||||
|     def not_searching(self, path: PathLike, reason: str) -> None: | ||||
|     def error_contd(self, text: str) -> None: | ||||
|         """ | ||||
|         A crawler does not search a particular object. | ||||
|         Print further lines of an error message. Allows no markup. | ||||
|         """ | ||||
|  | ||||
|         self.logger.info( | ||||
|             f"[dim]Not searching {self._format_path(path)} " | ||||
|             f"([/dim]{reason}[dim]).[/dim]" | ||||
|         ) | ||||
|         self.print(f"[red]{escape(text)}") | ||||
|  | ||||
|     def summary(self, download_summary: DownloadSummary) -> None: | ||||
|     def unexpected_exception(self) -> None: | ||||
|         """ | ||||
|         Prints a download summary. | ||||
|         Call this in an "except" clause to log an unexpected exception. | ||||
|         """ | ||||
|         self.logger.info("") | ||||
|         self.logger.info("[bold cyan]Download Summary[/bold cyan]") | ||||
|         if not download_summary.has_updates(): | ||||
|             self.logger.info("[bold dim]Nothing changed![/bold dim]") | ||||
|             return | ||||
|  | ||||
|         for new_file in download_summary.new_files: | ||||
|             self.new_file(new_file) | ||||
|         for modified_file in download_summary.modified_files: | ||||
|             self.modified_file(modified_file) | ||||
|         for deleted_files in download_summary.deleted_files: | ||||
|             self.deleted_file(deleted_files) | ||||
|         t, v, tb = sys.exc_info() | ||||
|         if t is None or v is None or tb is None: | ||||
|             # We're not currently handling an exception, so somebody probably | ||||
|             # called this function where they shouldn't. | ||||
|             self.error("Something unexpected happened") | ||||
|             self.error_contd("") | ||||
|             for line in traceback.format_stack(): | ||||
|                 self.error_contd(line[:-1])  # Without the newline | ||||
|             self.error_contd("") | ||||
|         else: | ||||
|             self.error("An unexpected exception occurred") | ||||
|             self.error_contd("") | ||||
|             self.error_contd(traceback.format_exc()) | ||||
|  | ||||
|     def starting_synchronizer( | ||||
|         # Our print function doesn't take types other than strings, but the | ||||
|         # underlying rich.print function does. This call is a special case | ||||
|         # anyways, and we're calling it internally, so this should be fine. | ||||
|         self.print(Panel.fit(""" | ||||
| Please copy your program output and send it to the PFERD maintainers, either | ||||
| directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new | ||||
|         """.strip()))  # type: ignore | ||||
|  | ||||
|     def explain_topic(self, text: str) -> None: | ||||
|         """ | ||||
|         Print a top-level explain text. Allows no markup. | ||||
|         """ | ||||
|  | ||||
|         if self.output_explain: | ||||
|             self.print(f"[yellow]{escape(text)}") | ||||
|  | ||||
|     def explain(self, text: str) -> None: | ||||
|         """ | ||||
|         Print an indented explain text. Allows no markup. | ||||
|         """ | ||||
|  | ||||
|         if self.output_explain: | ||||
|             self.print(f"  {escape(text)}") | ||||
|  | ||||
|     def status(self, style: str, action: str, text: str, suffix: str = "") -> None: | ||||
|         """ | ||||
|         Print a status update while crawling. Allows markup in the "style" | ||||
|         argument which will be applied to the "action" string. | ||||
|         """ | ||||
|  | ||||
|         if self.output_status: | ||||
|             action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||
|             self.print(f"{style}{action}[/] {escape(text)} {suffix}") | ||||
|  | ||||
|     def report(self, text: str) -> None: | ||||
|         """ | ||||
|         Print a report after crawling. Allows markup. | ||||
|         """ | ||||
|  | ||||
|         if self.output_report: | ||||
|             self.print(text) | ||||
|  | ||||
|     @contextmanager | ||||
|     def _bar( | ||||
|             self, | ||||
|             target_directory: PathLike, | ||||
|             synchronizer_name: str, | ||||
|             subject: Optional[str] = None, | ||||
|     ) -> None: | ||||
|             progress: Progress, | ||||
|             description: str, | ||||
|             total: Optional[float], | ||||
|     ) -> Iterator[ProgressBar]: | ||||
|         if total is None: | ||||
|             # Indeterminate progress bar | ||||
|             taskid = progress.add_task(description, start=False) | ||||
|         else: | ||||
|             taskid = progress.add_task(description, total=total) | ||||
|         self._update_live() | ||||
|  | ||||
|         try: | ||||
|             yield ProgressBar(progress, taskid) | ||||
|         finally: | ||||
|             progress.remove_task(taskid) | ||||
|             self._update_live() | ||||
|  | ||||
|     def crawl_bar( | ||||
|             self, | ||||
|             style: str, | ||||
|             action: str, | ||||
|             text: str, | ||||
|             total: Optional[float] = None, | ||||
|     ) -> ContextManager[ProgressBar]: | ||||
|         """ | ||||
|         A special message marking that a synchronizer has been started. | ||||
|         Allows markup in the "style" argument which will be applied to the | ||||
|         "action" string. | ||||
|         """ | ||||
|  | ||||
|         subject_str = f"{subject} " if subject else "" | ||||
|         self.logger.info("") | ||||
|         self.logger.info(( | ||||
|             f"[bold cyan]Synchronizing " | ||||
|             f"{subject_str}to {self._format_path(target_directory)} " | ||||
|             f"using the {synchronizer_name} synchronizer.[/bold cyan]" | ||||
|         )) | ||||
|         action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||
|         description = f"{style}{action}[/] {text}" | ||||
|         return self._bar(self._crawl_progress, description, total) | ||||
|  | ||||
|     def download_bar( | ||||
|             self, | ||||
|             style: str, | ||||
|             action: str, | ||||
|             text: str, | ||||
|             total: Optional[float] = None, | ||||
|     ) -> ContextManager[ProgressBar]: | ||||
|         """ | ||||
|         Allows markup in the "style" argument which will be applied to the | ||||
|         "action" string. | ||||
|         """ | ||||
|  | ||||
|         action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||
|         description = f"{style}{action}[/] {text}" | ||||
|         return self._bar(self._download_progress, description, total) | ||||
|  | ||||
|  | ||||
# Module-level Log instance; other modules import it via "from .logging import log".
log = Log()
|   | ||||
| @@ -1,149 +0,0 @@ | ||||
| """A simple helper for managing downloaded files. | ||||
|  | ||||
| An organizer is bound to a single directory. | ||||
| """ | ||||
|  | ||||
| import filecmp | ||||
| import logging | ||||
| import os | ||||
| import shutil | ||||
| from pathlib import Path, PurePath | ||||
| from typing import List, Optional, Set | ||||
|  | ||||
| from .download_summary import DownloadSummary | ||||
| from .location import Location | ||||
| from .logging import PrettyLogger | ||||
| from .utils import prompt_yes_no | ||||
|  | ||||
# Module logger (used for debug output) and the PrettyLogger wrapped around it
# (used for warnings and per-file messages).
LOGGER = logging.getLogger(__name__)
PRETTY = PrettyLogger(LOGGER)
|  | ||||
|  | ||||
class FileAcceptException(Exception):
    """Raised when a source file cannot be accepted into an organizer."""
|  | ||||
|  | ||||
class Organizer(Location):
    """
    A helper for managing downloaded files in a single directory.

    Files passed to accept_file() or mark() are remembered as known files.
    cleanup() later offers to delete everything below the organizer's
    directory that is not known.
    """

    def __init__(self, path: Path):
        """Create a new organizer for a given path."""
        super().__init__(path)

        # The root dir itself is always known, so cleanup never removes it.
        self._known_files: Set[Path] = {path.resolve()}

        self.download_summary = DownloadSummary()

    def accept_file(self, src: Path, dst: PurePath) -> Optional[Path]:
        """
        Move a file to this organizer and mark it.

        Returns the path the file was moved to, to allow the caller to adjust the metadata.
        As you might still need to adjust the metadata when the file was identical
        (e.g. update the timestamp), the path is also returned in this case.
        In all other cases (ignored, not overwritten, etc.) this method returns None.

        Raises a FileAcceptException if the source does not exist or is not a file.
        """
        # Windows limits the path length to 260 for *some* historical reason
        # If you want longer paths, you will have to add the "\\?\" prefix in front of
        # your path...
        # See:
        # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation
        if os.name == 'nt':
            src_absolute = Path("\\\\?\\" + str(src.resolve()))
            dst_absolute = Path("\\\\?\\" + str(self.resolve(dst)))
        else:
            src_absolute = src.resolve()
            dst_absolute = self.resolve(dst)

        if not src_absolute.exists():
            raise FileAcceptException("Source file does not exist")

        if not src_absolute.is_file():
            raise FileAcceptException("Source is a directory")

        LOGGER.debug("Copying %s to %s", src_absolute, dst_absolute)

        if self._is_marked(dst):
            PRETTY.warning(f"File {str(dst_absolute)!r} was already written!")
            if not prompt_yes_no("Overwrite file?", default=False):
                PRETTY.ignored_file(dst_absolute, "file was written previously")
                return None

        # Destination is occupied by a directory
        if dst_absolute.is_dir():
            if not prompt_yes_no(f"Overwrite folder {dst_absolute} with file?", default=False):
                PRETTY.warning(f"Could not add file {str(dst_absolute)!r}")
                return None
            shutil.rmtree(dst_absolute)

        # Must be determined before the move, afterwards the destination
        # always exists.
        replaces_existing = dst_absolute.is_file()

        if replaces_existing and filecmp.cmp(str(src_absolute), str(dst_absolute), shallow=False):
            # Bail out, nothing more to do
            PRETTY.ignored_file(dst_absolute, "same file contents")
            self.mark(dst)
            return dst_absolute

        # Create parent dir if needed
        dst_absolute.parent.mkdir(exist_ok=True, parents=True)

        # Move file
        shutil.move(str(src_absolute), str(dst_absolute))

        # Only report the change once the move has succeeded, so a failed move
        # does not show up in the summary as a new or modified file.
        if replaces_existing:
            self.download_summary.add_modified_file(dst_absolute)
            PRETTY.modified_file(dst_absolute)
        else:
            self.download_summary.add_new_file(dst_absolute)
            PRETTY.new_file(dst_absolute)

        self.mark(dst)

        return dst_absolute

    def mark(self, path: PurePath) -> None:
        """Mark a file as used so it will not get cleaned up."""
        absolute_path = self.resolve(path)
        self._known_files.add(absolute_path)
        LOGGER.debug("Tracked %s", absolute_path)

    def _is_marked(self, path: PurePath) -> bool:
        """Check whether a file is marked."""
        return self.resolve(path) in self._known_files

    def cleanup(self) -> None:
        """Remove all untracked files in the organizer's dir."""
        LOGGER.debug("Deleting all untracked files...")

        self._cleanup(self.path)

    def _cleanup(self, start_dir: Path) -> None:
        """Recursively offer to delete untracked files below start_dir."""
        if not start_dir.exists():
            return

        # Snapshot the directory listing first, entries may get deleted while
        # walking over them.
        for path in list(start_dir.iterdir()):
            if path.is_dir():
                self._cleanup(path)
            elif path.resolve() not in self._known_files:
                self._delete_file_if_confirmed(path)

        # Delete dir if it was empty and untracked
        dir_empty = not any(start_dir.iterdir())
        if start_dir.resolve() not in self._known_files and dir_empty:
            start_dir.rmdir()

    def _delete_file_if_confirmed(self, path: Path) -> None:
        """Ask the user whether to delete path; delete and record it if so."""
        prompt = f"Do you want to delete {path}"

        if prompt_yes_no(prompt, False):
            self.download_summary.add_deleted_file(path)
            path.unlink()
							
								
								
									
										517
									
								
								PFERD/output_dir.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										517
									
								
								PFERD/output_dir.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,517 @@ | ||||
| import filecmp | ||||
| import json | ||||
| import os | ||||
| import random | ||||
| import shutil | ||||
| import string | ||||
| from contextlib import contextmanager | ||||
| from dataclasses import dataclass | ||||
| from datetime import datetime | ||||
| from enum import Enum | ||||
| from pathlib import Path, PurePath | ||||
| from typing import BinaryIO, Iterator, Optional, Tuple | ||||
|  | ||||
| from .logging import log | ||||
| from .report import Report, ReportLoadError | ||||
| from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no | ||||
|  | ||||
# Alphabet the random suffix of a temporary file name is drawn from.
SUFFIX_CHARS = string.ascii_lowercase + string.digits
# Suffix length used for the first attempt; every further attempt uses a
# suffix that is two characters longer.
SUFFIX_LENGTH = 6
# Number of attempts at creating a temporary file before giving up.
TRIES = 5
|  | ||||
|  | ||||
class OutputDirError(Exception):
    """
    Raised when an operation on the output directory fails, e.g. because of a
    forbidden path segment or because a directory or temporary file could not
    be created.
    """
|  | ||||
|  | ||||
class Redownload(Enum):
    """
    Policy deciding whether a file that already exists locally is downloaded
    again. The "smart" variants additionally compare modification times when
    the remote one is known.
    """

    NEVER = "never"
    NEVER_SMART = "never-smart"
    ALWAYS = "always"
    ALWAYS_SMART = "always-smart"

    @staticmethod
    def from_string(string: str) -> "Redownload":
        """
        Parse a policy from its string value.

        Raises a ValueError listing the valid values if the string is unknown.
        """

        try:
            return Redownload(string)
        except ValueError:
            # The enum's own error message adds nothing for the user, so it is
            # deliberately dropped from the exception chain.
            raise ValueError("must be one of 'never', 'never-smart',"
                             " 'always', 'always-smart'") from None
|  | ||||
|  | ||||
class OnConflict(Enum):
    """
    Policy deciding how conflicts between local and remote files are
    resolved: ask the user, prefer the local side, prefer the remote side, or
    never delete anything.
    """

    PROMPT = "prompt"
    LOCAL_FIRST = "local-first"
    REMOTE_FIRST = "remote-first"
    NO_DELETE = "no-delete"

    @staticmethod
    def from_string(string: str) -> "OnConflict":
        """
        Parse a policy from its string value.

        Raises a ValueError listing the valid values if the string is unknown.
        """

        try:
            return OnConflict(string)
        except ValueError:
            # The enum's own error message adds nothing for the user, so it is
            # deliberately dropped from the exception chain.
            raise ValueError("must be one of 'prompt', 'local-first',"
                             " 'remote-first', 'no-delete'") from None
|  | ||||
|  | ||||
@dataclass
class Heuristics:
    """Hints about a remote file used to decide whether to download it."""

    # Modification time of the remote file, or None if it is not known.
    mtime: Optional[datetime]
|  | ||||
class FileSink:
    """
    The binary file a download is written to, together with a flag through
    which the writer signals that the download is complete.
    """

    def __init__(self, file: BinaryIO):
        self._file, self._done = file, False

    @property
    def file(self) -> BinaryIO:
        """The file object to write the downloaded data to."""
        return self._file

    def done(self) -> None:
        """Flag the download as complete."""
        self._done = True

    def is_done(self) -> bool:
        """Return whether done() has been called."""
        return self._done
|  | ||||
|  | ||||
@dataclass
class DownloadInfo:
    """Everything known about one download attempt once its sink is closed."""

    # Path of the file on the remote side (used in the "Downloaded" status line).
    remote_path: PurePath
    # Path of the file relative to the output directory.
    path: PurePath
    # Final location of the file on disk.
    local_path: Path
    # Temporary file the downloaded data was written to.
    tmp_path: Path
    # Hints about the remote file (e.g. its modification time).
    heuristics: Heuristics
    # Conflict policy in effect for this download.
    on_conflict: OnConflict
    # Whether the download was reported as complete.
    success: bool = False
|  | ||||
|  | ||||
class FileSinkToken(ReusableAsyncContextManager[FileSink]):
    """
    Permission to download a single file into the output directory.

    Whenever this class is entered, it creates a new temporary file and
    returns a corresponding FileSink.

    When it is exited again, the file is closed and information about the
    download handed back to the OutputDirectory.
    """

    def __init__(
            self,
            output_dir: "OutputDirectory",
            remote_path: PurePath,
            path: PurePath,
            local_path: Path,
            heuristics: Heuristics,
            on_conflict: OnConflict,
    ):
        super().__init__()

        self._output_dir = output_dir
        self._remote_path = remote_path
        self._path = path
        self._local_path = local_path
        self._heuristics = heuristics
        self._on_conflict = on_conflict

    async def _on_aenter(self) -> FileSink:
        tmp_path, file = await self._output_dir._create_tmp_file(self._local_path)
        sink = FileSink(file)

        async def after_download() -> None:
            # Whether the download succeeded is only known at exit time, so
            # the sink is queried here and not when the callback is created.
            info = DownloadInfo(
                remote_path=self._remote_path,
                path=self._path,
                local_path=self._local_path,
                tmp_path=tmp_path,
                heuristics=self._heuristics,
                on_conflict=self._on_conflict,
                success=sink.is_done(),
            )
            await self._output_dir._after_download(info)

        # NOTE(review): self._stack is presumably an AsyncExitStack, which
        # unwinds in reverse order - the file would then be closed before
        # after_download runs. Keep this registration order.
        self._stack.push_async_callback(after_download)
        self._stack.enter_context(file)

        return sink
|  | ||||
|  | ||||
| class OutputDirectory: | ||||
|     REPORT_FILE = PurePath(".report") | ||||
|  | ||||
|     def __init__( | ||||
|             self, | ||||
|             root: Path, | ||||
|             redownload: Redownload, | ||||
|             on_conflict: OnConflict, | ||||
|     ): | ||||
|         if os.name == "nt": | ||||
|             # Windows limits the path length to 260 for some historical reason. | ||||
|             # If you want longer paths, you will have to add the "\\?\" prefix | ||||
|             # in front of your path. See: | ||||
|             # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation | ||||
|             self._root = Path("\\\\?\\" + str(root.absolute())) | ||||
|         else: | ||||
|             self._root = root | ||||
|  | ||||
|         self._redownload = redownload | ||||
|         self._on_conflict = on_conflict | ||||
|  | ||||
|         self._report_path = self.resolve(self.REPORT_FILE) | ||||
|         self._report = Report() | ||||
|         self._prev_report: Optional[Report] = None | ||||
|  | ||||
|         self.register_reserved(self.REPORT_FILE) | ||||
|  | ||||
|     @property | ||||
|     def report(self) -> Report: | ||||
|         return self._report | ||||
|  | ||||
|     @property | ||||
|     def prev_report(self) -> Optional[Report]: | ||||
|         return self._prev_report | ||||
|  | ||||
|     def prepare(self) -> None: | ||||
|         log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}") | ||||
|  | ||||
|         try: | ||||
|             self._root.mkdir(parents=True, exist_ok=True) | ||||
|         except OSError: | ||||
|             raise OutputDirError("Failed to create base directory") | ||||
|  | ||||
|     def register_reserved(self, path: PurePath) -> None: | ||||
|         self._report.mark_reserved(path) | ||||
|  | ||||
|     def resolve(self, path: PurePath) -> Path: | ||||
|         """ | ||||
|         May throw an OutputDirError. | ||||
|         """ | ||||
|  | ||||
|         if ".." in path.parts: | ||||
|             raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}") | ||||
|         if "." in path.parts: | ||||
|             raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}") | ||||
|  | ||||
|         return self._root / path | ||||
|  | ||||
|     def _should_download( | ||||
|             self, | ||||
|             local_path: Path, | ||||
|             heuristics: Heuristics, | ||||
|             redownload: Redownload, | ||||
|             on_conflict: OnConflict, | ||||
|     ) -> bool: | ||||
|         if not local_path.exists(): | ||||
|             log.explain("No corresponding file present locally") | ||||
|             return True | ||||
|  | ||||
|         if on_conflict == OnConflict.LOCAL_FIRST: | ||||
|             # Whatever is here, it will never be overwritten, so we don't need | ||||
|             # to download the file. | ||||
|             log.explain("Conflict resolution is 'local-first' and path exists") | ||||
|             return False | ||||
|  | ||||
|         if not local_path.is_file(): | ||||
|             # We know that there is *something* here that's not a file. | ||||
|             log.explain("Non-file (probably a directory) present locally") | ||||
|  | ||||
|             # If on_conflict is LOCAL_FIRST or NO_DELETE, we know that it would | ||||
|             # never be overwritten. It also doesn't have any relevant stats to | ||||
|             # update. This means that we don't have to download the file | ||||
|             # because we'd just always throw it away again. | ||||
|             if on_conflict in {OnConflict.LOCAL_FIRST, OnConflict.NO_DELETE}: | ||||
|                 log.explain(f"Conflict resolution is {on_conflict.value!r}") | ||||
|                 return False | ||||
|  | ||||
|             return True | ||||
|  | ||||
|         log.explain(f"Redownload policy is {redownload.value}") | ||||
|  | ||||
|         if redownload == Redownload.NEVER: | ||||
|             return False | ||||
|         elif redownload == Redownload.ALWAYS: | ||||
|             return True | ||||
|  | ||||
|         stat = local_path.stat() | ||||
|  | ||||
|         remote_newer = None | ||||
|  | ||||
|         # Python on Windows crashes when faced with timestamps around the unix epoch | ||||
|         if heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): | ||||
|             mtime = heuristics.mtime | ||||
|             remote_newer = mtime.timestamp() > stat.st_mtime | ||||
|             if remote_newer: | ||||
|                 log.explain("Remote file seems to be newer") | ||||
|             else: | ||||
|                 log.explain("Remote file doesn't seem to be newer") | ||||
|  | ||||
|         if redownload == Redownload.NEVER_SMART: | ||||
|             if remote_newer is None: | ||||
|                 return False | ||||
|             else: | ||||
|                 return remote_newer | ||||
|         elif redownload == Redownload.ALWAYS_SMART: | ||||
|             if remote_newer is None: | ||||
|                 return True | ||||
|             else: | ||||
|                 return remote_newer | ||||
|  | ||||
|         # This should never be reached | ||||
|         raise ValueError(f"{redownload!r} is not a valid redownload policy") | ||||
|  | ||||
|     # The following conflict resolution functions all return False if the local | ||||
|     # file(s) should be kept and True if they should be replaced by the remote | ||||
|     # files. | ||||
|  | ||||
|     async def _conflict_lfrf( | ||||
|             self, | ||||
|             on_conflict: OnConflict, | ||||
|             path: PurePath, | ||||
|     ) -> bool: | ||||
|         if on_conflict == OnConflict.PROMPT: | ||||
|             async with log.exclusive_output(): | ||||
|                 prompt = f"Replace {fmt_path(path)} with remote file?" | ||||
|                 return await prompt_yes_no(prompt, default=False) | ||||
|         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||
|             return False | ||||
|         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||
|             return True | ||||
|         elif on_conflict == OnConflict.NO_DELETE: | ||||
|             return True | ||||
|  | ||||
|         # This should never be reached | ||||
|         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||
|  | ||||
|     async def _conflict_ldrf( | ||||
|             self, | ||||
|             on_conflict: OnConflict, | ||||
|             path: PurePath, | ||||
|     ) -> bool: | ||||
|         if on_conflict == OnConflict.PROMPT: | ||||
|             async with log.exclusive_output(): | ||||
|                 prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" | ||||
|                 return await prompt_yes_no(prompt, default=False) | ||||
|         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||
|             return False | ||||
|         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||
|             return True | ||||
|         elif on_conflict == OnConflict.NO_DELETE: | ||||
|             return False | ||||
|  | ||||
|         # This should never be reached | ||||
|         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||
|  | ||||
|     async def _conflict_lfrd( | ||||
|             self, | ||||
|             on_conflict: OnConflict, | ||||
|             path: PurePath, | ||||
|             parent: PurePath, | ||||
|     ) -> bool: | ||||
|         if on_conflict == OnConflict.PROMPT: | ||||
|             async with log.exclusive_output(): | ||||
|                 prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" | ||||
|                 return await prompt_yes_no(prompt, default=False) | ||||
|         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||
|             return False | ||||
|         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||
|             return True | ||||
|         elif on_conflict == OnConflict.NO_DELETE: | ||||
|             return False | ||||
|  | ||||
|         # This should never be reached | ||||
|         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||
|  | ||||
|     async def _conflict_delete_lf( | ||||
|             self, | ||||
|             on_conflict: OnConflict, | ||||
|             path: PurePath, | ||||
|     ) -> bool: | ||||
|         if on_conflict == OnConflict.PROMPT: | ||||
|             async with log.exclusive_output(): | ||||
|                 prompt = f"Delete {fmt_path(path)}?" | ||||
|                 return await prompt_yes_no(prompt, default=False) | ||||
|         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||
|             return False | ||||
|         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||
|             return True | ||||
|         elif on_conflict == OnConflict.NO_DELETE: | ||||
|             return False | ||||
|  | ||||
|         # This should never be reached | ||||
|         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||
|  | ||||
|     def _tmp_path(self, base: Path, suffix_length: int) -> Path: | ||||
|         prefix = "" if base.name.startswith(".") else "." | ||||
|         suffix = "".join(random.choices(SUFFIX_CHARS, k=suffix_length)) | ||||
|         name = f"{prefix}{base.name}.tmp.{suffix}" | ||||
|         return base.parent / name | ||||
|  | ||||
|     async def _create_tmp_file( | ||||
|             self, | ||||
|             local_path: Path, | ||||
|     ) -> Tuple[Path, BinaryIO]: | ||||
|         """ | ||||
|         May raise an OutputDirError. | ||||
|         """ | ||||
|  | ||||
|         # Create tmp file | ||||
|         for attempt in range(TRIES): | ||||
|             suffix_length = SUFFIX_LENGTH + 2 * attempt | ||||
|             tmp_path = self._tmp_path(local_path, suffix_length) | ||||
|             try: | ||||
|                 return tmp_path, open(tmp_path, "xb") | ||||
|             except FileExistsError: | ||||
|                 pass  # Try again | ||||
|  | ||||
|         raise OutputDirError("Failed to create temporary file") | ||||
|  | ||||
|     async def download( | ||||
|             self, | ||||
|             remote_path: PurePath, | ||||
|             path: PurePath, | ||||
|             mtime: Optional[datetime] = None, | ||||
|             redownload: Optional[Redownload] = None, | ||||
|             on_conflict: Optional[OnConflict] = None, | ||||
|     ) -> Optional[FileSinkToken]: | ||||
|         """ | ||||
|         May throw an OutputDirError, a MarkDuplicateError or a | ||||
|         MarkConflictError. | ||||
|         """ | ||||
|  | ||||
|         heuristics = Heuristics(mtime) | ||||
|         redownload = self._redownload if redownload is None else redownload | ||||
|         on_conflict = self._on_conflict if on_conflict is None else on_conflict | ||||
|         local_path = self.resolve(path) | ||||
|  | ||||
|         self._report.mark(path) | ||||
|  | ||||
|         if not self._should_download(local_path, heuristics, redownload, on_conflict): | ||||
|             return None | ||||
|  | ||||
|         # Detect and solve local-dir-remote-file conflict | ||||
|         if local_path.is_dir(): | ||||
|             log.explain("Conflict: There's a directory in place of the local file") | ||||
|             if await self._conflict_ldrf(on_conflict, path): | ||||
|                 log.explain("Result: Delete the obstructing directory") | ||||
|                 shutil.rmtree(local_path) | ||||
|             else: | ||||
|                 log.explain("Result: Keep the obstructing directory") | ||||
|                 return None | ||||
|  | ||||
|         # Detect and solve local-file-remote-dir conflict | ||||
|         for parent in path.parents: | ||||
|             local_parent = self.resolve(parent) | ||||
|             if local_parent.exists() and not local_parent.is_dir(): | ||||
|                 log.explain("Conflict: One of the local file's parents is a file") | ||||
|                 if await self._conflict_lfrd(on_conflict, path, parent): | ||||
|                     log.explain("Result: Delete the obstructing file") | ||||
|                     local_parent.unlink() | ||||
|                     break | ||||
|                 else: | ||||
|                     log.explain("Result: Keep the obstructing file") | ||||
|                     return None | ||||
|  | ||||
|         # Ensure parent directory exists | ||||
|         local_path.parent.mkdir(parents=True, exist_ok=True) | ||||
|  | ||||
|         return FileSinkToken(self, remote_path, path, local_path, heuristics, on_conflict) | ||||
|  | ||||
|     def _update_metadata(self, info: DownloadInfo) -> None: | ||||
|         if mtime := info.heuristics.mtime: | ||||
|             mtimestamp = mtime.timestamp() | ||||
|             os.utime(info.local_path, times=(mtimestamp, mtimestamp)) | ||||
|  | ||||
|     @contextmanager | ||||
|     def _ensure_deleted(self, path: Path) -> Iterator[None]: | ||||
|         """ | ||||
|         Context manager that deletes the file at path once the with-block is | ||||
|         left, regardless of whether it finished normally or raised. | ||||
|         """ | ||||
|  | ||||
|         try: | ||||
|             yield | ||||
|         finally: | ||||
|             # missing_ok: the file may already have been moved away or removed | ||||
|             path.unlink(missing_ok=True) | ||||
|  | ||||
|     async def _after_download(self, info: DownloadInfo) -> None: | ||||
|         with self._ensure_deleted(info.tmp_path): | ||||
|             log.status("[bold cyan]", "Downloaded", fmt_path(info.remote_path)) | ||||
|             log.explain_topic(f"Processing downloaded file for {fmt_path(info.path)}") | ||||
|  | ||||
|             changed = False | ||||
|  | ||||
|             if not info.success: | ||||
|                 log.explain("Download unsuccessful, aborting") | ||||
|                 return | ||||
|  | ||||
|             # Solve conflicts arising from existing local file | ||||
|             if info.local_path.exists(): | ||||
|                 changed = True | ||||
|  | ||||
|                 if filecmp.cmp(info.local_path, info.tmp_path): | ||||
|                     log.explain("Contents identical with existing file") | ||||
|                     log.explain("Updating metadata of existing file") | ||||
|                     self._update_metadata(info) | ||||
|                     return | ||||
|  | ||||
|                 log.explain("Conflict: The local and remote versions differ") | ||||
|                 if await self._conflict_lfrf(info.on_conflict, info.path): | ||||
|                     log.explain("Result: Replacing local with remote version") | ||||
|                 else: | ||||
|                     log.explain("Result: Keeping local version") | ||||
|                     return | ||||
|  | ||||
|             info.tmp_path.replace(info.local_path) | ||||
|             log.explain("Updating file metadata") | ||||
|             self._update_metadata(info) | ||||
|  | ||||
|             if changed: | ||||
|                 log.status("[bold bright_yellow]", "Changed", fmt_path(info.path)) | ||||
|                 self._report.change_file(info.path) | ||||
|             else: | ||||
|                 log.status("[bold bright_green]", "Added", fmt_path(info.path)) | ||||
|                 self._report.add_file(info.path) | ||||
|  | ||||
|     async def cleanup(self) -> None: | ||||
|         await self._cleanup_dir(self._root, PurePath(), delete_self=False) | ||||
|  | ||||
|     async def _cleanup(self, path: Path, pure: PurePath) -> None: | ||||
|         if path.is_dir(): | ||||
|             await self._cleanup_dir(path, pure) | ||||
|         elif path.is_file(): | ||||
|             await self._cleanup_file(path, pure) | ||||
|  | ||||
|     async def _cleanup_dir(self, path: Path, pure: PurePath, delete_self: bool = True) -> None: | ||||
|         for child in sorted(path.iterdir()): | ||||
|             pure_child = pure / child.name | ||||
|             await self._cleanup(child, pure_child) | ||||
|  | ||||
|         if delete_self: | ||||
|             try: | ||||
|                 path.rmdir() | ||||
|             except OSError: | ||||
|                 pass | ||||
|  | ||||
|     async def _cleanup_file(self, path: Path, pure: PurePath) -> None: | ||||
|         if self._report.is_marked(pure): | ||||
|             return | ||||
|  | ||||
|         if await self._conflict_delete_lf(self._on_conflict, pure): | ||||
|             try: | ||||
|                 path.unlink() | ||||
|                 log.status("[bold bright_magenta]", "Deleted", fmt_path(pure)) | ||||
|                 self._report.delete_file(pure) | ||||
|             except OSError: | ||||
|                 pass | ||||
|         else: | ||||
|             log.status("[bold bright_magenta]", "Not deleted", fmt_path(pure)) | ||||
|             self._report.not_delete_file(pure) | ||||
|  | ||||
|     def load_prev_report(self) -> None: | ||||
|         log.explain_topic(f"Loading previous report from {fmt_real_path(self._report_path)}") | ||||
|         try: | ||||
|             self._prev_report = Report.load(self._report_path) | ||||
|             log.explain("Loaded report successfully") | ||||
|         except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e: | ||||
|             log.explain("Failed to load report") | ||||
|             log.explain(str(e)) | ||||
|  | ||||
|     def store_report(self) -> None: | ||||
|         log.explain_topic(f"Storing report to {fmt_real_path(self._report_path)}") | ||||
|         try: | ||||
|             self._report.store(self._report_path) | ||||
|             log.explain("Stored report successfully") | ||||
|         except OSError as e: | ||||
|             log.warn(f"Failed to save report to {fmt_real_path(self._report_path)}") | ||||
|             log.warn_contd(str(e)) | ||||
							
								
								
									
										543
									
								
								PFERD/pferd.py
									
									
									
									
									
								
							
							
						
						
									
										543
									
								
								PFERD/pferd.py
									
									
									
									
									
								
							| @@ -1,413 +1,194 @@ | ||||
| """ | ||||
| Convenience functions for using PFERD. | ||||
| """ | ||||
|  | ||||
| import logging | ||||
| from pathlib import Path | ||||
| from typing import Callable, List, Optional, Union | ||||
| from typing import Dict, List, Optional | ||||
|  | ||||
| from .cookie_jar import CookieJar | ||||
| from .diva import (DivaDownloader, DivaDownloadStrategy, DivaPlaylistCrawler, | ||||
|                    diva_download_new) | ||||
| from .download_summary import DownloadSummary | ||||
| from .errors import FatalException, swallow_and_print_errors | ||||
| from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter, | ||||
|                     IliasDownloader, IliasDownloadInfo, IliasDownloadStrategy, | ||||
|                     KitShibbolethAuthenticator, download_modified_or_new) | ||||
| from .ipd import (IpdCrawler, IpdDownloader, IpdDownloadInfo, | ||||
|                   IpdDownloadStrategy, ipd_download_new_or_modified) | ||||
| from .location import Location | ||||
| from .logging import PrettyLogger, enable_logging | ||||
| from .organizer import Organizer | ||||
| from .tmp_dir import TmpDir | ||||
| from .transform import TF, Transform, apply_transform | ||||
| from .utils import PathLike, to_path | ||||
| from rich.markup import escape | ||||
|  | ||||
| # TODO save known-good cookies as soon as possible | ||||
| from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection | ||||
| from .config import Config, ConfigOptionError | ||||
| from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler | ||||
| from .logging import log | ||||
| from .utils import fmt_path | ||||
|  | ||||
|  | ||||
| LOGGER = logging.getLogger(__name__) | ||||
| PRETTY = PrettyLogger(LOGGER) | ||||
| class PferdLoadError(Exception): | ||||
|     """ | ||||
|     Raised when the crawlers selected on the command line are invalid, for | ||||
|     example when a crawler is selected twice or does not exist in the config. | ||||
|     """ | ||||
|  | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class Pferd(Location): | ||||
|     # pylint: disable=too-many-arguments | ||||
|     """ | ||||
|     The main entrypoint in your Pferd usage: This class combines a number of | ||||
|     useful shortcuts for running synchronizers in a single interface. | ||||
|     """ | ||||
| class Pferd: | ||||
|     def __init__(self, config: Config, cli_crawlers: Optional[List[str]], cli_skips: Optional[List[str]]): | ||||
|         """ | ||||
|         May throw PferdLoadError. | ||||
|         """ | ||||
|  | ||||
|     def __init__( | ||||
|         self._config = config | ||||
|         self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers, cli_skips) | ||||
|  | ||||
|         self._authenticators: Dict[str, Authenticator] = {} | ||||
|         self._crawlers: Dict[str, Crawler] = {} | ||||
|  | ||||
|     def _find_config_crawlers(self, config: Config) -> List[str]: | ||||
|         crawl_sections = [] | ||||
|  | ||||
|         for name, section in config.crawl_sections(): | ||||
|             if CrawlerSection(section).skip(): | ||||
|                 log.explain(f"Skipping {name!r}") | ||||
|             else: | ||||
|                 crawl_sections.append(name) | ||||
|  | ||||
|         return crawl_sections | ||||
|  | ||||
|     def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]: | ||||
|         if len(cli_crawlers) != len(set(cli_crawlers)): | ||||
|             raise PferdLoadError("Some crawlers were selected multiple times") | ||||
|  | ||||
|         crawl_sections = [name for name, _ in config.crawl_sections()] | ||||
|  | ||||
|         crawlers_to_run = []  # With crawl: prefix | ||||
|         unknown_names = []  # Without crawl: prefix | ||||
|  | ||||
|         for name in cli_crawlers: | ||||
|             section_name = f"crawl:{name}" | ||||
|             if section_name in crawl_sections: | ||||
|                 log.explain(f"Crawler section named {section_name!r} exists") | ||||
|                 crawlers_to_run.append(section_name) | ||||
|             else: | ||||
|                 log.explain(f"There's no crawler section named {section_name!r}") | ||||
|                 unknown_names.append(name) | ||||
|  | ||||
|         if unknown_names: | ||||
|             if len(unknown_names) == 1: | ||||
|                 [name] = unknown_names | ||||
|                 raise PferdLoadError(f"There is no crawler named {name!r}") | ||||
|             else: | ||||
|                 names_str = ", ".join(repr(name) for name in unknown_names) | ||||
|                 raise PferdLoadError(f"There are no crawlers named {names_str}") | ||||
|  | ||||
|         return crawlers_to_run | ||||
|  | ||||
|     def _find_crawlers_to_run( | ||||
|             self, | ||||
|             base_dir: Path, | ||||
|             tmp_dir: Path = Path(".tmp"), | ||||
|             test_run: bool = False | ||||
|     ): | ||||
|         super().__init__(Path(base_dir)) | ||||
|             config: Config, | ||||
|             cli_crawlers: Optional[List[str]], | ||||
|             cli_skips: Optional[List[str]], | ||||
|     ) -> List[str]: | ||||
|         log.explain_topic("Deciding which crawlers to run") | ||||
|  | ||||
|         self._download_summary = DownloadSummary() | ||||
|         self._tmp_dir = TmpDir(self.resolve(tmp_dir)) | ||||
|         self._test_run = test_run | ||||
|  | ||||
|     @staticmethod | ||||
|     def enable_logging() -> None: | ||||
|         """ | ||||
|         Enable and configure logging via the logging module. | ||||
|         """ | ||||
|  | ||||
|         enable_logging() | ||||
|  | ||||
|     @staticmethod | ||||
|     def _print_transformables(transformables: List[TF]) -> None: | ||||
|         LOGGER.info("") | ||||
|         LOGGER.info("Results of the test run:") | ||||
|         for transformable in transformables: | ||||
|             LOGGER.info(transformable.path) | ||||
|  | ||||
|     def _ilias( | ||||
|             self, | ||||
|             target: PathLike, | ||||
|             base_url: str, | ||||
|             crawl_function: Callable[[IliasCrawler], List[IliasDownloadInfo]], | ||||
|             authenticator: IliasAuthenticator, | ||||
|             cookies: Optional[PathLike], | ||||
|             dir_filter: IliasDirectoryFilter, | ||||
|             transform: Transform, | ||||
|             download_strategy: IliasDownloadStrategy, | ||||
|             timeout: int, | ||||
|             clean: bool = True, | ||||
|     ) -> Organizer: | ||||
|         # pylint: disable=too-many-locals | ||||
|         cookie_jar = CookieJar(to_path(cookies) if cookies else None) | ||||
|         session = cookie_jar.create_session() | ||||
|         tmp_dir = self._tmp_dir.new_subdir() | ||||
|         organizer = Organizer(self.resolve(to_path(target))) | ||||
|  | ||||
|         crawler = IliasCrawler(base_url, session, authenticator, dir_filter) | ||||
|         downloader = IliasDownloader(tmp_dir, organizer, session, | ||||
|                                      authenticator, download_strategy, timeout) | ||||
|  | ||||
|         cookie_jar.load_cookies() | ||||
|         info = crawl_function(crawler) | ||||
|         cookie_jar.save_cookies() | ||||
|  | ||||
|         transformed = apply_transform(transform, info) | ||||
|         if self._test_run: | ||||
|             self._print_transformables(transformed) | ||||
|             return organizer | ||||
|  | ||||
|         downloader.download_all(transformed) | ||||
|         cookie_jar.save_cookies() | ||||
|  | ||||
|         if clean: | ||||
|             organizer.cleanup() | ||||
|  | ||||
|         return organizer | ||||
|  | ||||
|     @swallow_and_print_errors | ||||
|     def ilias_kit( | ||||
|             self, | ||||
|             target: PathLike, | ||||
|             course_id: str, | ||||
|             dir_filter: IliasDirectoryFilter = lambda x, y: True, | ||||
|             transform: Transform = lambda x: x, | ||||
|             cookies: Optional[PathLike] = None, | ||||
|             username: Optional[str] = None, | ||||
|             password: Optional[str] = None, | ||||
|             download_strategy: IliasDownloadStrategy = download_modified_or_new, | ||||
|             clean: bool = True, | ||||
|             timeout: int = 5, | ||||
|     ) -> Organizer: | ||||
|         """ | ||||
|         Synchronizes a folder with the ILIAS instance of the KIT. | ||||
|  | ||||
|         Arguments: | ||||
|             target {Path} -- the target path to write the data to | ||||
|             course_id {str} -- the id of the main course page (found in the URL after ref_id | ||||
|                 when opening the course homepage) | ||||
|  | ||||
|         Keyword Arguments: | ||||
|             dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the | ||||
|                 crawler level, these directories and all of their content is skipped. | ||||
|                 (default: {lambdax:True}) | ||||
|             transform {Transform} -- A transformation function for the output paths. Return None | ||||
|                 to ignore a file. (default: {lambdax:x}) | ||||
|             cookies {Optional[Path]} -- The path to store and load cookies from. | ||||
|                 (default: {None}) | ||||
|             username {Optional[str]} -- The SCC username. If none is given, it will prompt | ||||
|                 the user. (default: {None}) | ||||
|             password {Optional[str]} -- The SCC password. If none is given, it will prompt | ||||
|                 the user. (default: {None}) | ||||
|             download_strategy {DownloadStrategy} -- A function to determine which files need to | ||||
|                 be downloaded. Can save bandwidth and reduce the number of requests. | ||||
|                 (default: {download_modified_or_new}) | ||||
|             clean {bool} -- Whether to clean up when the method finishes. | ||||
|             timeout {int} -- The download timeout for opencast videos. Sadly needed due to a | ||||
|                 requests bug. | ||||
|         """ | ||||
|         # This authenticator only works with the KIT ilias instance. | ||||
|         authenticator = KitShibbolethAuthenticator(username=username, password=password) | ||||
|         PRETTY.starting_synchronizer(target, "ILIAS", course_id) | ||||
|  | ||||
|         organizer = self._ilias( | ||||
|             target=target, | ||||
|             base_url="https://ilias.studium.kit.edu/", | ||||
|             crawl_function=lambda crawler: crawler.crawl_course(course_id), | ||||
|             authenticator=authenticator, | ||||
|             cookies=cookies, | ||||
|             dir_filter=dir_filter, | ||||
|             transform=transform, | ||||
|             download_strategy=download_strategy, | ||||
|             clean=clean, | ||||
|             timeout=timeout | ||||
|         ) | ||||
|  | ||||
|         self._download_summary.merge(organizer.download_summary) | ||||
|  | ||||
|         return organizer | ||||
|  | ||||
|     def print_summary(self) -> None: | ||||
|         """ | ||||
|         Prints the accumulated download summary. | ||||
|         """ | ||||
|         PRETTY.summary(self._download_summary) | ||||
|  | ||||
|     @swallow_and_print_errors | ||||
|     def ilias_kit_personal_desktop( | ||||
|             self, | ||||
|             target: PathLike, | ||||
|             dir_filter: IliasDirectoryFilter = lambda x, y: True, | ||||
|             transform: Transform = lambda x: x, | ||||
|             cookies: Optional[PathLike] = None, | ||||
|             username: Optional[str] = None, | ||||
|             password: Optional[str] = None, | ||||
|             download_strategy: IliasDownloadStrategy = download_modified_or_new, | ||||
|             clean: bool = True, | ||||
|             timeout: int = 5, | ||||
|     ) -> Organizer: | ||||
|         """ | ||||
|         Synchronizes a folder with the ILIAS instance of the KIT. This method will crawl the ILIAS | ||||
|         "personal desktop" instead of a single course. | ||||
|  | ||||
|         Arguments: | ||||
|             target {Path} -- the target path to write the data to | ||||
|  | ||||
|         Keyword Arguments: | ||||
|             dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the | ||||
|                 crawler level, these directories and all of their content is skipped. | ||||
|                 (default: {lambdax:True}) | ||||
|             transform {Transform} -- A transformation function for the output paths. Return None | ||||
|                 to ignore a file. (default: {lambdax:x}) | ||||
|             cookies {Optional[Path]} -- The path to store and load cookies from. | ||||
|                 (default: {None}) | ||||
|             username {Optional[str]} -- The SCC username. If none is given, it will prompt | ||||
|                 the user. (default: {None}) | ||||
|             password {Optional[str]} -- The SCC password. If none is given, it will prompt | ||||
|                 the user. (default: {None}) | ||||
|             download_strategy {DownloadStrategy} -- A function to determine which files need to | ||||
|                 be downloaded. Can save bandwidth and reduce the number of requests. | ||||
|                 (default: {download_modified_or_new}) | ||||
|             clean {bool} -- Whether to clean up when the method finishes. | ||||
|             timeout {int} -- The download timeout for opencast videos. Sadly needed due to a | ||||
|                 requests bug. | ||||
|         """ | ||||
|         # This authenticator only works with the KIT ilias instance. | ||||
|         authenticator = KitShibbolethAuthenticator(username=username, password=password) | ||||
|         PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop") | ||||
|  | ||||
|         organizer = self._ilias( | ||||
|             target=target, | ||||
|             base_url="https://ilias.studium.kit.edu/", | ||||
|             crawl_function=lambda crawler: crawler.crawl_personal_desktop(), | ||||
|             authenticator=authenticator, | ||||
|             cookies=cookies, | ||||
|             dir_filter=dir_filter, | ||||
|             transform=transform, | ||||
|             download_strategy=download_strategy, | ||||
|             clean=clean, | ||||
|             timeout=timeout | ||||
|         ) | ||||
|  | ||||
|         self._download_summary.merge(organizer.download_summary) | ||||
|  | ||||
|         return organizer | ||||
|  | ||||
|     @swallow_and_print_errors | ||||
|     def ilias_kit_folder( | ||||
|             self, | ||||
|             target: PathLike, | ||||
|             full_url: str, | ||||
|             dir_filter: IliasDirectoryFilter = lambda x, y: True, | ||||
|             transform: Transform = lambda x: x, | ||||
|             cookies: Optional[PathLike] = None, | ||||
|             username: Optional[str] = None, | ||||
|             password: Optional[str] = None, | ||||
|             download_strategy: IliasDownloadStrategy = download_modified_or_new, | ||||
|             clean: bool = True, | ||||
|             timeout: int = 5, | ||||
|     ) -> Organizer: | ||||
|         """ | ||||
|         Synchronizes a folder with a given folder on the ILIAS instance of the KIT. | ||||
|  | ||||
|         Arguments: | ||||
|             target {Path}  -- the target path to write the data to | ||||
|             full_url {str} -- the full url of the folder/videos/course to crawl | ||||
|  | ||||
|         Keyword Arguments: | ||||
|             dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the | ||||
|                 crawler level, these directories and all of their content is skipped. | ||||
|                 (default: {lambdax:True}) | ||||
|             transform {Transform} -- A transformation function for the output paths. Return None | ||||
|                 to ignore a file. (default: {lambdax:x}) | ||||
|             cookies {Optional[Path]} -- The path to store and load cookies from. | ||||
|                 (default: {None}) | ||||
|             username {Optional[str]} -- The SCC username. If none is given, it will prompt | ||||
|                 the user. (default: {None}) | ||||
|             password {Optional[str]} -- The SCC password. If none is given, it will prompt | ||||
|                 the user. (default: {None}) | ||||
|             download_strategy {DownloadStrategy} -- A function to determine which files need to | ||||
|                 be downloaded. Can save bandwidth and reduce the number of requests. | ||||
|                 (default: {download_modified_or_new}) | ||||
|             clean {bool} -- Whether to clean up when the method finishes. | ||||
|             timeout {int} -- The download timeout for opencast videos. Sadly needed due to a | ||||
|                 requests bug. | ||||
|         """ | ||||
|         # This authenticator only works with the KIT ilias instance. | ||||
|         authenticator = KitShibbolethAuthenticator(username=username, password=password) | ||||
|         PRETTY.starting_synchronizer(target, "ILIAS", "An ILIAS element by url") | ||||
|  | ||||
|         if not full_url.startswith("https://ilias.studium.kit.edu"): | ||||
|             raise FatalException("Not a valid KIT ILIAS URL") | ||||
|  | ||||
|         organizer = self._ilias( | ||||
|             target=target, | ||||
|             base_url="https://ilias.studium.kit.edu/", | ||||
|             crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url), | ||||
|             authenticator=authenticator, | ||||
|             cookies=cookies, | ||||
|             dir_filter=dir_filter, | ||||
|             transform=transform, | ||||
|             download_strategy=download_strategy, | ||||
|             clean=clean, | ||||
|             timeout=timeout | ||||
|         ) | ||||
|  | ||||
|         self._download_summary.merge(organizer.download_summary) | ||||
|  | ||||
|         return organizer | ||||
|  | ||||
|     @swallow_and_print_errors | ||||
|     def ipd_kit( | ||||
|             self, | ||||
|             target: Union[PathLike, Organizer], | ||||
|             url: str, | ||||
|             transform: Transform = lambda x: x, | ||||
|             download_strategy: IpdDownloadStrategy = ipd_download_new_or_modified, | ||||
|             clean: bool = True | ||||
|     ) -> Organizer: | ||||
|         """ | ||||
|         Synchronizes a folder with a DIVA playlist. | ||||
|  | ||||
|         Arguments: | ||||
|             target {Union[PathLike, Organizer]} -- The organizer / target folder to use. | ||||
|             url {str} -- the url to the page | ||||
|  | ||||
|         Keyword Arguments: | ||||
|             transform {Transform} -- A transformation function for the output paths. Return None | ||||
|                 to ignore a file. (default: {lambdax:x}) | ||||
|             download_strategy {DivaDownloadStrategy} -- A function to determine which files need to | ||||
|                 be downloaded. Can save bandwidth and reduce the number of requests. | ||||
|                 (default: {diva_download_new}) | ||||
|             clean {bool} -- Whether to clean up when the method finishes. | ||||
|         """ | ||||
|         tmp_dir = self._tmp_dir.new_subdir() | ||||
|  | ||||
|         if target is None: | ||||
|             PRETTY.starting_synchronizer("None", "IPD", url) | ||||
|             raise FatalException("Got 'None' as target directory, aborting") | ||||
|  | ||||
|         if isinstance(target, Organizer): | ||||
|             organizer = target | ||||
|         crawlers: List[str] | ||||
|         if cli_crawlers is None: | ||||
|             log.explain("No crawlers specified on CLI") | ||||
|             log.explain("Running crawlers specified in config") | ||||
|             crawlers = self._find_config_crawlers(config) | ||||
|         else: | ||||
|             organizer = Organizer(self.resolve(to_path(target))) | ||||
|             log.explain("Crawlers specified on CLI") | ||||
|             crawlers = self._find_cli_crawlers(config, cli_crawlers) | ||||
|  | ||||
|         PRETTY.starting_synchronizer(organizer.path, "IPD", url) | ||||
|         skips = {f"crawl:{name}" for name in cli_skips} if cli_skips else set() | ||||
|         for crawler in crawlers: | ||||
|             if crawler in skips: | ||||
|                 log.explain(f"Skipping crawler {crawler!r}") | ||||
|         crawlers = [crawler for crawler in crawlers if crawler not in skips] | ||||
|  | ||||
|         elements: List[IpdDownloadInfo] = IpdCrawler(url).crawl() | ||||
|         transformed = apply_transform(transform, elements) | ||||
|         return crawlers | ||||
|  | ||||
|         if self._test_run: | ||||
|             self._print_transformables(transformed) | ||||
|             return organizer | ||||
|     def _load_authenticators(self) -> None: | ||||
|         for name, section in self._config.auth_sections(): | ||||
|             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") | ||||
|  | ||||
|         downloader = IpdDownloader(tmp_dir=tmp_dir, organizer=organizer, strategy=download_strategy) | ||||
|         downloader.download_all(transformed) | ||||
|             auth_type = AuthSection(section).type() | ||||
|             authenticator_constructor = AUTHENTICATORS.get(auth_type) | ||||
|             if authenticator_constructor is None: | ||||
|                 raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}") | ||||
|  | ||||
|         if clean: | ||||
|             organizer.cleanup() | ||||
|             authenticator = authenticator_constructor(name, section, self._config) | ||||
|             self._authenticators[name] = authenticator | ||||
|  | ||||
|         self._download_summary.merge(organizer.download_summary) | ||||
|     def _load_crawlers(self) -> None: | ||||
|         # Cookie sharing | ||||
|         kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {} | ||||
|  | ||||
|         return organizer | ||||
|         for name, section in self._config.crawl_sections(): | ||||
|             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") | ||||
|  | ||||
|     @swallow_and_print_errors | ||||
|     def diva_kit( | ||||
|             self, | ||||
|             target: Union[PathLike, Organizer], | ||||
|             playlist_location: str, | ||||
|             transform: Transform = lambda x: x, | ||||
|             download_strategy: DivaDownloadStrategy = diva_download_new, | ||||
|             clean: bool = True | ||||
|     ) -> Organizer: | ||||
|             crawl_type = CrawlerSection(section).type() | ||||
|             crawler_constructor = CRAWLERS.get(crawl_type) | ||||
|             if crawler_constructor is None: | ||||
|                 raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}") | ||||
|  | ||||
|             crawler = crawler_constructor(name, section, self._config, self._authenticators) | ||||
|             self._crawlers[name] = crawler | ||||
|  | ||||
|             if self._config.default_section.share_cookies(): | ||||
|                 if isinstance(crawler, KitIliasWebCrawler): | ||||
|                     crawler.share_cookies(kit_ilias_web_paths) | ||||
|  | ||||
|     def debug_transforms(self) -> None: | ||||
|         for name in self._crawlers_to_run: | ||||
|             crawler = self._crawlers[name] | ||||
|             log.print("") | ||||
|             log.print(f"[bold bright_cyan]Debugging transforms[/] for {escape(name)}") | ||||
|             crawler.debug_transforms() | ||||
|  | ||||
|     async def run(self, debug_transforms: bool) -> None: | ||||
|         """ | ||||
|         Synchronizes a folder with a DIVA playlist. | ||||
|  | ||||
|         Arguments: | ||||
|             organizer {Organizer} -- The organizer to use. | ||||
|             playlist_location {str} -- the playlist id or the playlist URL | ||||
|               in the format 'https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271' | ||||
|  | ||||
|         Keyword Arguments: | ||||
|             transform {Transform} -- A transformation function for the output paths. Return None | ||||
|                 to ignore a file. (default: {lambdax:x}) | ||||
|             download_strategy {DivaDownloadStrategy} -- A function to determine which files need to | ||||
|                 be downloaded. Can save bandwidth and reduce the number of requests. | ||||
|                 (default: {diva_download_new}) | ||||
|             clean {bool} -- Whether to clean up when the method finishes. | ||||
|         May throw ConfigOptionError. | ||||
|         """ | ||||
|         tmp_dir = self._tmp_dir.new_subdir() | ||||
|  | ||||
|         if playlist_location.startswith("http"): | ||||
|             playlist_id = DivaPlaylistCrawler.fetch_id(playlist_link=playlist_location) | ||||
|         else: | ||||
|             playlist_id = playlist_location | ||||
|         # These two functions must run inside the same event loop as the | ||||
|         # crawlers, so that any new objects (like Conditions or Futures) can | ||||
|         # obtain the correct event loop. | ||||
|         self._load_authenticators() | ||||
|         self._load_crawlers() | ||||
|  | ||||
|         if target is None: | ||||
|             PRETTY.starting_synchronizer("None", "DIVA", playlist_id) | ||||
|             raise FatalException("Got 'None' as target directory, aborting") | ||||
|         if debug_transforms: | ||||
|             log.output_explain = True | ||||
|             log.output_report = False | ||||
|             self.debug_transforms() | ||||
|             return | ||||
|  | ||||
|         if isinstance(target, Organizer): | ||||
|             organizer = target | ||||
|         else: | ||||
|             organizer = Organizer(self.resolve(to_path(target))) | ||||
|         log.print("") | ||||
|  | ||||
|         PRETTY.starting_synchronizer(organizer.path, "DIVA", playlist_id) | ||||
|         for name in self._crawlers_to_run: | ||||
|             crawler = self._crawlers[name] | ||||
|  | ||||
|         crawler = DivaPlaylistCrawler(playlist_id) | ||||
|         downloader = DivaDownloader(tmp_dir, organizer, download_strategy) | ||||
|             log.print(f"[bold bright_cyan]Running[/] {escape(name)}") | ||||
|  | ||||
|         info = crawler.crawl() | ||||
|             try: | ||||
|                 await crawler.run() | ||||
|             except (CrawlError, AuthError) as e: | ||||
|                 log.error(str(e)) | ||||
|             except Exception: | ||||
|                 log.unexpected_exception() | ||||
|  | ||||
|         transformed = apply_transform(transform, info) | ||||
|         if self._test_run: | ||||
|             self._print_transformables(transformed) | ||||
|             return organizer | ||||
|     def print_report(self) -> None: | ||||
|         for name in self._crawlers_to_run: | ||||
|             crawler = self._crawlers.get(name) | ||||
|             if crawler is None: | ||||
|                 continue  # Crawler failed to load | ||||
|  | ||||
|         downloader.download_all(transformed) | ||||
|             log.report("") | ||||
|             log.report(f"[bold bright_cyan]Report[/] for {escape(name)}") | ||||
|  | ||||
|         if clean: | ||||
|             organizer.cleanup() | ||||
|             something_changed = False | ||||
|             for path in sorted(crawler.report.added_files): | ||||
|                 something_changed = True | ||||
|                 log.report(f"  [bold bright_green]Added[/] {fmt_path(path)}") | ||||
|             for path in sorted(crawler.report.changed_files): | ||||
|                 something_changed = True | ||||
|                 log.report(f"  [bold bright_yellow]Changed[/] {fmt_path(path)}") | ||||
|             for path in sorted(crawler.report.deleted_files): | ||||
|                 something_changed = True | ||||
|                 log.report(f"  [bold bright_magenta]Deleted[/] {fmt_path(path)}") | ||||
|             for path in sorted(crawler.report.not_deleted_files): | ||||
|                 something_changed = True | ||||
|                 log.report(f"  [bold bright_magenta]Not deleted[/] {fmt_path(path)}") | ||||
|  | ||||
|         self._download_summary.merge(organizer.download_summary) | ||||
|             for warning in crawler.report.encountered_warnings: | ||||
|                 something_changed = True | ||||
|                 log.report(f"  [bold bright_red]Warning[/] {warning}") | ||||
|  | ||||
|         return organizer | ||||
|             for error in crawler.report.encountered_errors: | ||||
|                 something_changed = True | ||||
|                 log.report(f"  [bold bright_red]Error[/] {error}") | ||||
|  | ||||
|             if not something_changed: | ||||
|                 log.report("  Nothing changed") | ||||
|   | ||||
| @@ -1,111 +0,0 @@ | ||||
| """ | ||||
| A small progress bar implementation. | ||||
| """ | ||||
| import sys | ||||
| from dataclasses import dataclass | ||||
| from types import TracebackType | ||||
| from typing import Optional, Type | ||||
|  | ||||
| import requests | ||||
| from rich.console import Console | ||||
| from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID, | ||||
|                            TextColumn, TimeRemainingColumn, | ||||
|                            TransferSpeedColumn) | ||||
|  | ||||
| # Module-level rich Progress instance shared by every ProgressContextManager | ||||
| # in this module. Each task row shows the task's "name" field, a bar, the | ||||
| # percentage, the download size, the transfer speed and the remaining time. | ||||
| # The display is written to stdout and created with transient=True. | ||||
| _progress: Progress = Progress( | ||||
|     TextColumn("[bold blue]{task.fields[name]}", justify="right"), | ||||
|     BarColumn(bar_width=None), | ||||
|     "[progress.percentage]{task.percentage:>3.1f}%", | ||||
|     "•", | ||||
|     DownloadColumn(), | ||||
|     "•", | ||||
|     TransferSpeedColumn(), | ||||
|     "•", | ||||
|     TimeRemainingColumn(), | ||||
|     console=Console(file=sys.stdout), | ||||
|     transient=True | ||||
| ) | ||||
|  | ||||
|  | ||||
| def size_from_headers(response: requests.Response) -> Optional[int]: | ||||
|     """ | ||||
|     Return the size of the download based on the response headers. | ||||
|  | ||||
|     Arguments: | ||||
|         response {requests.Response} -- the response | ||||
|  | ||||
|     Returns: | ||||
|         Optional[int] -- the size | ||||
|     """ | ||||
|     if "Content-Length" in response.headers: | ||||
|         return int(response.headers["Content-Length"]) | ||||
|     return None | ||||
|  | ||||
|  | ||||
| @dataclass | ||||
| class ProgressSettings: | ||||
|     """ | ||||
|     Settings you can pass to customize the progress bar. | ||||
|     """ | ||||
|     # Used both as the task description and as the task's "name" field | ||||
|     name: str | ||||
|     # Passed as the task's total when the task is added | ||||
|     max_size: int | ||||
|  | ||||
|  | ||||
| def progress_for(settings: Optional[ProgressSettings]) -> 'ProgressContextManager': | ||||
|     """ | ||||
|     Returns a context manager that displays progress | ||||
|  | ||||
|     Arguments: | ||||
|         settings {Optional[ProgressSettings]} -- the settings for the progress | ||||
|             bar. They are handed to the ProgressContextManager unchanged. | ||||
|  | ||||
|     Returns: | ||||
|         ProgressContextManager -- the progress manager | ||||
|     """ | ||||
|     return ProgressContextManager(settings) | ||||
|  | ||||
|  | ||||
| class ProgressContextManager: | ||||
|     """ | ||||
|     A context manager used for displaying progress. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, settings: Optional[ProgressSettings]): | ||||
|         self._settings = settings | ||||
|         self._task_id: Optional[TaskID] = None | ||||
|  | ||||
|     def __enter__(self) -> 'ProgressContextManager': | ||||
|         """Context manager entry function.""" | ||||
|         if not self._settings: | ||||
|             return self | ||||
|  | ||||
|         _progress.start() | ||||
|         self._task_id = _progress.add_task( | ||||
|             self._settings.name, | ||||
|             total=self._settings.max_size, | ||||
|             name=self._settings.name | ||||
|         ) | ||||
|         return self | ||||
|  | ||||
|     # pylint: disable=useless-return | ||||
|     def __exit__( | ||||
|             self, | ||||
|             exc_type: Optional[Type[BaseException]], | ||||
|             exc_value: Optional[BaseException], | ||||
|             traceback: Optional[TracebackType], | ||||
|     ) -> Optional[bool]: | ||||
|         """Context manager exit function. Removes the task.""" | ||||
|         if self._task_id is None: | ||||
|             return None | ||||
|  | ||||
|         _progress.remove_task(self._task_id) | ||||
|  | ||||
|         if len(_progress.task_ids) == 0: | ||||
|             # We need to clean up after ourselves, as we were the last one | ||||
|             _progress.stop() | ||||
|             _progress.refresh() | ||||
|  | ||||
|         return None | ||||
|  | ||||
|     def advance(self, amount: float) -> None: | ||||
|         """ | ||||
|         Advances the progress bar. | ||||
|         """ | ||||
|         if self._task_id is not None: | ||||
|             _progress.advance(self._task_id, amount) | ||||
							
								
								
									
										238
									
								
								PFERD/report.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										238
									
								
								PFERD/report.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,238 @@ | ||||
| import json | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Any, Dict, List, Optional, Set | ||||
|  | ||||
|  | ||||
| class ReportLoadError(Exception): | ||||
|     """ | ||||
|     Raised while loading a report when the data has an incorrect format. | ||||
|     """ | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class MarkDuplicateError(Exception): | ||||
|     """ | ||||
|     Tried to mark a file that was already marked. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path: PurePath): | ||||
|         # path is the path that was marked twice. It appears in the message | ||||
|         # and is stored as self.path. | ||||
|         super().__init__(f"A previous file already used path {path}") | ||||
|         self.path = path | ||||
|  | ||||
|  | ||||
| class MarkConflictError(Exception): | ||||
|     """ | ||||
|     Marking the path would have caused a conflict. | ||||
|  | ||||
|     A conflict can have two reasons: Either the new file has the same path as | ||||
|     the parent directory of a known file, or a parent directory of the new file | ||||
|     has the same path as a known file. In either case, adding the new file | ||||
|     would require a file and a directory to share the same path, which is | ||||
|     usually not possible. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path: PurePath, collides_with: PurePath): | ||||
|         # path is the new path, collides_with the previously marked path it | ||||
|         # conflicts with. Both are stored on the exception. | ||||
|         super().__init__(f"File at {path} collides with previous file at {collides_with}") | ||||
|         self.path = path | ||||
|         self.collides_with = collides_with | ||||
|  | ||||
|  | ||||
| # TODO Use PurePath.is_relative_to when updating to 3.9 | ||||
| def is_relative_to(a: PurePath, b: PurePath) -> bool: | ||||
|     try: | ||||
|         a.relative_to(b) | ||||
|         return True | ||||
|     except ValueError: | ||||
|         return False | ||||
|  | ||||
|  | ||||
| class Report: | ||||
|     """ | ||||
|     A report of a synchronization. Includes all files found by the crawler, as | ||||
|     well as the set of changes made to local files. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self) -> None: | ||||
|         # Paths found by the crawler, untransformed | ||||
|         self.found_paths: Set[PurePath] = set() | ||||
|  | ||||
|         # Files reserved for metadata files (e. g. the report file or cookies) | ||||
|         # that can't be overwritten by user transforms and won't be cleaned up | ||||
|         # at the end. | ||||
|         self.reserved_files: Set[PurePath] = set() | ||||
|  | ||||
|         # Files found by the crawler, transformed. Only includes files that | ||||
|         # were downloaded (or a download was attempted) | ||||
|         self.known_files: Set[PurePath] = set() | ||||
|  | ||||
|         self.added_files: Set[PurePath] = set() | ||||
|         self.changed_files: Set[PurePath] = set() | ||||
|         self.deleted_files: Set[PurePath] = set() | ||||
|         # Files that should have been deleted by the cleanup but weren't | ||||
|         self.not_deleted_files: Set[PurePath] = set() | ||||
|  | ||||
|         # Custom crawler-specific data | ||||
|         self.custom: Dict[str, Any] = dict() | ||||
|  | ||||
|         # Encountered errors and warnings | ||||
|         self.encountered_warnings: List[str] = [] | ||||
|         self.encountered_errors: List[str] = [] | ||||
|  | ||||
|     @staticmethod | ||||
|     def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]: | ||||
|         result: Any = data.get(key, []) | ||||
|  | ||||
|         if not isinstance(result, list): | ||||
|             raise ReportLoadError(f"Incorrect format: {key!r} is not a list") | ||||
|  | ||||
|         for elem in result: | ||||
|             if not isinstance(elem, str): | ||||
|                 raise ReportLoadError(f"Incorrect format: {key!r} must contain only strings") | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     @staticmethod | ||||
|     def _get_str_dictionary(data: Dict[str, Any], key: str) -> Dict[str, Any]: | ||||
|         result: Dict[str, Any] = data.get(key, {}) | ||||
|  | ||||
|         if not isinstance(result, dict): | ||||
|             raise ReportLoadError(f"Incorrect format: {key!r} is not a dictionary") | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     @classmethod | ||||
|     def load(cls, path: Path) -> "Report": | ||||
|         """ | ||||
|         May raise OSError, UnicodeDecodeError, JsonDecodeError, ReportLoadError. | ||||
|         """ | ||||
|  | ||||
|         with open(path, encoding="utf-8") as f: | ||||
|             data = json.load(f) | ||||
|  | ||||
|         if not isinstance(data, dict): | ||||
|             raise ReportLoadError("Incorrect format: Root is not an object") | ||||
|  | ||||
|         self = cls() | ||||
|         for elem in self._get_list_of_strs(data, "found"): | ||||
|             self.found(PurePath(elem)) | ||||
|         for elem in self._get_list_of_strs(data, "reserved"): | ||||
|             self.mark_reserved(PurePath(elem)) | ||||
|         for elem in self._get_list_of_strs(data, "known"): | ||||
|             self.mark(PurePath(elem)) | ||||
|         for elem in self._get_list_of_strs(data, "added"): | ||||
|             self.add_file(PurePath(elem)) | ||||
|         for elem in self._get_list_of_strs(data, "changed"): | ||||
|             self.change_file(PurePath(elem)) | ||||
|         for elem in self._get_list_of_strs(data, "deleted"): | ||||
|             self.delete_file(PurePath(elem)) | ||||
|         for elem in self._get_list_of_strs(data, "not_deleted"): | ||||
|             self.not_delete_file(PurePath(elem)) | ||||
|         self.custom = self._get_str_dictionary(data, "custom") | ||||
|         self.encountered_errors = self._get_list_of_strs(data, "encountered_errors") | ||||
|         self.encountered_warnings = self._get_list_of_strs(data, "encountered_warnings") | ||||
|  | ||||
|         return self | ||||
|  | ||||
|     def store(self, path: Path) -> None: | ||||
|         """ | ||||
|         May raise OSError. | ||||
|         """ | ||||
|  | ||||
|         data = { | ||||
|             "found": [str(path) for path in sorted(self.found_paths)], | ||||
|             "reserved": [str(path) for path in sorted(self.reserved_files)], | ||||
|             "known": [str(path) for path in sorted(self.known_files)], | ||||
|             "added": [str(path) for path in sorted(self.added_files)], | ||||
|             "changed": [str(path) for path in sorted(self.changed_files)], | ||||
|             "deleted": [str(path) for path in sorted(self.deleted_files)], | ||||
|             "not_deleted": [str(path) for path in sorted(self.not_deleted_files)], | ||||
|             "custom": self.custom, | ||||
|             "encountered_warnings": self.encountered_warnings, | ||||
|             "encountered_errors": self.encountered_errors, | ||||
|         } | ||||
|  | ||||
|         with open(path, "w", encoding="utf-8") as f: | ||||
|             json.dump(data, f, indent=2, sort_keys=True) | ||||
|             f.write("\n")  # json.dump doesn't do this | ||||
|  | ||||
|     def found(self, path: PurePath) -> None: | ||||
|         self.found_paths.add(path) | ||||
|  | ||||
|     def mark_reserved(self, path: PurePath) -> None: | ||||
|         if path in self.marked: | ||||
|             raise RuntimeError("Trying to reserve an already reserved file") | ||||
|  | ||||
|         self.reserved_files.add(path) | ||||
|  | ||||
|     def mark(self, path: PurePath) -> None: | ||||
|         """ | ||||
|         Mark a previously unknown file as known. | ||||
|  | ||||
|         May throw a MarkDuplicateError or a MarkConflictError. For more detail, | ||||
|         see the respective exception's docstring. | ||||
|         """ | ||||
|  | ||||
|         for other in self.marked: | ||||
|             if path == other: | ||||
|                 raise MarkDuplicateError(path) | ||||
|  | ||||
|             if is_relative_to(path, other) or is_relative_to(other, path): | ||||
|                 raise MarkConflictError(path, other) | ||||
|  | ||||
|         self.known_files.add(path) | ||||
|  | ||||
|     @property | ||||
|     def marked(self) -> Set[PurePath]: | ||||
|         return self.known_files | self.reserved_files | ||||
|  | ||||
|     def is_marked(self, path: PurePath) -> bool: | ||||
|         return path in self.marked | ||||
|  | ||||
|     def add_file(self, path: PurePath) -> None: | ||||
|         """ | ||||
|         Unlike mark(), this function accepts any paths. | ||||
|         """ | ||||
|  | ||||
|         self.added_files.add(path) | ||||
|  | ||||
|     def change_file(self, path: PurePath) -> None: | ||||
|         """ | ||||
|         Unlike mark(), this function accepts any paths. | ||||
|         """ | ||||
|  | ||||
|         self.changed_files.add(path) | ||||
|  | ||||
|     def delete_file(self, path: PurePath) -> None: | ||||
|         """ | ||||
|         Unlike mark(), this function accepts any paths. | ||||
|         """ | ||||
|  | ||||
|         self.deleted_files.add(path) | ||||
|  | ||||
|     def not_delete_file(self, path: PurePath) -> None: | ||||
|         """ | ||||
|         Unlike mark(), this function accepts any paths. | ||||
|         """ | ||||
|  | ||||
|         self.not_deleted_files.add(path) | ||||
|  | ||||
|     def add_custom_value(self, key: str, value: Any) -> None: | ||||
|         """ | ||||
|         Adds a custom value under the passed key, overwriting any existing | ||||
|         """ | ||||
|         self.custom[key] = value | ||||
|  | ||||
|     def get_custom_value(self, key: str) -> Optional[Any]: | ||||
|         """ | ||||
|         Retrieves a custom value for the given key. | ||||
|         """ | ||||
|         return self.custom.get(key) | ||||
|  | ||||
|     def add_error(self, error: str) -> None: | ||||
|         """ | ||||
|         Adds an error to this report's error list. | ||||
|         """ | ||||
|         self.encountered_errors.append(error) | ||||
|  | ||||
|     def add_warning(self, warning: str) -> None: | ||||
|         """ | ||||
|         Adds a warning to this report's warning list. | ||||
|         """ | ||||
|         self.encountered_warnings.append(warning) | ||||
| @@ -1,79 +0,0 @@ | ||||
| """Helper functions and classes for temporary folders.""" | ||||
|  | ||||
| import logging | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from types import TracebackType | ||||
| from typing import Optional, Type | ||||
|  | ||||
| from .location import Location | ||||
|  | ||||
| LOGGER = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class TmpDir(Location): | ||||
|     """A temporary folder that can create files or nested temp folders.""" | ||||
|  | ||||
|     def __init__(self, path: Path): | ||||
|         """Create a new temporary folder for the given path.""" | ||||
|         super().__init__(path) | ||||
|         self._counter = 0 | ||||
|         self.cleanup() | ||||
|         self.path.mkdir(parents=True, exist_ok=True) | ||||
|  | ||||
|     def __str__(self) -> str: | ||||
|         """Format the folder as a string.""" | ||||
|         return f"Folder at {self.path}" | ||||
|  | ||||
|     def __enter__(self) -> 'TmpDir': | ||||
|         """Context manager entry function.""" | ||||
|         return self | ||||
|  | ||||
|     # pylint: disable=useless-return | ||||
|     def __exit__( | ||||
|             self, | ||||
|             exc_type: Optional[Type[BaseException]], | ||||
|             exc_value: Optional[BaseException], | ||||
|             traceback: Optional[TracebackType], | ||||
|     ) -> Optional[bool]: | ||||
|         """Context manager exit function. Calls cleanup().""" | ||||
|         self.cleanup() | ||||
|         return None | ||||
|  | ||||
|     def new_path(self, prefix: Optional[str] = None) -> Path: | ||||
|         """ | ||||
|         Return a unique path inside the directory. Doesn't create a file or | ||||
|         directory. | ||||
|         """ | ||||
|  | ||||
|         name = f"{prefix if prefix else 'tmp'}-{self._inc_and_get_counter():03}" | ||||
|  | ||||
|         LOGGER.debug("Creating temp file %s", name) | ||||
|  | ||||
|         return self.resolve(Path(name)) | ||||
|  | ||||
|     def new_subdir(self, prefix: Optional[str] = None) -> 'TmpDir': | ||||
|         """ | ||||
|         Create a new nested temporary folder and return it. | ||||
|         """ | ||||
|  | ||||
|         name = f"{prefix if prefix else 'tmp'}-{self._inc_and_get_counter():03}" | ||||
|         sub_path = self.resolve(Path(name)) | ||||
|         sub_path.mkdir(parents=True) | ||||
|  | ||||
|         LOGGER.debug("Creating temp dir %s at %s", name, sub_path) | ||||
|  | ||||
|         return TmpDir(sub_path) | ||||
|  | ||||
|     def cleanup(self) -> None: | ||||
|         """Delete this folder and all contained files.""" | ||||
|         LOGGER.debug("Deleting temp folder %s", self.path) | ||||
|  | ||||
|         if self.path.resolve().exists(): | ||||
|             shutil.rmtree(self.path.resolve()) | ||||
|  | ||||
|     def _inc_and_get_counter(self) -> int: | ||||
|         """Get and increment the counter by one.""" | ||||
|         counter = self._counter | ||||
|         self._counter += 1 | ||||
|         return counter | ||||
| @@ -1,127 +0,0 @@ | ||||
| """ | ||||
| Transforms let the user define functions to decide where the downloaded files | ||||
| should be placed locally. They let the user do more advanced things like moving | ||||
| only files whose names match a regex, or renaming files from one numbering | ||||
| scheme to another. | ||||
| """ | ||||
|  | ||||
| from dataclasses import dataclass | ||||
| from pathlib import PurePath | ||||
| from typing import Callable, List, Optional, TypeVar | ||||
|  | ||||
| from .utils import PathLike, Regex, to_path, to_pattern | ||||
|  | ||||
| # A Transform maps a path to a new path, or to None if it does not apply. | ||||
| Transform = Callable[[PurePath], Optional[PurePath]] | ||||
|  | ||||
|  | ||||
| @dataclass | ||||
| class Transformable: | ||||
|     """ | ||||
|     An object that can be transformed by a Transform. | ||||
|     """ | ||||
|  | ||||
|     # The path a Transform is applied to; apply_transform overwrites it with | ||||
|     # the transformed path. | ||||
|     path: PurePath | ||||
|  | ||||
|  | ||||
| # Any Transformable subtype, so apply_transform can return the type it was given | ||||
| TF = TypeVar("TF", bound=Transformable) | ||||
|  | ||||
|  | ||||
| def apply_transform( | ||||
|         transform: Transform, | ||||
|         transformables: List[TF], | ||||
| ) -> List[TF]: | ||||
|     """ | ||||
|     Apply a Transform to multiple Transformables, discarding those that were | ||||
|     not transformed by the Transform. | ||||
|     """ | ||||
|  | ||||
|     result: List[TF] = [] | ||||
|     for transformable in transformables: | ||||
|         new_path = transform(transformable.path) | ||||
|         if new_path: | ||||
|             transformable.path = new_path | ||||
|             result.append(transformable) | ||||
|     return result | ||||
|  | ||||
| # Transform combinators | ||||
|  | ||||
| keep = lambda path: path | ||||
|  | ||||
| def attempt(*args: Transform) -> Transform: | ||||
|     def inner(path: PurePath) -> Optional[PurePath]: | ||||
|         for transform in args: | ||||
|             result = transform(path) | ||||
|             if result: | ||||
|                 return result | ||||
|         return None | ||||
|     return inner | ||||
|  | ||||
| def optionally(transform: Transform) -> Transform: | ||||
|     return attempt(transform, lambda path: path) | ||||
|  | ||||
| def do(*args: Transform) -> Transform: | ||||
|     def inner(path: PurePath) -> Optional[PurePath]: | ||||
|         current = path | ||||
|         for transform in args: | ||||
|             result = transform(current) | ||||
|             if result: | ||||
|                 current = result | ||||
|             else: | ||||
|                 return None | ||||
|         return current | ||||
|     return inner | ||||
|  | ||||
| def predicate(pred: Callable[[PurePath], bool]) -> Transform: | ||||
|     def inner(path: PurePath) -> Optional[PurePath]: | ||||
|         if pred(path): | ||||
|             return path | ||||
|         return None | ||||
|     return inner | ||||
|  | ||||
| def glob(pattern: str) -> Transform: | ||||
|     return predicate(lambda path: path.match(pattern)) | ||||
|  | ||||
| def move_dir(source_dir: PathLike, target_dir: PathLike) -> Transform: | ||||
|     source_path = to_path(source_dir) | ||||
|     target_path = to_path(target_dir) | ||||
|     def inner(path: PurePath) -> Optional[PurePath]: | ||||
|         if source_path in path.parents: | ||||
|             return target_path / path.relative_to(source_path) | ||||
|         return None | ||||
|     return inner | ||||
|  | ||||
| def move(source: PathLike, target: PathLike) -> Transform: | ||||
|     source_path = to_path(source) | ||||
|     target_path = to_path(target) | ||||
|     def inner(path: PurePath) -> Optional[PurePath]: | ||||
|         if path == source_path: | ||||
|             return target_path | ||||
|         return None | ||||
|     return inner | ||||
|  | ||||
| def rename(source: str, target: str) -> Transform: | ||||
|     def inner(path: PurePath) -> Optional[PurePath]: | ||||
|         if path.name == source: | ||||
|             return path.with_name(target) | ||||
|         return None | ||||
|     return inner | ||||
|  | ||||
| def re_move(regex: Regex, target: str) -> Transform: | ||||
|     def inner(path: PurePath) -> Optional[PurePath]: | ||||
|         match = to_pattern(regex).fullmatch(str(path)) | ||||
|         if match: | ||||
|             groups = [match.group(0)] | ||||
|             groups.extend(match.groups()) | ||||
|             return PurePath(target.format(*groups)) | ||||
|         return None | ||||
|     return inner | ||||
|  | ||||
| def re_rename(regex: Regex, target: str) -> Transform: | ||||
|     def inner(path: PurePath) -> Optional[PurePath]: | ||||
|         match = to_pattern(regex).fullmatch(path.name) | ||||
|         if match: | ||||
|             groups = [match.group(0)] | ||||
|             groups.extend(match.groups()) | ||||
|             return path.with_name(target.format(*groups)) | ||||
|         return None | ||||
|     return inner | ||||
							
								
								
									
										439
									
								
								PFERD/transformer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										439
									
								
								PFERD/transformer.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,439 @@ | ||||
| import ast | ||||
| import re | ||||
| from abc import ABC, abstractmethod | ||||
| from dataclasses import dataclass | ||||
| from enum import Enum | ||||
| from pathlib import PurePath | ||||
| from typing import Callable, Dict, List, Optional, Sequence, TypeVar, Union | ||||
|  | ||||
| from .logging import log | ||||
| from .utils import fmt_path, str_path | ||||
|  | ||||
|  | ||||
| class ArrowHead(Enum): | ||||
|     # The HEAD part of a rule's arrow, stored in Rule.head. | ||||
|     # NOTE(review): how NORMAL and SEQUENCE differ is not visible in this | ||||
|     # class; check where rules are parsed and applied. | ||||
|     NORMAL = 0 | ||||
|     SEQUENCE = 1 | ||||
|  | ||||
|  | ||||
| class Ignore: | ||||
|     """ | ||||
|     Marker for the right side of a rule. Rule.right_result() answers it with | ||||
|     an Ignored result. | ||||
|     """ | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class Empty: | ||||
|     """ | ||||
|     Marker for the right side of a rule. Rule.right_result() answers it with | ||||
|     the unchanged path wrapped in Transformed. | ||||
|     """ | ||||
|     pass | ||||
|  | ||||
|  | ||||
| RightSide = Union[str, Ignore, Empty] | ||||
|  | ||||
|  | ||||
| @dataclass | ||||
| class Transformed: | ||||
|     """ | ||||
|     Result of a transformation that produced a path. | ||||
|     """ | ||||
|     # The resulting path | ||||
|     path: PurePath | ||||
|  | ||||
|  | ||||
| class Ignored: | ||||
|     """ | ||||
|     Result of a transformation whose rule has an Ignore right side. | ||||
|     """ | ||||
|     pass | ||||
|  | ||||
|  | ||||
| TransformResult = Optional[Union[Transformed, Ignored]] | ||||
|  | ||||
|  | ||||
| @dataclass | ||||
| class Rule: | ||||
|     left: str | ||||
|     left_index: int | ||||
|     name: str | ||||
|     head: ArrowHead | ||||
|     right: RightSide | ||||
|     right_index: int | ||||
|  | ||||
|     def right_result(self, path: PurePath) -> Union[str, Transformed, Ignored]: | ||||
|         if isinstance(self.right, str): | ||||
|             return self.right | ||||
|         elif isinstance(self.right, Ignore): | ||||
|             return Ignored() | ||||
|         elif isinstance(self.right, Empty): | ||||
|             return Transformed(path) | ||||
|         else: | ||||
|             raise RuntimeError(f"Right side has invalid type {type(self.right)}") | ||||
|  | ||||
|  | ||||
| class Transformation(ABC): | ||||
|     """ | ||||
|     Base class of the transformations in this file. Each one is created from | ||||
|     a Rule, which is stored as self.rule. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, rule: Rule): | ||||
|         self.rule = rule | ||||
|  | ||||
|     @abstractmethod | ||||
|     def transform(self, path: PurePath) -> TransformResult: | ||||
|         """ | ||||
|         Apply the transformation to path. The subclasses in this file return | ||||
|         None if the rule does not match, and a Transformed or Ignored result | ||||
|         if it does. | ||||
|         """ | ||||
|         pass | ||||
|  | ||||
|  | ||||
| class ExactTf(Transformation): | ||||
|     def transform(self, path: PurePath) -> TransformResult: | ||||
|         if path != PurePath(self.rule.left): | ||||
|             return None | ||||
|  | ||||
|         right = self.rule.right_result(path) | ||||
|         if not isinstance(right, str): | ||||
|             return right | ||||
|  | ||||
|         return Transformed(PurePath(right)) | ||||
|  | ||||
|  | ||||
| class ExactReTf(Transformation): | ||||
|     """ | ||||
|     Matches paths whose string form fully matches the rule's left side as a | ||||
|     regular expression. A string right side is evaluated as an f-string that | ||||
|     can refer to the match groups. | ||||
|     """ | ||||
|  | ||||
|     def transform(self, path: PurePath) -> TransformResult: | ||||
|         match = re.fullmatch(self.rule.left, str_path(path)) | ||||
|         if not match: | ||||
|             return None | ||||
|  | ||||
|         right = self.rule.right_result(path) | ||||
|         if not isinstance(right, str): | ||||
|             return right | ||||
|  | ||||
|         # For some reason, mypy thinks that "groups" has type List[str]. But | ||||
|         # since elements of "match.groups()" can be None, mypy is wrong. | ||||
|         groups: Sequence[Optional[str]] = [match[0]] + list(match.groups()) | ||||
|  | ||||
|         # Names available in the right side: g<i> is group i as a string, | ||||
|         # i<i> and f<i> are the same group as int and float, and only exist | ||||
|         # if the group can be converted. Index 0 is the whole match. Groups | ||||
|         # that did not take part in the match define no names. | ||||
|         locals_dir: Dict[str, Union[str, int, float]] = {} | ||||
|         for i, group in enumerate(groups): | ||||
|             if group is None: | ||||
|                 continue | ||||
|  | ||||
|             locals_dir[f"g{i}"] = group | ||||
|  | ||||
|             try: | ||||
|                 locals_dir[f"i{i}"] = int(group) | ||||
|             except ValueError: | ||||
|                 pass | ||||
|  | ||||
|             try: | ||||
|                 locals_dir[f"f{i}"] = float(group) | ||||
|             except ValueError: | ||||
|                 pass | ||||
|  | ||||
|         # The right side is turned into the source of an f-string literal and | ||||
|         # evaluated with the names from above as locals. | ||||
|         # NOTE(review): this runs arbitrary expressions from the rule's right | ||||
|         # side; it assumes rules only come from a trusted source - confirm. | ||||
|         result = eval(f"f{right!r}", {}, locals_dir) | ||||
|         return Transformed(PurePath(result)) | ||||
|  | ||||
|  | ||||
| class RenamingParentsTf(Transformation): | ||||
|     def __init__(self, sub_tf: Transformation): | ||||
|         super().__init__(sub_tf.rule) | ||||
|         self.sub_tf = sub_tf | ||||
|  | ||||
|     def transform(self, path: PurePath) -> TransformResult: | ||||
|         for i in range(len(path.parts), -1, -1): | ||||
|             parent = PurePath(*path.parts[:i]) | ||||
|             child = PurePath(*path.parts[i:]) | ||||
|  | ||||
|             transformed = self.sub_tf.transform(parent) | ||||
|             if not transformed: | ||||
|                 continue | ||||
|             elif isinstance(transformed, Transformed): | ||||
|                 return Transformed(transformed.path / child) | ||||
|             elif isinstance(transformed, Ignored): | ||||
|                 return transformed | ||||
|             else: | ||||
|                 raise RuntimeError(f"Invalid transform result of type {type(transformed)}: {transformed}") | ||||
|  | ||||
|         return None | ||||
|  | ||||
|  | ||||
| class RenamingPartsTf(Transformation): | ||||
|     def __init__(self, sub_tf: Transformation): | ||||
|         super().__init__(sub_tf.rule) | ||||
|         self.sub_tf = sub_tf | ||||
|  | ||||
|     def transform(self, path: PurePath) -> TransformResult: | ||||
|         result = PurePath() | ||||
|         any_part_matched = False | ||||
|         for part in path.parts: | ||||
|             transformed = self.sub_tf.transform(PurePath(part)) | ||||
|             if not transformed: | ||||
|                 result /= part | ||||
|             elif isinstance(transformed, Transformed): | ||||
|                 result /= transformed.path | ||||
|                 any_part_matched = True | ||||
|             elif isinstance(transformed, Ignored): | ||||
|                 return transformed | ||||
|             else: | ||||
|                 raise RuntimeError(f"Invalid transform result of type {type(transformed)}: {transformed}") | ||||
|  | ||||
|         if any_part_matched: | ||||
|             return Transformed(result) | ||||
|         else: | ||||
|             return None | ||||
|  | ||||
|  | ||||
| class RuleParseError(Exception): | ||||
|     def __init__(self, line: "Line", reason: str): | ||||
|         super().__init__(f"Error in rule on line {line.line_nr}, column {line.index}: {reason}") | ||||
|  | ||||
|         self.line = line | ||||
|         self.reason = reason | ||||
|  | ||||
|     def pretty_print(self) -> None: | ||||
|         log.error(f"Error parsing rule on line {self.line.line_nr}:") | ||||
|         log.error_contd(self.line.line) | ||||
|         spaces = " " * self.line.index | ||||
|         log.error_contd(f"{spaces}^--- {self.reason}") | ||||
|  | ||||
|  | ||||
| T = TypeVar("T") | ||||
|  | ||||
|  | ||||
class Line:
    """
    A single rule line together with a cursor used while parsing it.
    """

    def __init__(self, line: str, line_nr: int):
        self._line = line
        self._line_nr = line_nr
        self._index = 0

    @property
    def line(self) -> str:
        """The full text of the line."""
        return self._line

    @property
    def line_nr(self) -> int:
        """The line number this line was created with."""
        return self._line_nr

    @property
    def index(self) -> int:
        """The current cursor position within the line."""
        return self._index

    @index.setter
    def index(self, index: int) -> None:
        self._index = index

    @property
    def rest(self) -> str:
        """Everything from the cursor to the end of the line."""
        return self.line[self.index:]

    def peek(self, amount: int = 1) -> str:
        """Return up to amount characters at the cursor without consuming them."""
        return self.rest[:amount]

    def take(self, amount: int = 1) -> str:
        """Consume and return up to amount characters at the cursor."""
        taken = self.peek(amount)
        self.index = self.index + len(taken)
        return taken

    def expect(self, string: str) -> str:
        """Consume string at the cursor or raise a RuleParseError."""
        width = len(string)
        if self.peek(width) != string:
            raise RuleParseError(self, f"Expected {string!r}")
        return self.take(width)

    def expect_with(self, string: str, value: T) -> T:
        """Consume string at the cursor and return value."""
        self.expect(string)
        return value

    def one_of(self, parsers: List[Callable[[], T]], description: str) -> T:
        """
        Return the result of the first parser that succeeds.

        After every failed attempt the cursor is rewound to where it was
        before the attempt. If all parsers fail, a RuleParseError with the
        given description is raised.
        """

        # Every failed attempt rewinds to this position, so it is the same
        # starting point for each parser.
        start = self.index
        for attempt in parsers:
            try:
                return attempt()
            except RuleParseError:
                self.index = start

        raise RuleParseError(self, description)
|  | ||||
|  | ||||
| # RULE = LEFT SPACE '-' NAME '-' HEAD (SPACE RIGHT)? | ||||
| # SPACE = ' '+ | ||||
| # NAME = '' | 'exact' | 'name' | 're' | 'exact-re' | 'name-re' | ||||
| # HEAD = '>' | '>>' | ||||
| # LEFT = STR | QUOTED_STR | ||||
| # RIGHT = STR | QUOTED_STR | '!' | ||||
|  | ||||
|  | ||||
def parse_zero_or_more_spaces(line: Line) -> None:
    """
    Advance the cursor past any run of spaces (possibly none).
    """

    while True:
        if line.peek() != " ":
            return
        line.take()
|  | ||||
|  | ||||
def parse_one_or_more_spaces(line: Line) -> None:
    """
    Require one space at the cursor, then skip all spaces following it.
    """

    line.expect(" ")
    while line.peek() == " ":
        line.take()
|  | ||||
|  | ||||
def parse_str(line: Line) -> str:
    """
    Consume characters up to the next space or the end of the line.

    Raises a RuleParseError if not a single character could be consumed.
    """

    chars = []
    while (c := line.peek()) and c != " ":
        line.take()
        chars.append(c)

    if not chars:
        raise RuleParseError(line, "Expected non-space character")

    return "".join(chars)
|  | ||||
|  | ||||
| QUOTATION_MARKS = {'"', "'"} | ||||
|  | ||||
|  | ||||
def parse_quoted_str(line: Line) -> str:
    """
    Parse a quoted string literal starting at the cursor.

    The literal runs from the opening quotation mark to the first matching,
    unescaped quotation mark and is evaluated with ast.literal_eval. Raises a
    RuleParseError if there is no opening quotation mark, if the literal is
    not terminated, or if evaluating it results in a SyntaxError.
    """

    # Points to the opening quotation mark, i.e. the start of the literal
    start_index = line.index

    quotation_mark = line.peek()
    if quotation_mark not in QUOTATION_MARKS:
        raise RuleParseError(line, "Expected quotation mark")
    line.take()

    escaped = False
    while c := line.peek():
        # Every character is consumed, no matter what role it plays.
        line.take()

        if escaped:
            # The character after a backslash never ends the literal.
            escaped = False
            continue

        if c == "\\":
            escaped = True
            continue

        if c != quotation_mark:
            continue

        literal = line.line[start_index:line.index]
        try:
            return ast.literal_eval(literal)
        except SyntaxError as e:
            # Report the error at the start of the literal
            line.index = start_index
            raise RuleParseError(line, str(e)) from e

    raise RuleParseError(line, "Expected end of string literal")
|  | ||||
|  | ||||
def parse_left(line: Line) -> str:
    """
    Parse the left side of a rule: a quoted string if the cursor is at a
    quotation mark, a plain string otherwise.
    """

    parser = parse_quoted_str if line.peek() in QUOTATION_MARKS else parse_str
    return parser(line)
|  | ||||
|  | ||||
def parse_right(line: Line) -> Union[str, Ignore]:
    """
    Parse the right side of a rule.

    A quoted string is returned as-is. An unquoted "!" yields an Ignore
    instance, any other unquoted string is returned as-is.
    """

    if line.peek() in QUOTATION_MARKS:
        return parse_quoted_str(line)

    text = parse_str(line)
    return Ignore() if text == "!" else text
|  | ||||
|  | ||||
def parse_arrow_name(line: Line) -> str:
    """
    Parse the name between the two dashes of an arrow (may be empty).
    """

    # Longer names come before their own prefixes so that e.g. "exact-re" is
    # not cut short as "exact". The empty name comes last as it always matches.
    names = ("exact-re", "exact", "name-re", "name", "re", "")
    candidates = [lambda name=name: line.expect(name) for name in names]
    return line.one_of(candidates, "Expected arrow name")
|  | ||||
|  | ||||
def parse_arrow_head(line: Line) -> ArrowHead:
    """
    Parse the head of an arrow: ">>" (sequence) or ">" (normal).
    """

    # ">>" has to be tried first since ">" is a prefix of it.
    heads = ((">>", ArrowHead.SEQUENCE), (">", ArrowHead.NORMAL))
    candidates = [
        lambda token=token, head=head: line.expect_with(token, head)
        for token, head in heads
    ]
    return line.one_of(candidates, "Expected arrow head")
|  | ||||
|  | ||||
def parse_eol(line: Line) -> None:
    """
    Raise a RuleParseError unless the cursor is at the end of the line.
    """

    remaining = line.peek()
    if not remaining:
        return
    raise RuleParseError(line, "Expected end of line")
|  | ||||
|  | ||||
def parse_rule(line: Line) -> Rule:
    """
    Parse a complete rule: left side, arrow and an optional right side.

    The right side is Empty if only spaces follow the arrow head. Otherwise
    at least one space must separate the arrow head from the right side, and
    nothing may follow the right side.
    """

    # Left side
    parse_zero_or_more_spaces(line)
    left_start = line.index
    left_side = parse_left(line)

    parse_one_or_more_spaces(line)

    # Arrow
    line.expect("-")
    arrow_name = parse_arrow_name(line)
    line.expect("-")
    arrow_head = parse_arrow_head(line)

    # Right side
    right_start = line.index
    right_side: RightSide
    try:
        parse_zero_or_more_spaces(line)
        parse_eol(line)
    except RuleParseError:
        # Something follows the arrow, so rewind and parse it properly.
        line.index = right_start
        parse_one_or_more_spaces(line)
        right_side = parse_right(line)
        parse_eol(line)
    else:
        right_side = Empty()

    return Rule(left_side, left_start, arrow_name, arrow_head, right_side, right_start)
|  | ||||
|  | ||||
def parse_transformation(line: Line) -> Transformation:
    """
    Parse a rule and wrap it in the transformation selected by its arrow name.

    Raises a RuleParseError if a "name" rule has a left side consisting of
    more than one path segment.
    """

    rule = parse_rule(line)
    kind = rule.name

    if kind == "name" and len(PurePath(rule.left).parts) > 1:
        line.index = rule.left_index
        raise RuleParseError(line, "Expected name, not multiple segments")

    # First pick the matcher (exact or regex) ...
    inner: Transformation
    if kind in ("", "exact", "name"):
        inner = ExactTf(rule)
    elif kind in ("re", "exact-re", "name-re"):
        inner = ExactReTf(rule)
    else:
        raise RuntimeError(f"Invalid arrow name {rule.name!r}")

    # ... then decide what the matcher is applied to.
    if kind in ("", "re"):
        return RenamingParentsTf(inner)
    if kind in ("name", "name-re"):
        return RenamingPartsTf(inner)
    return inner
|  | ||||
|  | ||||
class Transformer:
    """
    Applies a list of transformation rules to paths, in the order the rules
    were given.
    """

    def __init__(self, rules: str):
        """
        Parse one rule per non-blank line of the rules string.

        Each line is stripped before parsing. The line number handed to the
        parser is the zero-based index of the line within the rules string.

        May throw a RuleParseError.
        """

        self._tfs = []
        for i, line in enumerate(rules.split("\n")):
            line = line.strip()
            if line:
                tf = parse_transformation(Line(line, i))
                self._tfs.append((line, tf))

    def transform(self, path: PurePath) -> Optional[PurePath]:
        """
        Run the path through the rules and return the resulting path.

        Returns None if a matching rule ignores the path. A matching rule with
        a normal arrow head ends the processing, a matching rule with a
        sequence arrow head passes its result on to the remaining rules. If no
        rule matches, the path is returned unchanged.
        """

        for i, (line, tf) in enumerate(self._tfs):
            log.explain(f"Testing rule {i+1}: {line}")

            # A rule that fails while being applied must not abort the whole
            # run, so it is reported and skipped.
            try:
                result = tf.transform(path)
            except Exception as e:
                log.warn(f"Error while testing rule {i+1}: {line}")
                log.warn_contd(str(e))
                continue

            if not result:
                continue

            if isinstance(result, Ignored):
                log.explain("Match found, path ignored")
                return None

            if tf.rule.head == ArrowHead.NORMAL:
                log.explain(f"Match found, transformed path to {fmt_path(result.path)}")
                path = result.path
                break
            elif tf.rule.head == ArrowHead.SEQUENCE:
                log.explain(f"Match found, updated path to {fmt_path(result.path)}")
                path = result.path
            else:
                # This branch is about the arrow head, not the transform
                # result, so the message names the arrow head.
                raise RuntimeError(f"Invalid arrow head {tf.rule.head!r}")

        log.explain(f"Final result: {fmt_path(path)}")
        return path
							
								
								
									
										53
									
								
								PFERD/update.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								PFERD/update.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,53 @@ | ||||
| from dataclasses import dataclass | ||||
| import ssl | ||||
| from typing import Optional | ||||
| import aiohttp | ||||
| import certifi | ||||
|  | ||||
| from .version import NAME, VERSION | ||||
| from .logging import log | ||||
|  | ||||
|  | ||||
@dataclass
class PferdUpdate:
    """
    Describes a release found on GitHub.
    """

    # URL of the release page
    release_url: str
    # Version of the release (the tag name without a leading "v")
    version: str
|  | ||||
|  | ||||
def _build_session() -> aiohttp.ClientSession:
    """
    Create the HTTP session used for the update check.

    The session identifies itself as NAME/VERSION, verifies TLS against the
    certifi CA bundle and uses fixed timeouts.
    """

    ssl_context = ssl.create_default_context(cafile=certifi.where())
    timeouts = aiohttp.ClientTimeout(
        total=15 * 60,
        connect=10,
        sock_connect=10,
        sock_read=10,
    )

    return aiohttp.ClientSession(
        headers={"User-Agent": f"{NAME}/{VERSION}"},
        connector=aiohttp.TCPConnector(ssl=ssl_context),
        timeout=timeouts,
    )
|  | ||||
|  | ||||
async def check_for_updates() -> None:
    """
    Look for a different release on GitHub and log the outcome.
    """

    new_version = await get_newer_version()
    if not new_version:
        log.explain("No update found")
        return

    log.warn(
        f"{NAME} version out of date. "
        + f"You are running version {VERSION!r} but {new_version.version!r} was found on GitHub."
    )
    log.warn_contd(f"You can download it on GitHub: {new_version.release_url}")
|  | ||||
|  | ||||
async def get_newer_version() -> Optional[PferdUpdate]:
    """
    Ask the GitHub API for the latest release.

    Returns None if the tag of the latest release (without a leading "v")
    equals VERSION, and a PferdUpdate describing the release otherwise. The
    versions are only compared for equality, not ordered, so any differing
    tag is reported.

    Raises an aiohttp.ClientResponseError if GitHub answers with an HTTP error
    status (for example when the request is rate limited).
    """

    async with _build_session() as session:
        async with session.get(
            "https://api.github.com/repos/Garmelon/Pferd/releases/latest",
            headers={"Accept": "application/vnd.github+json"}
        ) as response:
            # An error response carries no release information. Fail with the
            # HTTP error instead of a KeyError on the missing "tag_name".
            response.raise_for_status()
            release_information = await response.json()

    tag_name: str = release_information["tag_name"]
    tag_name = tag_name.removeprefix("v")
    if VERSION == tag_name:
        return None

    return PferdUpdate(release_url=release_information["html_url"], version=tag_name)
							
								
								
									
										208
									
								
								PFERD/utils.py
									
									
									
									
									
								
							
							
						
						
									
										208
									
								
								PFERD/utils.py
									
									
									
									
									
								
							| @@ -1,98 +1,144 @@ | ||||
| """ | ||||
| A few utility bobs and bits. | ||||
| """ | ||||
|  | ||||
| import re | ||||
| import asyncio | ||||
| import getpass | ||||
| import sys | ||||
| import threading | ||||
| from abc import ABC, abstractmethod | ||||
| from contextlib import AsyncExitStack | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Optional, Tuple, Union | ||||
| from types import TracebackType | ||||
| from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar | ||||
| from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit | ||||
|  | ||||
| import bs4 | ||||
| import requests | ||||
|  | ||||
| from .progress import ProgressSettings, progress_for, size_from_headers | ||||
|  | ||||
| PathLike = Union[PurePath, str, Tuple[str, ...]] | ||||
| T = TypeVar("T") | ||||
|  | ||||
|  | ||||
| def to_path(pathlike: PathLike) -> Path: | ||||
async def in_daemon_thread(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
    """
    Run func(*args, **kwargs) in a new daemon thread and wait for its result
    without blocking the event loop.

    Returns whatever func returns. An exception raised by func is re-raised
    in the awaiting coroutine.
    """

    loop = asyncio.get_running_loop()
    future: "asyncio.Future[T]" = loop.create_future()

    def thread_func() -> None:
        # The future belongs to the event loop, so it may only be completed
        # from the loop's thread, hence call_soon_threadsafe.
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            # Without this, a failing func (e.g. EOFError from input()) would
            # leave the future pending and the caller waiting forever.
            loop.call_soon_threadsafe(future.set_exception, e)
        else:
            loop.call_soon_threadsafe(future.set_result, result)

    threading.Thread(target=thread_func, daemon=True).start()

    return await future
|  | ||||
|  | ||||
async def ainput(prompt: str) -> str:
    """
    Like input(), but waits for the user in a daemon thread.
    """

    def read_line() -> str:
        return input(prompt)

    return await in_daemon_thread(read_line)
|  | ||||
|  | ||||
async def agetpass(prompt: str) -> str:
    """
    Like getpass.getpass(), but waits for the user in a daemon thread.
    """

    def read_password() -> str:
        return getpass.getpass(prompt)

    return await in_daemon_thread(read_password)
|  | ||||
|  | ||||
| async def prompt_yes_no(query: str, default: Optional[bool]) -> bool: | ||||
|     """ | ||||
|     Convert a given PathLike into a Path. | ||||
|     """ | ||||
|     if isinstance(pathlike, tuple): | ||||
|         return Path(*pathlike) | ||||
|     return Path(pathlike) | ||||
|  | ||||
|  | ||||
| Regex = Union[str, re.Pattern] | ||||
|  | ||||
|  | ||||
| def to_pattern(regex: Regex) -> re.Pattern: | ||||
|     """ | ||||
|     Convert a regex to a re.Pattern. | ||||
|     """ | ||||
|     if isinstance(regex, re.Pattern): | ||||
|         return regex | ||||
|     return re.compile(regex) | ||||
|  | ||||
|  | ||||
| def soupify(response: requests.Response) -> bs4.BeautifulSoup: | ||||
|     """ | ||||
|     Wrap a requests response in a bs4 object. | ||||
|     """ | ||||
|  | ||||
|     return bs4.BeautifulSoup(response.text, "html.parser") | ||||
|  | ||||
|  | ||||
| def stream_to_path( | ||||
|         response: requests.Response, | ||||
|         target: Path, | ||||
|         progress_name: Optional[str] = None, | ||||
|         chunk_size: int = 1024 ** 2 | ||||
| ) -> None: | ||||
|     """ | ||||
|     Download a requests response content to a file by streaming it. This | ||||
|     function avoids excessive memory usage when downloading large files. The | ||||
|     chunk_size is in bytes. | ||||
|  | ||||
|     If progress_name is None, no progress bar will be shown. Otherwise a progress | ||||
|     bar will appear, if the download is bigger than an internal threshold. | ||||
|     """ | ||||
|  | ||||
|     with response: | ||||
|         length = size_from_headers(response) | ||||
|         if progress_name and length and int(length) > 1024 * 1024 * 10:  # 10 MiB | ||||
|             settings: Optional[ProgressSettings] = ProgressSettings(progress_name, length) | ||||
|         else: | ||||
|             settings = None | ||||
|  | ||||
|         with open(target, 'wb') as file_descriptor: | ||||
|             with progress_for(settings) as progress: | ||||
|                 for chunk in response.iter_content(chunk_size=chunk_size): | ||||
|                     file_descriptor.write(chunk) | ||||
|                     progress.advance(len(chunk)) | ||||
|  | ||||
|  | ||||
| def prompt_yes_no(question: str, default: Optional[bool] = None) -> bool: | ||||
|     """ | ||||
|     Prompts the user a yes/no question and returns their choice. | ||||
|     Asks the user a yes/no question and returns their choice. | ||||
|     """ | ||||
|  | ||||
|     if default is True: | ||||
|         prompt = "[Y/n]" | ||||
|         query += " [Y/n] " | ||||
|     elif default is False: | ||||
|         prompt = "[y/N]" | ||||
|         query += " [y/N] " | ||||
|     else: | ||||
|         prompt = "[y/n]" | ||||
|  | ||||
|     text = f"{question} {prompt} " | ||||
|     wrong_reply = "Please reply with 'yes'/'y' or 'no'/'n'." | ||||
|         query += " [y/n] " | ||||
|  | ||||
|     while True: | ||||
|         response = input(text).strip().lower() | ||||
|         if response in {"yes", "ye", "y"}: | ||||
|         response = (await ainput(query)).strip().lower() | ||||
|         if response == "y": | ||||
|             return True | ||||
|         if response in {"no", "n"}: | ||||
|         elif response == "n": | ||||
|             return False | ||||
|         if response == "" and default is not None: | ||||
|         elif response == "" and default is not None: | ||||
|             return default | ||||
|         print(wrong_reply) | ||||
|  | ||||
|         print("Please answer with 'y' or 'n'.") | ||||
|  | ||||
|  | ||||
def soupify(data: bytes) -> bs4.BeautifulSoup:
    """
    Turns raw HTML into a beautifulsoup object using the "html.parser" parser.
    """

    parser_name = "html.parser"
    return bs4.BeautifulSoup(data, parser_name)
|  | ||||
|  | ||||
def url_set_query_param(url: str, param: str, value: str) -> str:
    """
    Set a query parameter in an url, overwriting existing ones with the same name.

    All other query parameters are kept, including those with an empty value.
    """
    scheme, netloc, path, query, fragment = urlsplit(url)
    # parse_qs drops parameters with blank values by default, which would
    # silently remove unrelated "key=" parameters from the url.
    query_parameters = parse_qs(query, keep_blank_values=True)
    query_parameters[param] = [value]
    new_query_string = urlencode(query_parameters, doseq=True)

    return urlunsplit((scheme, netloc, path, new_query_string, fragment))
|  | ||||
|  | ||||
def url_set_query_params(url: str, params: Dict[str, str]) -> str:
    """
    Sets every given query parameter in an url, one after the other,
    overwriting existing ones with the same name.
    """
    for name in params:
        url = url_set_query_param(url, name, params[name])

    return url
|  | ||||
|  | ||||
def str_path(path: PurePath) -> str:
    """
    Joins the parts of a path with "/". A path without parts becomes ".".
    """
    segments = path.parts
    return "/".join(segments) if segments else "."
|  | ||||
|  | ||||
def fmt_path(path: PurePath) -> str:
    """
    Formats a path for output: the repr of its str_path form.
    """
    return f"{str_path(path)!r}"
|  | ||||
|  | ||||
def fmt_real_path(path: Path) -> str:
    """
    Formats a file system path for output: the repr of its absolute form.
    """
    absolute = path.absolute()
    return f"{str(absolute)!r}"
|  | ||||
|  | ||||
class ReusableAsyncContextManager(ABC, Generic[T]):
    """
    Base class for async context managers that can be entered multiple times,
    but not nested or concurrently.

    Subclasses implement _on_aenter, whose return value is what the
    "async with" statement binds. Contexts pushed onto self._stack are closed
    again on exit.
    """

    def __init__(self) -> None:
        self._active = False
        self._stack = AsyncExitStack()

    @abstractmethod
    async def _on_aenter(self) -> T:
        pass

    async def __aenter__(self) -> T:
        if self._active:
            raise RuntimeError("Nested or otherwise concurrent usage is not allowed")

        self._active = True
        await self._stack.__aenter__()

        # See https://stackoverflow.com/a/13075071
        try:
            return await self._on_aenter()
        except BaseException:
            # Python does not call __aexit__ if __aenter__ fails, so the
            # cleanup has to be triggered by hand. The setup error is always
            # re-raised afterwards: there is no value to hand to the
            # "async with" body, even if a context on the stack would
            # suppress the error.
            await self.__aexit__(*sys.exc_info())
            raise

    async def __aexit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc_value: Optional[BaseException],
            traceback: Optional[TracebackType],
    ) -> Optional[bool]:
        if not self._active:
            raise RuntimeError("__aexit__ called too many times")

        try:
            return await self._stack.__aexit__(exc_type, exc_value, traceback)
        finally:
            # Reset the flag even if closing the stack fails, otherwise this
            # context manager could never be entered again.
            self._active = False
|   | ||||
							
								
								
									
										2
									
								
								PFERD/version.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								PFERD/version.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | ||||
| NAME = "PFERD" | ||||
| VERSION = "3.4.1" | ||||
							
								
								
									
										345
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										345
									
								
								README.md
									
									
									
									
									
								
							| @@ -2,254 +2,143 @@ | ||||
|  | ||||
| **P**rogramm zum **F**lotten, **E**infachen **R**unterladen von **D**ateien | ||||
|  | ||||
| - [Quickstart with `sync_url`](#quickstart-with-sync_url) | ||||
| - [Installation](#installation) | ||||
|     - [Upgrading from 2.0.0 to 2.1.0+](#upgrading-from-200-to-210) | ||||
| - [Example setup](#example-setup) | ||||
| - [Usage](#usage) | ||||
|     - [General concepts](#general-concepts) | ||||
|     - [Constructing transforms](#constructing-transforms) | ||||
|         - [Transform creators](#transform-creators) | ||||
|         - [Transform combinators](#transform-combinators) | ||||
|     - [A short, but commented example](#a-short-but-commented-example) | ||||
| Other resources: | ||||
|  | ||||
| ## Quickstart with `sync_url` | ||||
|  | ||||
| The `sync_url` program allows you to just synchronize a given ILIAS URL (of a | ||||
| course, a folder, your personal desktop, etc.) without any extra configuration | ||||
| or setting up. Download the program, open ILIAS, copy the URL from the address | ||||
| bar and pass it to sync_url. | ||||
|  | ||||
| It bundles everything it needs in one executable and is easy to | ||||
| use, but doesn't expose all the configuration options and tweaks a full install | ||||
| does. | ||||
|  | ||||
| 1. Download the `sync_url` binary from the [latest release](https://github.com/Garmelon/PFERD/releases/latest). | ||||
| 2. Recognize that you most likely need to enclose the URL in `""` quotes to prevent your shell from interpreting `&` and other symbols | ||||
| 3. Run the binary in your terminal (`./sync_url` or `sync_url.exe` in the CMD) to see the help and use it. I'd recommend using the `--cookies` option.   | ||||
|   If you are on **Linux/Mac**, you need to *make the file executable* using `chmod +x <file>`.   | ||||
|   If you are on **Mac**, you need to allow this unverified program to run (see e.g. [here](https://www.switchingtomac.com/tutorials/osx/how-to-run-unverified-apps-on-macos/)) | ||||
| - [Config file format](CONFIG.md) | ||||
| - [Changelog](CHANGELOG.md) | ||||
| - [Development Guide](DEV.md) | ||||
|  | ||||
| ## Installation | ||||
|  | ||||
| Ensure that you have at least Python 3.8 installed. | ||||
| ### Direct download | ||||
|  | ||||
| Binaries for Linux, Windows and Mac can be downloaded directly from the | ||||
| [latest release](https://github.com/Garmelon/PFERD/releases/latest). | ||||
|  | ||||
| ### With pip | ||||
|  | ||||
| Ensure you have at least Python 3.9 installed. Run the following command to | ||||
| install PFERD or upgrade it to the latest version: | ||||
|  | ||||
| To install PFERD or update your installation to the latest version, run this | ||||
| wherever you want to install or have already installed PFERD: | ||||
| ``` | ||||
| $ pip install git+https://github.com/Garmelon/PFERD@v2.4.2 | ||||
| $ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest | ||||
| ``` | ||||
|  | ||||
| The use of [venv] is recommended. | ||||
| The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. | ||||
|  | ||||
| [venv]: https://docs.python.org/3/library/venv.html | ||||
| ### With package managers | ||||
|  | ||||
| ### Upgrading from 2.0.0 to 2.1.0+ | ||||
| Unofficial packages are available for: | ||||
| - [AUR](https://aur.archlinux.org/packages/pferd) | ||||
| - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) | ||||
|  | ||||
| - The `IliasDirectoryType` type was renamed to `IliasElementType` and is now far more detailed. | ||||
|   The new values are: `REGULAR_FOLDER`, `VIDEO_FOLDER`, `EXERCISE_FOLDER`, `REGULAR_FILE`, `VIDEO_FILE`, `FORUM`, `EXTERNAL_LINK`. | ||||
| - Forums and external links are skipped automatically if you use the `kit_ilias` helper. | ||||
| See also PFERD's [repology page](https://repology.org/project/pferd/versions). | ||||
|  | ||||
| ## Example setup | ||||
| ## Basic usage | ||||
|  | ||||
| In this example, `python3` refers to at least Python 3.8. | ||||
| PFERD can be run directly from the command line with no config file. Run `pferd | ||||
| -h` to get an overview of available commands and options. Run `pferd <command> | ||||
| -h` to see which options a command has. | ||||
|  | ||||
| For example, you can download your personal desktop from the KIT ILIAS like | ||||
| this: | ||||
|  | ||||
| A full example setup and initial use could look like: | ||||
| ``` | ||||
| $ mkdir Vorlesungen | ||||
| $ cd Vorlesungen | ||||
| $ python3 -m venv .venv | ||||
| $ .venv/bin/activate | ||||
| $ pip install git+https://github.com/Garmelon/PFERD@v2.4.2 | ||||
| $ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.2/example_config.py | ||||
| $ python3 example_config.py | ||||
| $ deactivate | ||||
| $ pferd kit-ilias-web desktop <output_directory> | ||||
| ``` | ||||
|  | ||||
| Subsequent runs of the program might look like: | ||||
| Also, you can download most ILIAS pages directly like this: | ||||
|  | ||||
| ``` | ||||
| $ cd Vorlesungen | ||||
| $ .venv/bin/activate | ||||
| $ python3 example_config.py | ||||
| $ deactivate | ||||
| $ pferd kit-ilias-web <url> <output_directory> | ||||
| ``` | ||||
|  | ||||
| If you just want to get started and crawl *your entire ILIAS Desktop* instead | ||||
| of a given set of courses, please replace `example_config.py` with | ||||
| `example_config_personal_desktop.py` in all of the instructions below (`curl` call and | ||||
| `python3` run command). | ||||
| However, the CLI only lets you download a single thing at a time, and the | ||||
| resulting command can grow long quite quickly. Because of this, PFERD can also | ||||
| be used with a config file. | ||||
|  | ||||
| ## Usage | ||||
| To get started, just take a command you've been using and add `--dump-config` | ||||
| directly after `pferd`, like this: | ||||
|  | ||||
| ### General concepts | ||||
|  | ||||
| A PFERD config is a normal python file that starts multiple *synchronizers* | ||||
| which do all the heavy lifting. While you can create and wire them up manually, | ||||
| you are encouraged to use the helper methods provided in `PFERD.Pferd`. | ||||
|  | ||||
| The synchronizers take some input arguments specific to their service and a | ||||
| *transform*. The transform receives the computed path of an element in ILIAS and | ||||
| can return either an output path (so you can rename files or move them around as | ||||
| you wish) or `None` if you do not want to save the given file. | ||||
|  | ||||
| Additionally the ILIAS synchronizer allows you to define a *crawl filter*. This | ||||
| filter also receives the computed path as the input, but is only called for | ||||
| *directories*. If you return `True`, the directory will be crawled and | ||||
| searched. If you return `False` the directory will be ignored and nothing in it | ||||
| will be passed to the transform. | ||||
|  | ||||
| ### Constructing transforms | ||||
|  | ||||
| While transforms are just normal python functions, writing them by hand can | ||||
| quickly become tedious. In order to help you with writing your own transforms | ||||
| and filters, PFERD defines a few useful transform creators and combinators in | ||||
| the `PFERD.transform` module: | ||||
|  | ||||
| #### Transform creators | ||||
|  | ||||
| These methods let you create a few basic transform building blocks: | ||||
|  | ||||
| - **`glob(glob)`**   | ||||
|   Creates a transform that returns the unchanged path if the glob matches the path and `None` otherwise. | ||||
|   See also [Path.match].   | ||||
|   Example: `glob("Übung/*.pdf")` | ||||
| - **`predicate(pred)`**   | ||||
|   Creates a transform that returns the unchanged path if `pred(path)` returns a truthy value. | ||||
|   Returns `None` otherwise.   | ||||
|   Example: `predicate(lambda path: len(path.parts) == 3)` | ||||
| - **`move_dir(source, target)`**   | ||||
|   Creates a transform that moves all files from the `source` to the `target` directory.   | ||||
|   Example: `move_dir("Übung/", "Blätter/")` | ||||
| - **`move(source, target)`**   | ||||
|   Creates a transform that moves the `source` file to `target`.   | ||||
|   Example: `move("Vorlesung/VL02_Automten.pdf", "Vorlesung/VL02_Automaten.pdf")` | ||||
| - **`rename(source, target)`**   | ||||
|   Creates a transform that renames all files named `source` to `target`. | ||||
|   This transform works on the file names, not paths, and thus works no matter where the file is located.   | ||||
|   Example: `rename("VL02_Automten.pdf", "VL02_Automaten.pdf")` | ||||
| - **`re_move(regex, target)`**   | ||||
|   Creates a transform that moves all files matching `regex` to `target`. | ||||
|   The transform calls `str.format` on the `target` string with the contents of the capturing groups before returning it. | ||||
|   The capturing groups can be accessed via their index. | ||||
|   See also [Match.group].   | ||||
|   Example: `re_move(r"Übung/Blatt (\d+)\.pdf", "Blätter/Blatt_{1:0>2}.pdf")` | ||||
| - **`re_rename(regex, target)`**   | ||||
|   Creates a transform that renames all files matching `regex` to `target`. | ||||
|   This transform works on the file names, not paths, and thus works no matter where the file is located.   | ||||
|   Example: `re_rename(r"VL(\d+)(.*)\.pdf", "Vorlesung_Nr_{1}__{2}.pdf")` | ||||
|  | ||||
| All movement or rename transforms above return `None` if a file doesn't match | ||||
| their movement or renaming criteria. This enables them to be used as building | ||||
| blocks to build up more complex transforms. | ||||
|  | ||||
| In addition, `PFERD.transform` also defines the `keep` transform which returns its input path unchanged. | ||||
| This behaviour can be very useful when creating more complex transforms. | ||||
| See below for example usage. | ||||
|  | ||||
| [Path.match]: https://docs.python.org/3/library/pathlib.html#pathlib.Path.match | ||||
| [Match.group]: https://docs.python.org/3/library/re.html#re.Match.group | ||||
|  | ||||
| #### Transform combinators | ||||
|  | ||||
| These methods let you combine transforms into more complex transforms: | ||||
|  | ||||
| - **`optionally(transform)`**   | ||||
|   Wraps a given transform and returns its result if it is not `None`. | ||||
|   Otherwise returns the input path unchanged. | ||||
|   See below for example usage. | ||||
| * **`do(transforms)`**   | ||||
|   Accepts a series of transforms and applies them in the given order to the result of the previous one. | ||||
|   If any transform returns `None`, `do` short-circuits and also returns `None`. | ||||
|   This can be used to perform multiple renames in a row: | ||||
|   ```py | ||||
|   do( | ||||
|       # Move them | ||||
|       move_dir("Vorlesungsmaterial/Vorlesungsvideos/", "Vorlesung/Videos/"), | ||||
|       # Fix extensions (if they have any) | ||||
|       optionally(re_rename("(.*).m4v.mp4", "{1}.mp4")), | ||||
|       # Remove the 'dbs' prefix (if they have any) | ||||
|       optionally(re_rename("(?i)dbs-(.+)", "{1}")), | ||||
|   ) | ||||
|   ``` | ||||
| - **`attempt(transforms)`**   | ||||
|   Applies the passed transforms in the given order until it finds one that does not return `None`. | ||||
|   If it does not find any, it returns `None`. | ||||
|   This can be used to give a list of possible transformations and automatically pick the first one that fits: | ||||
|   ```py | ||||
|   attempt( | ||||
|       # Move all videos. If a video is passed in, this `re_move` will succeed | ||||
|       # and attempt short-circuits with the result. | ||||
|       re_move(r"Vorlesungsmaterial/.*/(.+?)\.mp4", "Vorlesung/Videos/{1}.mp4"), | ||||
|       # Move the whole folder to a nicer name - now without any mp4! | ||||
|       move_dir("Vorlesungsmaterial/", "Vorlesung/"), | ||||
|       # If we got another file, keep it. | ||||
|       keep, | ||||
|   ) | ||||
|   ``` | ||||
|  | ||||
| All of these combinators are used in the provided example configs, if you want | ||||
| to see some more real-life usages. | ||||
|  | ||||
| ### A short, but commented example | ||||
|  | ||||
| ```py | ||||
| from pathlib import Path, PurePath | ||||
| from PFERD import Pferd | ||||
| from PFERD.ilias import IliasElementType | ||||
| from PFERD.transform import * | ||||
|  | ||||
| # This filter will later be used by the ILIAS crawler to decide whether it | ||||
| # should crawl a directory (or directory-like structure). | ||||
| def filter_course(path: PurePath, type: IliasElementType) -> bool: | ||||
|     # Note that glob returns a Transform, which is a function from PurePath -> | ||||
|     # Optional[PurePath]. Because of this, we need to apply the result of | ||||
|     # 'glob' to our input path. The returned value will be truthy (a Path) if | ||||
|     # the transform succeeded, or `None` if it failed. | ||||
|  | ||||
|     # We need to crawl the 'Tutorien' folder as it contains one that we want. | ||||
|     if glob("Tutorien/")(path): | ||||
|         return True | ||||
|     # If we found 'Tutorium 10', keep it! | ||||
|     if glob("Tutorien/Tutorium 10")(path): | ||||
|         return True | ||||
|     # Discard all other folders inside 'Tutorien' | ||||
|     if glob("Tutorien/*")(path): | ||||
|         return False | ||||
|  | ||||
|     # All other dirs (including subdirs of 'Tutorium 10') should be searched :) | ||||
|     return True | ||||
|  | ||||
|  | ||||
| # This transform will later be used to rename a few files. It can also be used | ||||
| # to ignore some files. | ||||
| transform_course = attempt( | ||||
|     # We don't care about the other tuts and would instead prefer a cleaner | ||||
|     # directory structure. | ||||
|     move_dir("Tutorien/Tutorium 10/", "Tutorium/"), | ||||
|     # We don't want to modify any other files, so we're going to keep them | ||||
|     # exactly as they are. | ||||
|     keep | ||||
| ) | ||||
|  | ||||
| # Enable and configure the text output. Needs to be called before calling any | ||||
| # other PFERD methods. | ||||
| Pferd.enable_logging() | ||||
| # Create a Pferd instance rooted in the same directory as the script file. This | ||||
| # is not a test run, so files will be downloaded (default, can be omitted). | ||||
| pferd = Pferd(Path(__file__).parent, test_run=False) | ||||
|  | ||||
| # Use the ilias_kit helper to synchronize an ILIAS course | ||||
| pferd.ilias_kit( | ||||
|     # The directory that all of the downloaded files should be placed in | ||||
|     "My_cool_course/", | ||||
|     # The course ID (found in the URL when on the course page in ILIAS) | ||||
|     "course id", | ||||
|     # A path to a cookie jar. If you synchronize multiple ILIAS courses, | ||||
|     # setting this to a common value requires you to only log in once. | ||||
|     cookies=Path("ilias_cookies.txt"), | ||||
|     # A transform can rename, move or filter out certain files | ||||
|     transform=transform_course, | ||||
|     # A crawl filter limits what paths the crawler searches | ||||
|     dir_filter=filter_course, | ||||
| ) | ||||
| ``` | ||||
| $ pferd --dump-config kit-ilias-web <url> <output_directory> | ||||
| ``` | ||||
|  | ||||
| This will make PFERD write its current configuration to its default config file | ||||
| path. You can then run `pferd` without a command and it will execute the config | ||||
| file. Alternatively, you can use `--dump-config-to` and specify a path yourself. | ||||
| Using `--dump-config-to -` will print the configuration to stdout instead of a | ||||
| file, which is a good way to see what is actually going on when using a CLI | ||||
| command. | ||||
|  | ||||
| Another good way to see what PFERD is doing is the `--explain` option. When | ||||
| enabled, PFERD explains in detail what it is doing and why. This can help with | ||||
| debugging your own config. | ||||
|  | ||||
| If you don't want to run all crawlers from your config file, you can specify the | ||||
| crawlers you want to run with `--crawler` or `-C`, like this: | ||||
|  | ||||
| ``` | ||||
| $ pferd -C crawler1 -C crawler2 | ||||
| ``` | ||||
|  | ||||
| ## Advanced usage | ||||
|  | ||||
| PFERD supports lots of different options. For example, you can configure PFERD | ||||
| to [use your system's keyring](CONFIG.md#the-keyring-authenticator) instead of | ||||
| prompting you for your username and password. PFERD also supports | ||||
| [transformation rules](CONFIG.md#transformation-rules) that let you rename or | ||||
| exclude certain files. | ||||
|  | ||||
| For more details, see the comprehensive [config format documentation](CONFIG.md). | ||||
|  | ||||
| ## Example | ||||
|  | ||||
| This example downloads a few courses from the KIT ILIAS with a common keyring | ||||
| authenticator. It reorganizes and ignores some files. | ||||
|  | ||||
| ```ini | ||||
| [DEFAULT] | ||||
| # All paths will be relative to this. | ||||
| # The crawler output directories will be <working_dir>/Foo and <working_dir>/Bar. | ||||
| working_dir = ~/stud | ||||
| # If files vanish from ILIAS the local files are not deleted, allowing us to | ||||
| # take a look at them before deleting them ourselves. | ||||
| on_conflict = no-delete | ||||
|  | ||||
| [auth:ilias] | ||||
| type = keyring | ||||
| username = foo | ||||
|  | ||||
| [crawl:Foo] | ||||
| type = kit-ilias-web | ||||
| auth = auth:ilias | ||||
| # Crawl a course by its ID (found as `ref_id=ID` in the URL) | ||||
| target = 1234567 | ||||
|  | ||||
| # Plaintext files are easier to read by other tools | ||||
| links = plaintext | ||||
|  | ||||
| transform = | ||||
|   # Ignore unneeded folders | ||||
|   Online-Tests --> ! | ||||
|   Vorlesungswerbung --> ! | ||||
|  | ||||
|   # Rename folders | ||||
|   Lehrbücher --> Vorlesung | ||||
|   # Note the ">>" arrow head which lets us apply further rules to files moved to "Übung" | ||||
|   Übungsunterlagen -->> Übung | ||||
|  | ||||
|   # Move exercises to own folder. Rename them to "Blatt-XX.pdf" to make them sort properly | ||||
|   "Übung/(\d+). Übungsblatt.pdf" -re-> Blätter/Blatt-{i1:02}.pdf | ||||
|   # Move solutions to own folder. Rename them to "Blatt-XX-Lösung.pdf" to make them sort properly | ||||
|   "Übung/(\d+). Übungsblatt.*Musterlösung.pdf" -re-> Blätter/Blatt-{i1:02}-Lösung.pdf | ||||
|  | ||||
|   # The course has nested folders with the same name - flatten them | ||||
|   "Übung/(.+?)/\\1" -re-> Übung/{g1} | ||||
|  | ||||
| [crawl:Bar] | ||||
| type = kit-ilias-web | ||||
| auth = auth:ilias | ||||
| target = 1337420 | ||||
| ``` | ||||
|   | ||||
| @@ -1,131 +0,0 @@ | ||||
| import argparse | ||||
| from pathlib import Path, PurePath | ||||
|  | ||||
| from PFERD import Pferd | ||||
| from PFERD.ilias import IliasElementType | ||||
| from PFERD.transform import (attempt, do, glob, keep, move, move_dir, | ||||
|                              optionally, re_move, re_rename) | ||||
|  | ||||
| tf_ss_2020_numerik = attempt( | ||||
|     re_move(r"Übungsblätter/(\d+)\. Übungsblatt/.*", "Blätter/Blatt_{1:0>2}.pdf"), | ||||
|     keep, | ||||
| ) | ||||
|  | ||||
|  | ||||
| tf_ss_2020_db = attempt( | ||||
|     move_dir("Begrüßungsvideo/", "Vorlesung/Videos/"), | ||||
|     do( | ||||
|         move_dir("Vorlesungsmaterial/Vorlesungsvideos/", "Vorlesung/Videos/"), | ||||
|         optionally(re_rename("(.*).m4v.mp4", "{1}.mp4")), | ||||
|         optionally(re_rename("(?i)dbs-(.+)", "{1}")), | ||||
|     ), | ||||
|     move_dir("Vorlesungsmaterial/", "Vorlesung/"), | ||||
|     keep, | ||||
| ) | ||||
|  | ||||
|  | ||||
| tf_ss_2020_rechnernetze = attempt( | ||||
|     re_move(r"Vorlesungsmaterial/.*/(.+?)\.mp4", "Vorlesung/Videos/{1}.mp4"), | ||||
|     move_dir("Vorlesungsmaterial/", "Vorlesung/"), | ||||
|     keep, | ||||
| ) | ||||
|  | ||||
|  | ||||
| tf_ss_2020_sicherheit = attempt( | ||||
|     move_dir("Vorlesungsvideos/", "Vorlesung/Videos/"), | ||||
|     move_dir("Übungsvideos/", "Übung/Videos/"), | ||||
|     re_move(r"VL(.*)\.pdf", "Vorlesung/{1}.pdf"), | ||||
|     re_move(r"Übungsblatt (\d+)\.pdf", "Blätter/Blatt_{1:0>2}.pdf"), | ||||
|     move("Chiffrat.txt", "Blätter/Blatt_01_Chiffrat.txt"), | ||||
|     keep, | ||||
| ) | ||||
|  | ||||
|  | ||||
| tf_ss_2020_pg = attempt( | ||||
|     move_dir("Vorlesungsaufzeichnungen/", "Vorlesung/Videos/"), | ||||
|     move_dir("Vorlesungsmaterial/", "Vorlesung/"), | ||||
|     re_move(r"Übungen/uebungsblatt(\d+).pdf", "Blätter/Blatt_{1:0>2}.pdf"), | ||||
|     keep, | ||||
| ) | ||||
|  | ||||
|  | ||||
| def df_ss_2020_or1(path: PurePath, _type: IliasElementType) -> bool: | ||||
|     if glob("Tutorien/")(path): | ||||
|         return True | ||||
|     if glob("Tutorien/Tutorium 10, dienstags 15:45 Uhr/")(path): | ||||
|         return True | ||||
|     if glob("Tutorien/*")(path): | ||||
|         return False | ||||
|     return True | ||||
|  | ||||
|  | ||||
| tf_ss_2020_or1 = attempt( | ||||
|     move_dir("Vorlesung/Unbeschriebene Folien/", "Vorlesung/Folien/"), | ||||
|     move_dir("Video zur Organisation/", "Vorlesung/Videos/"), | ||||
|     keep, | ||||
| ) | ||||
|  | ||||
|  | ||||
| def main() -> None: | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument("--test-run", action="store_true") | ||||
|     parser.add_argument("synchronizers", nargs="*") | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     pferd = Pferd(Path(__file__).parent, test_run=args.test_run) | ||||
|     pferd.enable_logging() | ||||
|  | ||||
|     if not args.synchronizers or "numerik" in args.synchronizers: | ||||
|         pferd.ilias_kit( | ||||
|             target="Numerik", | ||||
|             course_id="1083036", | ||||
|             transform=tf_ss_2020_numerik, | ||||
|             cookies="ilias_cookies.txt", | ||||
|         ) | ||||
|  | ||||
|     if not args.synchronizers or "db" in args.synchronizers: | ||||
|         pferd.ilias_kit( | ||||
|             target="DB", | ||||
|             course_id="1101554", | ||||
|             transform=tf_ss_2020_db, | ||||
|             cookies="ilias_cookies.txt", | ||||
|         ) | ||||
|  | ||||
|     if not args.synchronizers or "rechnernetze" in args.synchronizers: | ||||
|         pferd.ilias_kit( | ||||
|             target="Rechnernetze", | ||||
|             course_id="1099996", | ||||
|             transform=tf_ss_2020_rechnernetze, | ||||
|             cookies="ilias_cookies.txt", | ||||
|         ) | ||||
|  | ||||
|     if not args.synchronizers or "sicherheit" in args.synchronizers: | ||||
|         pferd.ilias_kit( | ||||
|             target="Sicherheit", | ||||
|             course_id="1101980", | ||||
|             transform=tf_ss_2020_sicherheit, | ||||
|             cookies="ilias_cookies.txt", | ||||
|         ) | ||||
|  | ||||
|     if not args.synchronizers or "pg" in args.synchronizers: | ||||
|         pferd.ilias_kit( | ||||
|             target="PG", | ||||
|             course_id="1106095", | ||||
|             transform=tf_ss_2020_pg, | ||||
|             cookies="ilias_cookies.txt", | ||||
|         ) | ||||
|  | ||||
|     if not args.synchronizers or "or1" in args.synchronizers: | ||||
|         pferd.ilias_kit( | ||||
|             target="OR1", | ||||
|             course_id="1105941", | ||||
|             dir_filter=df_ss_2020_or1, | ||||
|             transform=tf_ss_2020_or1, | ||||
|             cookies="ilias_cookies.txt", | ||||
|         ) | ||||
|  | ||||
|     # Prints a summary listing all new, modified or deleted files | ||||
|     pferd.print_summary() | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
| @@ -1,38 +0,0 @@ | ||||
| """ | ||||
| This is a small config that just crawls the ILIAS Personal Desktop. | ||||
| It does not filter or rename anything, it just gobbles up everything it can find. | ||||
|  | ||||
| Note that this still includes a test-run switch, so you can see what it *would* download. | ||||
| You can enable that with the "--test-run" command line switch, | ||||
| i. e. "python3 example_config_minimal.py --test-run". | ||||
| """ | ||||
|  | ||||
| import argparse | ||||
| from pathlib import Path | ||||
|  | ||||
| from PFERD import Pferd | ||||
|  | ||||
|  | ||||
| def main() -> None: | ||||
|     # Parse command line arguments | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument("--test-run", action="store_true") | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     # Create the Pferd helper instance | ||||
|     pferd = Pferd(Path(__file__).parent, test_run=args.test_run) | ||||
|     pferd.enable_logging() | ||||
|  | ||||
|     # Synchronize the personal desktop into the "ILIAS" directory. | ||||
|     # It saves the cookies, so you only need to log in again when the ILIAS cookies expire. | ||||
|     pferd.ilias_kit_personal_desktop( | ||||
|         "ILIAS", | ||||
|         cookies="ilias_cookies.txt", | ||||
|     ) | ||||
|  | ||||
|     # Prints a summary listing all new, modified or deleted files | ||||
|     pferd.print_summary() | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
										6
									
								
								mypy.ini
									
									
									
									
									
								
							
							
						
						
									
										6
									
								
								mypy.ini
									
									
									
									
									
								
							| @@ -1,7 +1,11 @@ | ||||
| [mypy] | ||||
| disallow_any_generics = True | ||||
| disallow_untyped_defs = True | ||||
| disallow_incomplete_defs = True | ||||
| no_implicit_optional = True | ||||
| warn_unused_ignores = True | ||||
| warn_unreachable = True | ||||
| show_error_context = True | ||||
|  | ||||
| [mypy-rich.*,bs4] | ||||
| [mypy-rich.*,bs4,keyring] | ||||
| ignore_missing_imports = True | ||||
|   | ||||
							
								
								
									
										6
									
								
								pferd.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								pferd.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,6 @@ | ||||
| # File used by pyinstaller to create the executable | ||||
|  | ||||
| from PFERD.__main__ import main | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
										3
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | ||||
| [build-system] | ||||
| requires = ["setuptools", "wheel"] | ||||
| build-backend = "setuptools.build_meta" | ||||
							
								
								
									
										5
									
								
								scripts/build
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										5
									
								
								scripts/build
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,5 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -e | ||||
|  | ||||
| pyinstaller --onefile pferd.py | ||||
							
								
								
									
										111
									
								
								scripts/bump-version
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										111
									
								
								scripts/bump-version
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,111 @@ | ||||
| #!/usr/bin/env python3 | ||||
|  | ||||
| import argparse | ||||
| import time | ||||
| import re | ||||
| from subprocess import run | ||||
|  | ||||
|  | ||||
def load_changelog():
    """Read ``CHANGELOG.md`` from the current directory and return its lines.

    Every returned line keeps its line ending. The file is decoded as UTF-8
    explicitly so the result does not depend on the platform's locale
    encoding.
    """
    with open("CHANGELOG.md", encoding="utf-8") as f:
        return list(f)
|  | ||||
|  | ||||
def extract_changes(lines):
    """Return the lines of the "## Unreleased" changelog section.

    ``lines`` is an iterable of changelog lines (with line endings). The
    result contains everything between the "## Unreleased" heading and the
    next "## " heading, with:

    - a single blank separator line directly below the heading dropped,
    - the leading "### " removed from sub-headings, so git doesn't treat
      those lines as comments when the text is used as a tag annotation,
    - trailing blank lines removed.

    Raises ``ValueError`` if there is no "## Unreleased" heading.
    """
    lines = iter(lines)

    # Find "Unreleased" section
    for line in lines:
        if line.strip() == "## Unreleased":
            break
    else:
        raise ValueError('No "## Unreleased" section found in the changelog')

    changes = []
    for index, line in enumerate(lines):
        if index == 0 and not line.strip():
            # Only skip the line below the heading if it really is the blank
            # separator; a heading or entry placed there must not be lost.
            continue
        if line.startswith("## "):
            # Found the beginning of the next section
            break
        if line.startswith("### "):
            # Found a heading in the current section
            changes.append(line[4:])
        else:
            changes.append(line)

    # Remove trailing empty lines
    while changes and not changes[-1].strip():
        changes.pop()

    return changes
|  | ||||
|  | ||||
def update_version(version):
    """Rewrite the ``VERSION = "..."`` assignment in ``PFERD/version.py``.

    ``version`` is inserted literally. The file is read and written as UTF-8.

    Raises ``ValueError`` (leaving the file untouched) if no ``VERSION``
    assignment is found, instead of silently writing the file back unchanged.
    """
    path = "PFERD/version.py"

    with open(path, encoding="utf-8") as f:
        text = f.read()

    # A callable replacement keeps backslashes or group references in
    # `version` from being interpreted as a replacement template.
    text, count = re.subn(r'VERSION = ".*"', lambda _: f'VERSION = "{version}"', text)
    if count == 0:
        raise ValueError(f'No VERSION = "..." assignment found in {path}')

    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
|  | ||||
|  | ||||
def update_changelog(lines, version, date):
    """Write ``CHANGELOG.md`` with a new release heading below "## Unreleased".

    ``lines`` is the current changelog as an iterable of lines (with line
    endings). A blank line and a ``## <version> - <date>`` heading are
    inserted directly after the "## Unreleased" heading, so the previously
    unreleased entries end up under the new release. The file is written as
    UTF-8.

    Raises ``ValueError`` without touching the file if there is no
    "## Unreleased" heading; otherwise the release heading would end up
    appended to the end of the changelog.
    """
    lines = iter(lines)
    new_lines = []

    # Copy everything up to and including the "Unreleased" heading
    for line in lines:
        new_lines.append(line)
        if line.strip() == "## Unreleased":
            break
    else:
        raise ValueError('No "## Unreleased" section found in the changelog')

    # Add new heading below that
    new_lines.append("\n")
    new_lines.append(f"## {version} - {date}\n")

    # Add remaining lines
    new_lines.extend(lines)

    with open("CHANGELOG.md", "w", encoding="utf-8") as f:
        f.writelines(new_lines)
|  | ||||
|  | ||||
def commit_changes(version):
    """Stage the changelog and version file and commit them as the version bump.

    Raises ``subprocess.CalledProcessError`` if a git command exits with a
    non-zero status, so a failed commit stops the release instead of letting
    later steps act on the wrong commit.
    """
    run(["git", "add", "CHANGELOG.md", "PFERD/version.py"], check=True)
    run(["git", "commit", "-m", f"Bump version to {version}"], check=True)
|  | ||||
|  | ||||
def create_tag(version, annotation):
    """Create the annotated git tag ``v<version>`` with ``annotation`` as its message.

    Raises ``subprocess.CalledProcessError`` if git exits with a non-zero
    status (for example because the tag already exists).
    """
    run(["git", "tag", "-am", annotation, f"v{version}"], check=True)
|  | ||||
|  | ||||
def fastforward_latest():
    """Force the local ``latest`` branch to point at the current ``HEAD``.

    Raises ``subprocess.CalledProcessError`` if git exits with a non-zero
    status.
    """
    run(["git", "branch", "-f", "latest", "HEAD"], check=True)
|  | ||||
|  | ||||
def main():
    """Bump the project to the version given on the command line.

    Reads the changelog, builds the tag annotation from the unreleased
    entries, updates the version file and changelog, commits, tags, moves the
    ``latest`` branch and finally prints the push command to run.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("version")
    version = cli.parse_args().version

    today = time.strftime("%Y-%m-%d")
    changelog_lines = load_changelog()
    release_notes = "".join(extract_changes(changelog_lines))
    annotation = f"Version {version} - {today}\n\n{release_notes}"

    # Modify the working tree first, then record the result in git
    update_version(version)
    update_changelog(changelog_lines, version, today)
    commit_changes(version)
    create_tag(version, annotation)
    fastforward_latest()

    print()
    print("Now the only thing left is to publish the changes:")
    print(f"  $ git push origin master latest v{version}")
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
										6
									
								
								scripts/check
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										6
									
								
								scripts/check
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,6 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -e | ||||
|  | ||||
| mypy PFERD | ||||
| flake8 PFERD | ||||
							
								
								
									
										6
									
								
								scripts/format
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										6
									
								
								scripts/format
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,6 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -e | ||||
|  | ||||
| autopep8 --recursive --in-place PFERD | ||||
| isort PFERD | ||||
							
								
								
									
										17
									
								
								scripts/setup
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										17
									
								
								scripts/setup
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,17 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -e | ||||
|  | ||||
| # Updating pip and setuptools because some older versions don't recognize the | ||||
| # project setup correctly | ||||
| if [[ $1 != '--no-pip' ]]; then | ||||
|     pip install --upgrade pip | ||||
| fi | ||||
| pip install --upgrade setuptools | ||||
|  | ||||
| # Installing PFERD itself | ||||
| pip install --editable . | ||||
|  | ||||
| # Installing tools and type hints | ||||
| pip install --upgrade mypy flake8 autopep8 isort pyinstaller | ||||
| pip install --upgrade types-chardet types-certifi | ||||
							
								
								
									
										23
									
								
								setup.cfg
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								setup.cfg
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | ||||
| [metadata] | ||||
| name = PFERD | ||||
| version = attr: PFERD.version.VERSION | ||||
|  | ||||
| [options] | ||||
| packages = find: | ||||
| python_requires = >=3.9 | ||||
| install_requires = | ||||
|   aiohttp>=3.8.1 | ||||
|   beautifulsoup4>=4.10.0 | ||||
|   rich>=11.0.0 | ||||
|   keyring>=23.5.0 | ||||
|   certifi>=2021.10.8 | ||||
|  | ||||
| [options.entry_points] | ||||
| console_scripts = | ||||
|   pferd = PFERD.__main__:main | ||||
|  | ||||
| [flake8] | ||||
| max_line_length = 110 | ||||
|  | ||||
| [isort] | ||||
| line_length = 110 | ||||
							
								
								
									
										16
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								setup.py
									
									
									
									
									
								
							| @@ -1,16 +0,0 @@ | ||||
| from setuptools import find_packages, setup | ||||
|  | ||||
| setup( | ||||
|     name="PFERD", | ||||
|     version="2.4.2", | ||||
|     packages=find_packages(), | ||||
|     install_requires=[ | ||||
|         "requests>=2.21.0", | ||||
|         "beautifulsoup4>=4.7.1", | ||||
|         "rich>=2.1.0" | ||||
|     ], | ||||
| ) | ||||
|  | ||||
| # When updating the version, also: | ||||
| # - update the README.md installation instructions | ||||
| # - set a tag on the update commit | ||||
							
								
								
									
										67
									
								
								sync_url.py
									
									
									
									
									
								
							
							
						
						
									
										67
									
								
								sync_url.py
									
									
									
									
									
								
							| @@ -1,67 +0,0 @@ | ||||
| #!/usr/bin/env python | ||||
|  | ||||
| """ | ||||
| A simple script to download a course by name from ILIAS. | ||||
| """ | ||||
|  | ||||
| import argparse | ||||
| from pathlib import Path | ||||
| from urllib.parse import urlparse | ||||
|  | ||||
| from PFERD import Pferd | ||||
| from PFERD.cookie_jar import CookieJar | ||||
| from PFERD.ilias import (IliasCrawler, IliasElementType, | ||||
|                          KitShibbolethAuthenticator) | ||||
| from PFERD.utils import to_path | ||||
|  | ||||
|  | ||||
def main() -> None:
    """Download a single ILIAS course/folder given by URL.

    Command line:
      --test-run      passed through to Pferd as ``test_run``
      -c/--cookies    file to store cookies in
      --no-videos     don't download video files or video folders
      url             URL of the course page
      folder          optional target folder; when omitted, the element name
                      is looked up via the crawler and used instead
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--test-run", action="store_true")
    parser.add_argument('-c', '--cookies', nargs='?', default=None, help="File to store cookies in")
    # A plain switch: with nargs='?' and no const, a bare "--no-videos" parsed
    # to None (so videos were still downloaded) and the option could swallow
    # the URL argument following it.
    parser.add_argument('--no-videos', action="store_true", help="Don't download videos")
    parser.add_argument('url', help="URL to the course page")
    parser.add_argument('folder', nargs='?', default=None, help="Folder to put stuff into")
    args = parser.parse_args()

    url = urlparse(args.url)

    cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None)
    session = cookie_jar.create_session()
    authenticator = KitShibbolethAuthenticator()
    # The crawler is only used for the name lookup below, so it accepts everything
    crawler = IliasCrawler(url.scheme + '://' + url.netloc, session,
                           authenticator, lambda x, y: True)

    cookie_jar.load_cookies()

    if args.folder is not None:
        folder = args.folder
        # Initialize pferd at the *parent of the passed folder*
        # This is needed so Pferd's internal protections against escaping the working directory
        # do not trigger (e.g. if somebody names a file in ILIAS '../../bad thing.txt')
        pferd = Pferd(Path(Path(__file__).parent, folder).parent, test_run=args.test_run)
    else:
        # fetch course name from ilias
        folder = crawler.find_element_name(args.url)
        cookie_jar.save_cookies()

        # Initialize pferd at the location of the script
        pferd = Pferd(Path(__file__).parent, test_run=args.test_run)

    def dir_filter(_: Path, element: IliasElementType) -> bool:
        """Reject video files and video folders when --no-videos was given."""
        if args.no_videos:
            return element not in [IliasElementType.VIDEO_FILE, IliasElementType.VIDEO_FOLDER]
        return True

    pferd.enable_logging()
    # fetch
    pferd.ilias_kit_folder(
        target=folder,
        full_url=args.url,
        cookies=args.cookies,
        dir_filter=dir_filter
    )
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
		Reference in New Issue
	
	Block a user