mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-10-22 09:42:31 +02:00 
			
		
		
		
	Compare commits
	
		
			579 Commits
		
	
	
		
			v1.1.6
			...
			update-che
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | 2d145e7c94 | ||
|   | 5fdd40204b | ||
|   | fb4631ba18 | ||
|   | d72fc2760b | ||
|   | 4a51aaa4f5 | ||
|   | 66a5b1ba02 | ||
|   | aa5a3a10bc | ||
|   | d9b111cec2 | ||
|   | 345f52a1f6 | ||
|   | ed24366aba | ||
|   | 46fb782798 | ||
|   | 846c29aee1 | ||
|   | a5015fe9b1 | ||
|   | 616b0480f7 | ||
|   | 2f0e04ce13 | ||
|   | bcc537468c | ||
|   | 694ffb4d77 | ||
|   | af2cc1169a | ||
|   | bc3fa36637 | ||
|   | afbd03f777 | ||
|   | b8fe25c580 | ||
|   | a241672726 | ||
|   | a8f76e9be7 | ||
|   | b56475450d | ||
|   | aa74604d29 | ||
|   | d2e6d91880 | ||
|   | 602044ff1b | ||
|   | 31631fb409 | ||
|   | 00db348218 | ||
|   | a709280cbf | ||
|   | a99ddaa0cc | ||
|   | ba3d299c05 | ||
|   | 07a21f80a6 | ||
|   | f17b9b68f4 | ||
|   | a2831fbea2 | ||
|   | da72863b47 | ||
|   | 86e2e226dc | ||
|   | 7872fe5221 | ||
|   | 86947e4874 | ||
|   | 4f022e2d19 | ||
|   | f47e7374d2 | ||
|   | 57ec51e95a | ||
|   | 0045124a4e | ||
|   | 9618aae83b | ||
|   | 33453ede2d | ||
|   | e467b38d73 | ||
|   | e9d2d05030 | ||
|   | 4bf0c972e6 | ||
|   | 4ee919625d | ||
|   | d30f25ee97 | ||
|   | 10d9d74528 | ||
|   | 43c5453e10 | ||
|   | eb4de8ae0c | ||
|   | e32c1f000f | ||
|   | 5f527bc697 | ||
|   | ced8b9a2d0 | ||
|   | 6f3cfd4396 | ||
|   | 462d993fbc | ||
|   | a99356f2a2 | ||
|   | eac2e34161 | ||
|   | a82a0b19c2 | ||
|   | 90cb6e989b | ||
|   | 6289938d7c | ||
|   | 13b8c3d9c6 | ||
|   | 88afe64a92 | ||
|   | 6b2a657573 | ||
|   | d6f38a61e1 | ||
|   | ad3f4955f7 | ||
|   | e42ab83d32 | ||
|   | f9a3f9b9f2 | ||
|   | ef7d5ea2d3 | ||
|   | 55ea304ff3 | ||
|   | fee12b3d9e | ||
|   | 6673077397 | ||
|   | 742632ed8d | ||
|   | 544d45cbc5 | ||
|   | 86f79ff1f1 | ||
|   | ee67f9f472 | ||
|   | 8ec3f41251 | ||
|   | 89be07d4d3 | ||
|   | 91200f3684 | ||
|   | 9ffd603357 | ||
|   | 80eeb8fe97 | ||
|   | 75fde870c2 | ||
|   | 6e4d423c81 | ||
|   | 57aef26217 | ||
|   | 70ec64a48b | ||
|   | 70b33ecfd9 | ||
|   | 601e4b936b | ||
|   | a292c4c437 | ||
|   | bc65ea7ab6 | ||
|   | f28bbe6b0c | ||
|   | 61d902d715 | ||
|   | 8ab462fb87 | ||
|   | df3ad3d890 | ||
|   | fc31100a0f | ||
|   | 31b6311e99 | ||
|   | 1fc8e9eb7a | ||
|   | 85b9f45085 | ||
|   | f656e3ff34 | ||
|   | e1bda94329 | ||
|   | f6b26f4ead | ||
|   | 722970a255 | ||
|   | f40820c41f | ||
|   | 49ad1b6e46 | ||
|   | 1ce32d2f18 | ||
|   | 9d5ec84b91 | ||
|   | 1fba96abcb | ||
|   | 921cec7ddc | ||
|   | 7b062883f6 | ||
|   | 64a2960751 | ||
|   | 17879a7f69 | ||
|   | 1dd24551a5 | ||
|   | 84f775013f | ||
|   | b78eb64f3d | ||
|   | d65efed561 | ||
|   | 1ca6740e05 | ||
|   | 474aa7e1cc | ||
|   | 5beb4d9a2d | ||
|   | 19eed5bdff | ||
|   | 6fa9cfd4c3 | ||
|   | 80acc4b50d | ||
|   | 2c72a9112c | ||
|   | 17207546e9 | ||
|   | 533f75ea71 | ||
|   | adb5d4ade3 | ||
|   | a879c6ab6e | ||
|   | 915e42fd07 | ||
|   | 2d8dcc87ff | ||
|   | 66f0e398a1 | ||
|   | 30be4e29fa | ||
|   | 263780e6a3 | ||
|   | 07a75a37c3 | ||
|   | f85b75df8c | ||
|   | 6644126b5d | ||
|   | c665c36d88 | ||
|   | 519a7ef435 | ||
|   | a848194601 | ||
|   | aabce764ac | ||
|   | 5a331663e4 | ||
|   | 40144f8bd8 | ||
|   | f68849c65f | ||
|   | edb52a989e | ||
|   | 980578d05a | ||
|   | 486699cef3 | ||
|   | 0096a0c077 | ||
|   | d905e95dbb | ||
|   | 61430c8739 | ||
|   | eb8b915813 | ||
|   | 22c2259adb | ||
|   | c15a1aecdf | ||
|   | 16d50b6626 | ||
|   | 651b087932 | ||
|   | bce3dc384d | ||
|   | c21ddf225b | ||
|   | 4fefb98d71 | ||
|   | ffda4e43df | ||
|   | 69cb2a7734 | ||
|   | c33de233dc | ||
|   | 85f89a7ff3 | ||
|   | 9ce20216b5 | ||
|   | 1739c54091 | ||
|   | d8bd1f518a | ||
|   | 86ba47541b | ||
|   | 492ec6a932 | ||
|   | 342076ee0e | ||
|   | d44f6966c2 | ||
|   | 5c76193045 | ||
|   | 1c1f781be4 | ||
|   | c687d4a51a | ||
|   | fca62541ca | ||
|   | 3ab3581f84 | ||
|   | 8dd0689420 | ||
|   | be4b1040f8 | ||
|   | 79be6e1dc5 | ||
|   | edbd92dbbf | ||
|   | 27b5a8e490 | ||
|   | 1f400d5964 | ||
|   | 0ca0680165 | ||
|   | ce1dbda5b4 | ||
|   | 9cce78669f | ||
|   | 6ca0ecdf05 | ||
|   | 6e9f8fd391 | ||
|   | 2fdf24495b | ||
|   | bbf9f8f130 | ||
|   | 37f8d84a9c | ||
|   | 5edd868d5b | ||
|   | e4e5e83be6 | ||
|   | 74c7b39dc8 | ||
|   | 445dffc987 | ||
|   | d97d6bf147 | ||
|   | 79efdb56f7 | ||
|   | a9af56a5e9 | ||
|   | 59f13bb8d6 | ||
|   | 463f8830d7 | ||
|   | 05ad06fbc1 | ||
|   | 29d5a40c57 | ||
|   | c0cecf8363 | ||
|   | b998339002 | ||
|   | 245c9c3dcc | ||
|   | d8f26a789e | ||
|   | e1d18708b3 | ||
|   | b44b49476d | ||
|   | 7e0bb06259 | ||
|   | ecdedfa1cf | ||
|   | 3d4b997d4a | ||
|   | e81005ae4b | ||
|   | 33a81a5f5c | ||
|   | 25e2abdb03 | ||
|   | 803e5628a2 | ||
|   | c88f20859a | ||
|   | ec3767c545 | ||
|   | 729ff0a4c7 | ||
|   | 6fe51e258f | ||
|   | 44ecb2fbe7 | ||
|   | 53e031d9f6 | ||
|   | 8ac85ea0bd | ||
|   | adfdc302d7 | ||
|   | 3053278721 | ||
|   | 4d07de0d71 | ||
|   | 953a1bba93 | ||
|   | e724ff7c93 | ||
|   | 62f0f7bfc5 | ||
|   | 9cb2b68f09 | ||
|   | 1bbc0b705f | ||
|   | 662191eca9 | ||
|   | 8fad8edc1e | ||
|   | ae3d80664c | ||
|   | e21795ee35 | ||
|   | ec95dda18f | ||
|   | 098ac45758 | ||
|   | 9889ce6b57 | ||
|   | b4d97cd545 | ||
|   | afac22c562 | ||
|   | 552cd82802 | ||
|   | dfde0e2310 | ||
|   | 54dd2f8337 | ||
|   | b5785f260e | ||
|   | 98b8ca31fa | ||
|   | 4b104b6252 | ||
|   | 83d12fcf2d | ||
|   | e4f9560655 | ||
|   | 8cfa818f04 | ||
|   | 81301f3a76 | ||
|   | 2976b4d352 | ||
|   | 9f03702e69 | ||
|   | 3300886120 | ||
|   | 0d10752b5a | ||
|   | 92886fb8d8 | ||
|   | 5916626399 | ||
|   | a7c025fd86 | ||
|   | b7a999bc2e | ||
|   | 3851065500 | ||
|   | 4b68fa771f | ||
|   | 1525aa15a6 | ||
|   | db1219d4a9 | ||
|   | b8efcc2ca5 | ||
|   | 0bae009189 | ||
|   | 3efec53f51 | ||
|   | 8b76ebb3ef | ||
|   | 467ea3a37e | ||
|   | 2b6235dc78 | ||
|   | cd5aa61834 | ||
|   | 5ccb17622e | ||
|   | 1c226c31aa | ||
|   | 9ec0d3e16a | ||
|   | cf6903d109 | ||
|   | 9fd356d290 | ||
|   | 989032fe0c | ||
|   | 05573ccc53 | ||
|   | c454fabc9d | ||
|   | 7d323ec62b | ||
|   | c7494e32ce | ||
|   | 1123c8884d | ||
|   | e1104f888d | ||
|   | 8c32da7f19 | ||
|   | d63494908d | ||
|   | b70b62cef5 | ||
|   | 868f486922 | ||
|   | b2a2b5999b | ||
|   | 595de88d96 | ||
|   | a6fdf05ee9 | ||
|   | f897d7c2e1 | ||
|   | b0f731bf84 | ||
|   | 302b8c0c34 | ||
|   | acd674f0a0 | ||
|   | b0f9e1e8b4 | ||
|   | ed2e19a150 | ||
|   | 296a169dd3 | ||
|   | 1591cb9197 | ||
|   | 0c9167512c | ||
|   | a673ab0fae | ||
|   | 6e5fdf4e9e | ||
|   | 93a5a94dab | ||
|   | d565df27b3 | ||
|   | 961f40f9a1 | ||
|   | e3ee4e515d | ||
|   | 94d6a01cca | ||
|   | 38bb66a776 | ||
|   | 68781a88ab | ||
|   | 910462bb72 | ||
|   | 6bd6adb977 | ||
|   | 0acdee15a0 | ||
|   | c3ce6bb31c | ||
|   | 0459ed093e | ||
|   | d5f29f01c5 | ||
|   | 595ba8b7ab | ||
|   | cec0a8e1fc | ||
|   | f9b2fd60e2 | ||
|   | 60cd9873bc | ||
|   | 273d56c39a | ||
|   | 5497dd2827 | ||
|   | bbfdadc463 | ||
|   | fde811ae5a | ||
|   | 07e831218e | ||
|   | 91c33596da | ||
|   | a8dcf941b9 | ||
|   | e7a51decb0 | ||
|   | 9ec19be113 | ||
|   | f776186480 | ||
|   | 0096d83387 | ||
|   | 20a24dbcbf | ||
|   | 502654d853 | ||
|   | d2103d7c44 | ||
|   | d96a361325 | ||
|   | 2e85d26b6b | ||
|   | 6431a3fb3d | ||
|   | ac3bfd7388 | ||
|   | 3ea86d18a0 | ||
|   | bbc792f9fb | ||
|   | 7e127cd5cc | ||
|   | c4fb92c658 | ||
|   | 8da1ac6cee | ||
|   | a18db57e6f | ||
|   | b915e393dd | ||
|   | 3a74c23d09 | ||
|   | fbebc46c58 | ||
|   | 5595a908d8 | ||
|   | 27e4abcfa3 | ||
|   | c1ab7485e2 | ||
|   | 29cd5d1a3c | ||
|   | 6d5d9333ad | ||
|   | 7cc40595dc | ||
|   | 80ae5ddfaa | ||
|   | 4f480d117e | ||
|   | 1f2af3a290 | ||
|   | 14cdfb6a69 | ||
|   | e2bf84392b | ||
|   | 946b7a7931 | ||
|   | 9a9018751e | ||
|   | 83b75e8254 | ||
|   | 35c3fa205d | ||
|   | 0b606f02fa | ||
|   | fb78a6e98e | ||
|   | 5de68a0400 | ||
|   | f0562049b6 | ||
|   | 0e1077bb50 | ||
|   | c978e9edf4 | ||
|   | 2714ac6be6 | ||
|   | 9b048a9cfc | ||
|   | 1c2b6bf994 | ||
|   | ee39aaf08b | ||
|   | 93e6329901 | ||
|   | f47b137b59 | ||
|   | 83ea15ee83 | ||
|   | 75471c46d1 | ||
|   | 1e0343bba6 | ||
|   | 0f5e55648b | ||
|   | 57259e21f4 | ||
|   | 4ce385b262 | ||
|   | 2d64409542 | ||
|   | fcb3884a8f | ||
|   | 9f6dc56a7b | ||
|   | 56ab473611 | ||
|   | 6426060804 | ||
|   | 49a0ca7a7c | ||
|   | f3a4663491 | ||
|   | ecdbca8fb6 | ||
|   | 9cbea5fe06 | ||
|   | ba3c7f85fa | ||
|   | ba9215ebe8 | ||
|   | 8ebf0eab16 | ||
|   | cd90a60dee | ||
|   | 98834c9c95 | ||
|   | 55e9e719ad | ||
|   | a0ae9aee27 | ||
|   | 1486a63854 | ||
|   | 733e1ae136 | ||
|   | 4ac51048c1 | ||
|   | f2aba970fd | ||
|   | 9c4759103a | ||
|   | 316b9d7bf4 | ||
|   | 6f30adcd22 | ||
|   | 6f78fef604 | ||
|   | f830b42a36 | ||
|   | ef343dec7c | ||
|   | 0da2fafcd8 | ||
|   | f4abe3197c | ||
|   | 38d4f5b4c9 | ||
|   | 9ea03bda3e | ||
|   | 07de5bea8b | ||
|   | f0d572c110 | ||
|   | 076067e22d | ||
|   | ebb6e63c5c | ||
|   | 0c3f35a2d2 | ||
|   | 521890ae78 | ||
|   | 3f7c73df80 | ||
|   | 43100f69d5 | ||
|   | d73c778b0a | ||
|   | 73c3eb0984 | ||
|   | a519cbe05d | ||
|   | b3ad9783c4 | ||
|   | c1ccb6c53e | ||
|   | 51a713fa04 | ||
|   | 74ea039458 | ||
|   | aaa6a2b6a4 | ||
|   | e32a49480b | ||
|   | be65051f9d | ||
|   | 3387bc5f20 | ||
|   | 3f0ae729d6 | ||
|   | 8e8c1c031a | ||
|   | 55678d7fee | ||
|   | a57ee8b96b | ||
|   | e367da925e | ||
|   | 77a109bb7e | ||
|   | a3e1864a26 | ||
|   | 41cbcc509c | ||
|   | 77874b432b | ||
|   | 5c4c785e60 | ||
|   | 2aed4f6d1f | ||
|   | 34152fbe54 | ||
|   | 4047fe78f3 | ||
|   | c28347122e | ||
|   | 5b38ab8cf1 | ||
|   | bb25d32f03 | ||
|   | ecaedea709 | ||
|   | f05d1b1261 | ||
|   | 6aaa3071f9 | ||
|   | c26c9352f1 | ||
|   | d9ea688145 | ||
|   | e8be6e498e | ||
|   | e4b1fac045 | ||
|   | 402ae81335 | ||
|   | 52f31e2783 | ||
|   | 739522a151 | ||
|   | 6c034209b6 | ||
|   | f6fbd5e4bb | ||
|   | 7024db1f13 | ||
|   | 23bfa42a0d | ||
|   | fdb57884ed | ||
|   | f614b95a00 | ||
|   | 8198c9ecaa | ||
|   | 086b15d10f | ||
|   | 9d6ce331a5 | ||
|   | 821c7ade26 | ||
|   | b969a1854a | ||
|   | 62535b4452 | ||
|   | c0056e5669 | ||
|   | cfe4a8fc0a | ||
|   | 95b9248a25 | ||
|   | 1004fa40f8 | ||
|   | e8ddb0ca04 | ||
|   | 36c8785f15 | ||
|   | 03a801eecc | ||
|   | 072c6630bf | ||
|   | 4f56c8f192 | ||
|   | 4fdb67128d | ||
|   | a0f9d31d94 | ||
|   | e7b08420ba | ||
|   | c1b21f7772 | ||
|   | 9850ab1d73 | ||
|   | 9950144e97 | ||
|   | f6faacabb0 | ||
|   | 19c1e3ac6f | ||
|   | afa48c2d2d | ||
|   | a4c518bf4c | ||
|   | 057135022f | ||
|   | 755e9aa0d3 | ||
|   | c9deca19ca | ||
|   | bb048c3a6d | ||
|   | 33fcd307b2 | ||
|   | a0c5572b59 | ||
|   | 2d20d2934c | ||
|   | 2c48ab66d4 | ||
|   | 104b838aed | ||
|   | 7f10931be8 | ||
|   | 07c225bc20 | ||
|   | 56f2394001 | ||
|   | fdff8bc40e | ||
|   | bee3d70998 | ||
|   | 42345ecc61 | ||
|   | 920d521d68 | ||
|   | e0b46a306a | ||
|   | 8a42a2a396 | ||
|   | 80247400a4 | ||
|   | 13c5a29ff0 | ||
|   | 1aaa6e7ab5 | ||
|   | 7f53543324 | ||
|   | 292e516297 | ||
|   | 8258fa8919 | ||
|   | 5b929f09a2 | ||
|   | 4d32f863bc | ||
|   | 4e7333b396 | ||
|   | 4c0e3b493a | ||
|   | 2de079a5d3 | ||
|   | 509e624d47 | ||
|   | ca8fcf7a1d | ||
|   | 980f69b5af | ||
|   | 0b00a9c26b | ||
|   | 1ef85c45e5 | ||
|   | 5ef5a56e69 | ||
|   | f3f4be2690 | ||
|   | 076b8c5a1f | ||
|   | 13bc78c889 | ||
|   | dc964a9d98 | ||
|   | c2b14f3db9 | ||
|   | 4b59a7c375 | ||
|   | 3a57430893 | ||
|   | bef210ae77 | ||
|   | ea005517cf | ||
|   | 3841f27aab | ||
|   | df0eb84a44 | ||
|   | 2de4255a78 | ||
|   | 3c808879c9 | ||
|   | a051e3bcca | ||
|   | eb7df036df | ||
|   | 23db59e733 | ||
|   | ac65b06a8e | ||
|   | 8891041069 | ||
|   | 70d63e3e90 | ||
|   | b2a7af2e3e | ||
|   | 23bed48c8c | ||
|   | 0926d33798 | ||
|   | 55ba2f4070 | ||
|   | d18b48aaf4 | ||
|   | 4ef0ffe3bf | ||
|   | ce77995c8f | ||
|   | ed9245c14d | ||
|   | 01e6972c96 | ||
|   | 8181ae5b17 | ||
|   | 6407190ae0 | ||
|   | 87395faac2 | ||
|   | a9e6e7883d | ||
|   | 154d6b29dd | ||
|   | 62ac569ec4 | ||
|   | 9f1a0a58ab | ||
|   | 879a2c7c80 | ||
|   | ff06c5215e | ||
|   | 135a8dce4b | ||
|   | 63bbcad918 | ||
|   | 6584d6a905 | ||
|   | 5990098ef8 | ||
|   | f3d3d6bb65 | ||
|   | b2fe7cc064 | ||
|   | 930d821dd7 | ||
|   | 5c2ff14839 | ||
|   | a3d6dc7873 | ||
|   | 53ad1c924b | ||
|   | 8c431c7d81 | ||
|   | d5dd5aac06 | ||
|   | 7d48972967 | ||
|   | 25043a4aaa | ||
|   | 7ebeef5873 | ||
|   | 9b658776ca | ||
|   | cf3553175f | ||
|   | bf8b3cf9f7 | ||
|   | 4a5600d5ce | ||
|   | f5bc49160f | ||
|   | fa5f82d312 | ||
|   | 4433696509 | ||
|   | 1f5475abc5 | ||
|   | 1407c6d264 | ||
|   | e152bfc4a3 | ||
|   | 1973c931bd | ||
|   | 458cc1c6d6 | ||
|   | 52852d11a6 | ||
|   | f94629a7fa | ||
|   | c8ee456d33 | ||
|   | 2752e98621 | 
							
								
								
									
										78
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										78
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,78 @@ | ||||
| name: build-and-release | ||||
|  | ||||
| on: push | ||||
|  | ||||
| defaults: | ||||
|   run: | ||||
|     shell: bash | ||||
|  | ||||
| jobs: | ||||
|  | ||||
|   build: | ||||
|     runs-on: ${{ matrix.os }} | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         os: [ubuntu-latest, windows-latest, macos-latest] | ||||
|         python: ["3.9"] | ||||
|     steps: | ||||
|  | ||||
|       - uses: actions/checkout@v2 | ||||
|  | ||||
|       - uses: actions/setup-python@v2 | ||||
|         with: | ||||
|           python-version: ${{ matrix.python }} | ||||
|  | ||||
|       - name: Set up project | ||||
|         if: matrix.os != 'windows-latest' | ||||
|         run: ./scripts/setup | ||||
|  | ||||
|       - name: Set up project on windows | ||||
|         if: matrix.os == 'windows-latest' | ||||
|         # For some reason, `pip install --upgrade pip` doesn't work on | ||||
|         # 'windows-latest'. The installed pip version works fine however. | ||||
|         run: ./scripts/setup --no-pip | ||||
|  | ||||
|       - name: Run checks | ||||
|         run: ./scripts/check | ||||
|  | ||||
|       - name: Build | ||||
|         run: ./scripts/build | ||||
|  | ||||
|       - name: Rename binary | ||||
|         # Glob in source location because on windows pyinstaller creates a file | ||||
|         # named "pferd.exe" | ||||
|         run: mv dist/pferd* dist/pferd-${{ matrix.os }} | ||||
|  | ||||
|       - name: Upload binary | ||||
|         uses: actions/upload-artifact@v2 | ||||
|         with: | ||||
|           name: Binaries | ||||
|           path: dist/pferd-${{ matrix.os }} | ||||
|  | ||||
|   release: | ||||
|     runs-on: ubuntu-latest | ||||
|     if: startsWith(github.ref, 'refs/tags/v') | ||||
|     needs: build | ||||
|     steps: | ||||
|  | ||||
|       - name: Download binaries | ||||
|         uses: actions/download-artifact@v2 | ||||
|         with: | ||||
|           name: Binaries | ||||
|  | ||||
|       - name: Rename binaries | ||||
|         run: | | ||||
|           mv pferd-ubuntu-latest pferd-linux | ||||
|           mv pferd-windows-latest pferd-windows.exe | ||||
|           mv pferd-macos-latest pferd-mac | ||||
|  | ||||
|       - name: Create release | ||||
|         uses: softprops/action-gh-release@v1 | ||||
|         env: | ||||
|           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||
|         with: | ||||
|           files: | | ||||
|             pferd-linux | ||||
|             pferd-windows.exe | ||||
|             pferd-mac | ||||
							
								
								
									
										21
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										21
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,13 +1,10 @@ | ||||
| # python stuff | ||||
| __pycache__/ | ||||
|  | ||||
| # venv stuff | ||||
| bin/ | ||||
| include/ | ||||
| lib/ | ||||
| lib64 | ||||
| pyvenv.cfg | ||||
|  | ||||
| .tmp/ | ||||
| pip-selfcheck.json | ||||
| .mypy_cache/ | ||||
| /.venv/ | ||||
| /PFERD.egg-info/ | ||||
| __pycache__/ | ||||
| /.vscode/ | ||||
|  | ||||
| # pyinstaller | ||||
| /pferd.spec | ||||
| /build/ | ||||
| /dist/ | ||||
|   | ||||
							
								
								
									
										164
									
								
								CHANGELOG.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										164
									
								
								CHANGELOG.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,164 @@ | ||||
| # Changelog | ||||
|  | ||||
| All notable changes to this project will be documented in this file. The format | ||||
| is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). | ||||
|  | ||||
| This project has its own custom versioning scheme. Version numbers consist of | ||||
| three parts (e. g. `3.1.5`). | ||||
| - The first number is increased on major rewrites or changes. What classifies as | ||||
|   a major change is up to the maintainers. This is pretty rare and a PFERD | ||||
|   version 4 should hopefully not be necessary. | ||||
| - The second number is increased on backwards-incompatible changes in behaviour. | ||||
|   This refers to any change that would make an existing setup behave differently | ||||
|   (e. g. renaming options or changing crawler behaviour). If this number is | ||||
|   increased, it may be necessary for you to adapt your own setup. | ||||
| - The third number is increased on backwards-compatible changes (e. g. adding | ||||
|   new options or commands, changing documentation, fixing bugs). Updates that | ||||
|   only increase this number should be safe and not require manual intervention. | ||||
|  | ||||
| We will try to correctly classify changes as backwards-compatible or | ||||
| backwards-incompatible, but may occasionally make mistakes or stumble across | ||||
| ambiguous situations. | ||||
|  | ||||
| ## Unreleased | ||||
|  | ||||
| ### Fixed | ||||
| - Forum crawling crashing when parsing empty (= 0 messages) threads | ||||
| - Forum crawling crashing when a forum has no threads at all | ||||
|  | ||||
| ## 3.4.1 - 2022-08-17 | ||||
|  | ||||
| ### Added | ||||
| - Download of page descriptions | ||||
| - Forum download support | ||||
| - `pass` authenticator | ||||
|  | ||||
| ### Changed | ||||
| - Add `cpp` extension to default `link_regex` of IPD crawler | ||||
| - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option | ||||
| - Simplify default IPD crawler `link_regex` | ||||
|  | ||||
| ### Fixed | ||||
| - IPD crawler crashes on some sites | ||||
| - Meeting name normalization for yesterday, today and tomorrow | ||||
| - Crawling of meeting file previews | ||||
| - Login with new login button html layout | ||||
| - Descriptions for courses are now placed in the correct subfolder when | ||||
|   downloading the whole desktop | ||||
|  | ||||
| ## 3.4.0 - 2022-05-01 | ||||
|  | ||||
| ### Added | ||||
| - Message when Shibboleth entitlements need to be manually reviewed | ||||
| - Links to unofficial packages and repology in the readme | ||||
|  | ||||
| ### Changed | ||||
| - Increase minimum supported Python version to 3.9 | ||||
| - Support video listings with more columns | ||||
| - Use UTF-8 when reading/writing the config file | ||||
|  | ||||
| ### Fixed | ||||
| - Crash during authentication when the Shibboleth session is still valid | ||||
|  | ||||
| ## 3.3.1 - 2022-01-15 | ||||
|  | ||||
| ### Fixed | ||||
| - ILIAS login | ||||
| - Local video cache if `windows_paths` is enabled | ||||
|  | ||||
| ## 3.3.0 - 2022-01-09 | ||||
|  | ||||
| ### Added | ||||
| - A KIT IPD crawler | ||||
| - Support for ILIAS cards | ||||
| - (Rudimentary) support for content pages | ||||
| - Support for multi-stream videos | ||||
| - Support for ILIAS 7 | ||||
|  | ||||
| ### Removed | ||||
| - [Interpolation](https://docs.python.org/3/library/configparser.html#interpolation-of-values) in config file | ||||
|  | ||||
| ### Fixed | ||||
| - Crawling of recursive courses | ||||
| - Crawling files directly placed on the personal desktop | ||||
| - Ignore timestamps at the unix epoch as they crash on windows | ||||
|  | ||||
| ## 3.2.0 - 2021-08-04 | ||||
|  | ||||
| ### Added | ||||
| - `--skip` command line option | ||||
| - Support for ILIAS booking objects | ||||
|  | ||||
| ### Changed | ||||
| - Using multiple path segments on left side of `-name->` now results in an | ||||
|   error. This was already forbidden by the documentation but silently accepted | ||||
|   by PFERD. | ||||
| - More consistent path printing in some `--explain` messages | ||||
|  | ||||
| ### Fixed | ||||
| - Nondeterministic name deduplication due to ILIAS reordering elements | ||||
| - More exceptions are handled properly | ||||
|  | ||||
| ## 3.1.0 - 2021-06-13 | ||||
|  | ||||
| If your config file doesn't do weird things with transforms, it should continue | ||||
| to work. If your `-re->` arrows behave weirdly, try replacing them with | ||||
| `-exact-re->` arrows. If you're on Windows, you might need to switch from `\` | ||||
| path separators to `/` in your regex rules. | ||||
|  | ||||
| ### Added | ||||
| - `skip` option for crawlers | ||||
| - Rules with `>>` instead of `>` as arrow head | ||||
| - `-exact-re->` arrow (behaves like `-re->` did previously) | ||||
|  | ||||
| ### Changed | ||||
| - The `-re->` arrow can now rename directories (like `-->`) | ||||
| - Use `/` instead of `\` as path separator for (regex) rules on Windows | ||||
| - Use the label to the left for exercises instead of the button name to | ||||
|   determine the folder name | ||||
|  | ||||
| ### Fixed | ||||
| - Video pagination handling in ILIAS crawler | ||||
|  | ||||
| ## 3.0.1 - 2021-06-01 | ||||
|  | ||||
| ### Added | ||||
| - `credential-file` authenticator | ||||
| - `--credential-file` option for `kit-ilias-web` command | ||||
| - Warning if using concurrent tasks with `kit-ilias-web` | ||||
|  | ||||
| ### Changed | ||||
| - Cookies are now stored in a text-based format | ||||
|  | ||||
| ### Fixed | ||||
| - Date parsing now also works correctly in non-group exercises | ||||
|  | ||||
| ## 3.0.0 - 2021-05-31 | ||||
|  | ||||
| ### Added | ||||
| - Proper config files | ||||
| - Concurrent crawling | ||||
| - Crawl external ILIAS links | ||||
| - Crawl uploaded exercise solutions | ||||
| - Explain what PFERD is doing and why (`--explain`) | ||||
| - More control over output (`--status`, `--report`) | ||||
| - Debug transform rules with `--debug-transforms` | ||||
| - Print report after exiting via Ctrl+C | ||||
| - Store crawler reports in `.report` JSON file | ||||
| - Extensive config file documentation (`CONFIG.md`) | ||||
| - Documentation for developers (`DEV.md`) | ||||
| - This changelog | ||||
|  | ||||
| ### Changed | ||||
| - Rewrote almost everything | ||||
| - Better error messages | ||||
| - Redesigned CLI | ||||
| - Redesigned transform rules | ||||
| - ILIAS crawling logic (paths may be different) | ||||
| - Better support for weird paths on Windows | ||||
| - Set user agent (`PFERD/<version>`) | ||||
|  | ||||
| ### Removed | ||||
| - Backwards compatibility with 2.x | ||||
| - Python files as config files | ||||
| - Some types of crawlers | ||||
							
								
								
									
										472
									
								
								CONFIG.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										472
									
								
								CONFIG.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,472 @@ | ||||
| # Config file format | ||||
|  | ||||
| A config file consists of sections. A section begins with a `[section]` header, | ||||
| which is followed by a list of `key = value` pairs. Comments must be on their | ||||
| own line and start with `#`. Multiline values must be indented beyond their key. | ||||
| Boolean values can be `yes` or `no`. For more details and some examples on the | ||||
| format, see the [configparser documentation][1] ([interpolation][2] is | ||||
| disabled). | ||||
|  | ||||
| [1]: <https://docs.python.org/3/library/configparser.html#supported-ini-file-structure> "Supported INI File Structure" | ||||
| [2]: <https://docs.python.org/3/library/configparser.html#interpolation-of-values> "Interpolation of values" | ||||
|  | ||||
| ## The `DEFAULT` section | ||||
|  | ||||
| This section contains global configuration values. It can also be used to set | ||||
| default values for the other sections. | ||||
|  | ||||
| - `working_dir`: The directory PFERD operates in. Set to an absolute path to | ||||
|   make PFERD operate the same regardless of where it is executed from. All other | ||||
|   paths in the config file are interpreted relative to this path. If this path | ||||
|   is relative, it is interpreted relative to the script's working dir. `~` is | ||||
|   expanded to the current user's home directory. (Default: `.`) | ||||
| - `explain`: Whether PFERD should log and explain its actions and decisions in | ||||
|   detail. (Default: `no`) | ||||
| - `status`: Whether PFERD should print status updates (like `Crawled ...`, | ||||
|   `Added ...`) while running a crawler. (Default: `yes`) | ||||
| - `report`: Whether PFERD should print a report of added, changed and deleted | ||||
|    local files for all crawlers before exiting. (Default: `yes`) | ||||
| - `share_cookies`: Whether crawlers should share cookies where applicable. For | ||||
|   example, some crawlers share cookies if they crawl the same website using the | ||||
|   same account. (Default: `yes`) | ||||
|  | ||||
| ## The `crawl:*` sections | ||||
|  | ||||
| Sections whose names start with `crawl:` are used to configure crawlers. The | ||||
| rest of the section name specifies the name of the crawler. | ||||
|  | ||||
| A crawler synchronizes a remote resource to a local directory. There are | ||||
| different types of crawlers for different kinds of resources, e.g. ILIAS | ||||
| courses or lecture websites. | ||||
|  | ||||
| Each crawl section represents an instance of a specific type of crawler. The | ||||
| `type` option is used to specify the crawler type. The crawler's name is usually | ||||
| used as the output directory. New crawlers can be created simply by adding a new | ||||
| crawl section to the config file. | ||||
|  | ||||
| Depending on a crawler's type, it may have different options. For more details, | ||||
| see the type's [documentation](#crawler-types) below. The following options are | ||||
| common to all crawlers: | ||||
|  | ||||
| - `type`: The available types are specified in [this section](#crawler-types). | ||||
| - `skip`: Whether the crawler should be skipped during normal execution. The | ||||
|   crawler can still be executed manually using the `--crawler` or `-C` flags. | ||||
|   (Default: `no`) | ||||
| - `output_dir`: The directory the crawler synchronizes files to. A crawler will | ||||
|   never place any files outside this directory. (Default: the crawler's name) | ||||
| - `redownload`: When to download a file that is already present locally. | ||||
|   (Default: `never-smart`) | ||||
|     - `never`: If a file is present locally, it is not downloaded again. | ||||
|     - `never-smart`: Like `never`, but PFERD tries to detect if an already | ||||
|       downloaded file has changed via some (unreliable) heuristics. | ||||
|     - `always`: All files are always downloaded, regardless of whether they are | ||||
|       already present locally. | ||||
|     - `always-smart`: Like `always`, but PFERD tries to avoid unnecessary | ||||
|       downloads via some (unreliable) heuristics. | ||||
| - `on_conflict`: What to do when the local and remote versions of a file or | ||||
|   directory differ, including when a file is replaced by a directory or a | ||||
|   directory by a file. (Default: `prompt`) | ||||
|     - `prompt`: Always ask the user before overwriting or deleting local files | ||||
|       and directories. | ||||
|     - `local-first`: Always keep the local file or directory. Equivalent to | ||||
|       using `prompt` and always choosing "no". Implies that `redownload` is set | ||||
|       to `never`. | ||||
|     - `remote-first`: Always keep the remote file or directory. Equivalent to | ||||
|       using `prompt` and always choosing "yes". | ||||
|     - `no-delete`: Never delete local files, but overwrite local files if the | ||||
|       remote file is different. | ||||
| - `transform`: Rules for renaming and excluding certain files and directories. | ||||
|   For more details, see [this section](#transformation-rules). (Default: empty) | ||||
| - `tasks`: The maximum number of concurrent tasks (such as crawling or | ||||
|   downloading). (Default: `1`) | ||||
| - `downloads`: How many of those tasks can be download tasks at the same time. | ||||
|   Must not be greater than `tasks`. (Default: Same as `tasks`) | ||||
| - `task_delay`: Time (in seconds) that the crawler should wait between | ||||
|   subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary | ||||
|   load for the crawl target. (Default: `0.0`) | ||||
| - `windows_paths`: Whether PFERD should find alternative names for paths that | ||||
|   are invalid on Windows. (Default: `yes` on Windows, `no` otherwise) | ||||
|  | ||||
| Some crawlers may also require credentials for authentication. To configure how | ||||
| the crawler obtains its credentials, the `auth` option is used. It is set to the | ||||
| full name of an auth section (including the `auth:` prefix). | ||||
|  | ||||
| Here is a simple example: | ||||
|  | ||||
| ```ini | ||||
| [auth:example] | ||||
| type = simple | ||||
| username = foo | ||||
| password = bar | ||||
|  | ||||
| [crawl:something] | ||||
| type = some-complex-crawler | ||||
| auth = auth:example | ||||
| on_conflict = no-delete | ||||
| tasks = 3 | ||||
| ``` | ||||
|  | ||||
| ## The `auth:*` sections | ||||
|  | ||||
| Sections whose names start with `auth:` are used to configure authenticators. An | ||||
| authenticator provides a username and a password to one or more crawlers. | ||||
|  | ||||
| Authenticators work similarly to crawlers: A section represents an authenticator | ||||
| instance whose name is the rest of the section name. The type is specified by | ||||
| the `type` option. | ||||
|  | ||||
| Depending on an authenticator's type, it may have different options. For more | ||||
| details, see the type's [documentation](#authenticator-types) below. The only | ||||
| option common to all authenticators is `type`: | ||||
|  | ||||
| - `type`: The types are specified in [this section](#authenticator-types). | ||||
|  | ||||
| ## Crawler types | ||||
|  | ||||
| ### The `local` crawler | ||||
|  | ||||
| This crawler crawls a local directory. It is really simple and mostly useful for | ||||
| testing different setups. The various delay options are meant to make the | ||||
| crawler simulate a slower, network-based crawler. | ||||
|  | ||||
| - `target`: Path to the local directory to crawl. (Required) | ||||
| - `crawl_delay`: Artificial delay (in seconds) to simulate for crawl requests. | ||||
|   (Default: `0.0`) | ||||
| - `download_delay`: Artificial delay (in seconds) to simulate for download | ||||
|   requests. (Default: `0.0`) | ||||
| - `download_speed`: Download speed (in bytes per second) to simulate. (Optional) | ||||
|  | ||||
| ### The `kit-ipd` crawler | ||||
|  | ||||
| This crawler crawls a KIT-IPD page by url. The root page can be crawled from | ||||
| outside the KIT network so you will be informed about any new/deleted files, | ||||
| but downloading files requires you to be within. Adding a short delay between | ||||
| requests is likely a good idea. | ||||
|  | ||||
| - `target`: URL to a KIT-IPD page | ||||
| - `link_regex`: A regex that is matched against the `href` part of links. If it | ||||
|   matches, the given link is downloaded as a file. This is used to extract | ||||
|   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`) | ||||
|  | ||||
| ### The `kit-ilias-web` crawler | ||||
|  | ||||
| This crawler crawls the KIT ILIAS instance. | ||||
|  | ||||
| ILIAS is not great at handling too many concurrent requests. To avoid | ||||
| unnecessary load, please limit `tasks` to `1`. | ||||
|  | ||||
| There is a spike in ILIAS usage at the beginning of lectures, so please don't | ||||
| run PFERD during those times. | ||||
|  | ||||
| If you're automatically running PFERD periodically (e. g. via cron or a systemd | ||||
| timer), please randomize the start time or at least don't use the full hour. For | ||||
| systemd timers, this can be accomplished using the `RandomizedDelaySec` option. | ||||
| Also, please schedule the script to run in periods of low activity. Running the | ||||
| script once per day should be fine. | ||||
|  | ||||
| - `target`: The ILIAS element to crawl. (Required) | ||||
|     - `desktop`: Crawl your personal desktop | ||||
|     - `<course id>`: Crawl the course with the given id | ||||
|     - `<url>`: Crawl a given element by URL (preferably the permanent URL linked | ||||
|       at the bottom of its ILIAS page) | ||||
| - `auth`: Name of auth section to use for login. (Required) | ||||
| - `tfa_auth`: Name of auth section to use for two-factor authentication. Only | ||||
|   uses the auth section's password. (Default: Anonymous `tfa` authenticator) | ||||
| - `links`: How to represent external links. (Default: `fancy`) | ||||
|     - `ignore`: Don't download links. | ||||
|     - `plaintext`: A text file containing only the URL. | ||||
|     - `fancy`: An HTML file looking like the ILIAS link element. | ||||
|     - `internet-shortcut`: An internet shortcut file (`.url` file). | ||||
| - `link_redirect_delay`: Time (in seconds) until `fancy` link files will | ||||
|   redirect to the actual URL. Set to a negative value to disable the automatic | ||||
|   redirect. (Default: `-1`) | ||||
| - `videos`: Whether to download videos. (Default: `no`) | ||||
| - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: | ||||
|   `20.0`) | ||||
|  | ||||
| ## Authenticator types | ||||
|  | ||||
| ### The `simple` authenticator | ||||
|  | ||||
| With this authenticator, the username and password can be set directly in the | ||||
| config file. If the username or password are not specified, the user is prompted | ||||
| via the terminal. | ||||
|  | ||||
| - `username`: The username. (Optional) | ||||
| - `password`: The password. (Optional) | ||||
|  | ||||
| ### The `credential-file` authenticator | ||||
|  | ||||
| This authenticator reads a username and a password from a credential file. | ||||
|  | ||||
| - `path`: Path to the credential file. (Required) | ||||
|  | ||||
| The credential file has exactly two lines (trailing newline optional). The first | ||||
| line starts with `username=` and contains the username, the second line starts | ||||
| with `password=` and contains the password. The username and password may | ||||
| contain any characters except a line break. | ||||
|  | ||||
| ``` | ||||
| username=AzureDiamond | ||||
| password=hunter2 | ||||
| ``` | ||||
|  | ||||
| ### The `keyring` authenticator | ||||
|  | ||||
| This authenticator uses the system keyring to store passwords. The username can | ||||
| be set directly in the config file. If the username is not specified, the user | ||||
| is prompted via the terminal. If the keyring contains no entry or the entry is | ||||
| incorrect, the user is prompted for a password via the terminal and the password | ||||
| is stored in the keyring. | ||||
|  | ||||
| - `username`: The username. (Optional) | ||||
| - `keyring_name`: The service name PFERD uses for storing credentials. (Default: | ||||
|   `PFERD`) | ||||
|  | ||||
| ### The `pass` authenticator | ||||
|  | ||||
| This authenticator queries the [`pass` password manager][3] for a username and | ||||
| password. It tries to be mostly compatible with [browserpass][4] and | ||||
| [passff][5], so see those links for an overview of the format. If PFERD fails | ||||
| to load your password, you can use the `--explain` flag to see why. | ||||
|  | ||||
| - `passname`: The name of the password to use (Required) | ||||
| - `username_prefixes`: A comma-separated list of username line prefixes | ||||
|   (Default: `login,username,user`) | ||||
| - `password_prefixes`: A comma-separated list of password line prefixes | ||||
|   (Default: `password,pass,secret`) | ||||
|  | ||||
| [3]: <https://www.passwordstore.org/> "Pass: The Standard Unix Password Manager" | ||||
| [4]: <https://github.com/browserpass/browserpass-extension#organizing-password-store> "Organizing password store" | ||||
| [5]: <https://github.com/passff/passff#multi-line-format> "Multi-line format" | ||||
|  | ||||
| ### The `tfa` authenticator | ||||
|  | ||||
| This authenticator prompts the user on the console for a two-factor | ||||
| authentication token. The token is provided as password and it is not cached. | ||||
| This authenticator does not support usernames. | ||||
|  | ||||
| ## Transformation rules | ||||
|  | ||||
| Transformation rules are rules for renaming and excluding files and directories. | ||||
| They are specified line-by-line in a crawler's `transform` option. When a | ||||
| crawler needs to apply a rule to a path, it goes through this list top-to-bottom | ||||
| and applies the first matching rule. | ||||
|  | ||||
| To see this process in action, you can use the `--debug-transforms` flag or the | ||||
| `--explain` flag. | ||||
|  | ||||
| Each rule has the format `SOURCE ARROW TARGET` (e. g. `foo/bar --> foo/baz`). | ||||
| The arrow specifies how the source and target are interpreted. The different | ||||
| kinds of arrows are documented below. | ||||
|  | ||||
| `SOURCE` and `TARGET` are either a bunch of characters without spaces (e. g. | ||||
| `foo/bar`) or string literals (e. g. `"foo/b a r"`). The former syntax has no | ||||
| concept of escaping characters, so the backslash is just another character. The | ||||
| string literals however support Python's escape syntax (e. g. | ||||
| `"foo\\bar\tbaz"`). This also means that in string literals, backslashes must be | ||||
| escaped. | ||||
|  | ||||
| `TARGET` can additionally be a single exclamation mark `!` (*not* `"!"`). When a | ||||
| rule with a `!` as target matches a path, the corresponding file or directory is | ||||
| ignored by the crawler instead of renamed. | ||||
|  | ||||
| `TARGET` can also be omitted entirely. When a rule without target matches a | ||||
| path, the path is returned unmodified. This is useful to prevent rules further | ||||
| down from matching instead. | ||||
|  | ||||
| Each arrow's behaviour can be modified slightly by changing the arrow's head | ||||
| from `>` to `>>`. When a rule with a `>>` arrow head matches a path, it doesn't | ||||
| return immediately like a normal arrow. Instead, it replaces the current path | ||||
| with its output and continues on to the next rule. In effect, this means that | ||||
| multiple rules can be applied sequentially. | ||||
|  | ||||
| ### The `-->` arrow | ||||
|  | ||||
| The `-->` arrow is a basic renaming operation for files and directories. If a | ||||
| path matches `SOURCE`, it is renamed to `TARGET`. | ||||
|  | ||||
| Example: `foo/bar --> baz` | ||||
| - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||
| - Converts `foo/bar` into `baz` | ||||
| - Converts `foo/bar/wargl` into `baz/wargl` | ||||
|  | ||||
| Example: `foo/bar --> !` | ||||
| - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||
| - Ignores `foo/bar` and any of its children | ||||
|  | ||||
| ### The `-name->` arrow | ||||
|  | ||||
| The `-name->` arrow lets you rename files and directories by their name, | ||||
| regardless of where they appear in the file tree. Because of this, its `SOURCE` | ||||
| must not contain multiple path segments, only a single name. This restriction | ||||
| does not apply to its `TARGET`. | ||||
|  | ||||
| Example: `foo -name-> bar/baz` | ||||
| - Doesn't match `a/foobar/b` or `x/Foo/y/z` | ||||
| - Converts `hello/foo` into `hello/bar/baz` | ||||
| - Converts `foo/world` into `bar/baz/world` | ||||
| - Converts `a/foo/b/c/foo` into `a/bar/baz/b/c/bar/baz` | ||||
|  | ||||
| Example: `foo -name-> !` | ||||
| - Doesn't match `a/foobar/b` or `x/Foo/y/z` | ||||
| - Ignores any path containing a segment `foo` | ||||
|  | ||||
| ### The `-exact->` arrow | ||||
|  | ||||
| The `-exact->` arrow requires the path to match `SOURCE` exactly. The examples | ||||
| below show why this is useful. | ||||
|  | ||||
| Example: `foo/bar -exact-> baz` | ||||
| - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||
| - Converts `foo/bar` into `baz` | ||||
| - Doesn't match `foo/bar/wargl` | ||||
|  | ||||
| Example: `foo/bar -exact-> !` | ||||
| - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||
| - Ignores only `foo/bar`, not its children | ||||
|  | ||||
| ### The `-re->` arrow | ||||
|  | ||||
| The `-re->` arrow is like the `-->` arrow but with regular expressions. `SOURCE` | ||||
| is a regular expression and `TARGET` an f-string based template. If a path | ||||
| matches `SOURCE`, the output path is created using `TARGET` as template. | ||||
| `SOURCE` is automatically anchored. | ||||
|  | ||||
| `TARGET` uses Python's [format string syntax][6]. The *n*-th capturing group can | ||||
| be referred to as `{g<n>}` (e.g. `{g3}`). `{g0}` refers to the original path. | ||||
| If capturing group *n*'s contents are a valid integer, the integer value is | ||||
| available as `{i<n>}` (e.g. `{i3}`). If capturing group *n*'s contents are a | ||||
| valid float, the float value is available as `{f<n>}` (e.g. `{f3}`). If a | ||||
| capturing group is not present (e.g. when matching the string `cd` with the | ||||
| regex `(ab)?cd`), the corresponding variables are not defined. | ||||
|  | ||||
| Python's format string syntax has rich options for formatting its arguments. For | ||||
| example, to left-pad the capturing group 3 with the digit `0` to width 5, you | ||||
| can use `{i3:05}`. | ||||
|  | ||||
| PFERD even allows you to write entire expressions inside the curly braces, for | ||||
| example `{g2.lower()}` or `{g3.replace(' ', '_')}`. | ||||
|  | ||||
| Example: `f(oo+)/be?ar -re-> B{g1.upper()}H/fear` | ||||
| - Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` | ||||
| - Converts `foo/bar` into `BOOH/fear` | ||||
| - Converts `fooooo/bear` into `BOOOOOH/fear` | ||||
| - Converts `foo/bar/baz` into `BOOH/fear/baz` | ||||
|  | ||||
| [6]: <https://docs.python.org/3/library/string.html#format-string-syntax> "Format String Syntax" | ||||
|  | ||||
| ### The `-name-re->` arrow | ||||
|  | ||||
| The `-name-re->` arrow is like a combination of the `-name->` and `-re->` arrows. | ||||
|  | ||||
| Example: `(.*)\.jpeg -name-re-> {g1}.jpg` | ||||
| - Doesn't match `foo/bar.png`, `baz.JPEG` or `hello,jpeg` | ||||
| - Converts `foo/bar.jpeg` into `foo/bar.jpg` | ||||
| - Converts `foo.jpeg/bar/baz.jpeg` into `foo.jpg/bar/baz.jpg` | ||||
|  | ||||
| Example: `\..+ -name-re-> !` | ||||
| - Doesn't match `.`, `test`, `a.b` | ||||
| - Ignores all files and directories starting with `.`. | ||||
|  | ||||
| ### The `-exact-re->` arrow | ||||
|  | ||||
| The `-exact-re->` arrow is like a combination of the `-exact->` and `-re->` | ||||
| arrows. | ||||
|  | ||||
| Example: `f(oo+)/be?ar -exact-re-> B{g1.upper()}H/fear` | ||||
| - Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` | ||||
| - Converts `foo/bar` into `BOOH/fear` | ||||
| - Converts `fooooo/bear` into `BOOOOOH/fear` | ||||
| - Doesn't match `foo/bar/baz` | ||||
|  | ||||
| ### Example: Tutorials | ||||
|  | ||||
| You have an ILIAS course with lots of tutorials, but are only interested in a | ||||
| single one. | ||||
|  | ||||
| ``` | ||||
| tutorials/ | ||||
|   |- tut_01/ | ||||
|   |- tut_02/ | ||||
|   |- tut_03/ | ||||
|   ... | ||||
| ``` | ||||
|  | ||||
| You can use a mix of normal and exact arrows to get rid of the other ones and | ||||
| move the `tutorials/tut_02/` folder to `my_tut/`: | ||||
|  | ||||
| ``` | ||||
| tutorials/tut_02 --> my_tut | ||||
| tutorials -exact-> | ||||
| tutorials --> ! | ||||
| ``` | ||||
|  | ||||
| The second rule is required for many crawlers since they use the rules to decide | ||||
| which directories to crawl. If it was missing when the crawler looks at | ||||
| `tutorials/`, the third rule would match. This means the crawler would not crawl | ||||
| the `tutorials/` directory and thus not discover that `tutorials/tut_02/` exists. | ||||
|  | ||||
| Since the second rule is only relevant for crawling, the `TARGET` is left out. | ||||
|  | ||||
| ### Example: Lecture slides | ||||
|  | ||||
| You have a course with slides like `Lecture 3: Linear functions.PDF` and you | ||||
| would like to rename them to `03_linear_functions.pdf`. | ||||
|  | ||||
| ``` | ||||
| Lectures/ | ||||
|   |- Lecture 1: Introduction.PDF | ||||
|   |- Lecture 2: Vectors and matrices.PDF | ||||
|   |- Lecture 3: Linear functions.PDF | ||||
|   ... | ||||
| ``` | ||||
|  | ||||
| To do this, you can use the most powerful of arrows: The regex arrow. | ||||
|  | ||||
| ``` | ||||
| "Lectures/Lecture (\\d+): (.*)\\.PDF" -re-> "Lectures/{i1:02}_{g2.lower().replace(' ', '_')}.pdf" | ||||
| ``` | ||||
|  | ||||
| Note the escaped backslashes on the `SOURCE` side. | ||||
|  | ||||
| ### Example: Crawl a Python project | ||||
|  | ||||
| You are crawling a Python project and want to ignore all hidden files (files | ||||
| whose name starts with a `.`), all `__pycache__` directories and all markdown | ||||
| files (for some weird reason). | ||||
|  | ||||
| ``` | ||||
| .gitignore | ||||
| .mypy_cache/ | ||||
| .venv/ | ||||
| CONFIG.md | ||||
| PFERD/ | ||||
|   |- __init__.py | ||||
|   |- __main__.py | ||||
|   |- __pycache__/ | ||||
|   |- authenticator.py | ||||
|   |- config.py | ||||
|   ... | ||||
| README.md | ||||
| ... | ||||
| ``` | ||||
|  | ||||
| For this task, the name arrows can be used. | ||||
|  | ||||
| ``` | ||||
| \..*        -name-re-> ! | ||||
| __pycache__ -name->    ! | ||||
| .*\.md      -name-re-> ! | ||||
| ``` | ||||
|  | ||||
| ### Example: Clean up names | ||||
|  | ||||
| You want to convert all paths into lowercase and replace spaces with underscores | ||||
| before applying any rules. This can be achieved using the `>>` arrow heads. | ||||
|  | ||||
| ``` | ||||
| (.*) -re->> "{g1.lower().replace(' ', '_')}" | ||||
|  | ||||
| <other rules go here> | ||||
| ``` | ||||
							
								
								
									
										89
									
								
								DEV.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										89
									
								
								DEV.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,89 @@ | ||||
| # PFERD Development Guide | ||||
|  | ||||
| PFERD is packaged following the [Python Packaging User Guide][ppug] (in | ||||
| particular [this][ppug-1] and [this][ppug-2] guide). | ||||
|  | ||||
| [ppug]: <https://packaging.python.org/> "Python Packaging User Guide" | ||||
| [ppug-1]: <https://packaging.python.org/tutorials/packaging-projects/> "Packaging Python Projects" | ||||
| [ppug-2]: <https://packaging.python.org/guides/distributing-packages-using-setuptools/> "Packaging and distributing projects" | ||||
|  | ||||
| ## Setting up a dev environment | ||||
|  | ||||
| The use of [venv][venv] is recommended. To initially set up a development | ||||
| environment, run these commands in the same directory as this file: | ||||
|  | ||||
| ``` | ||||
| $ python -m venv .venv | ||||
| $ . .venv/bin/activate | ||||
| $ ./scripts/setup | ||||
| ``` | ||||
|  | ||||
| The setup script installs a few required dependencies and tools. It also | ||||
| installs PFERD via `pip install --editable .`, which means that you can just run | ||||
| `pferd` as if it was installed normally. Since PFERD was installed with | ||||
| `--editable`, there is no need to re-run `pip install` when the source code is | ||||
| changed. | ||||
|  | ||||
| If you get any errors because pip can't update itself, try running | ||||
| `./scripts/setup --no-pip` instead of `./scripts/setup`. | ||||
|  | ||||
| For more details, see [this part of the Python Tutorial][venv-tut] and | ||||
| [this section on "development mode"][ppug-dev]. | ||||
|  | ||||
| [venv]: <https://docs.python.org/3/library/venv.html> "venv - Creation of virtual environments" | ||||
| [venv-tut]: <https://docs.python.org/3/tutorial/venv.html> "12. Virtual Environments and Packages" | ||||
| [ppug-dev]: <https://packaging.python.org/guides/distributing-packages-using-setuptools/#working-in-development-mode> "Working in “development mode”" | ||||
|  | ||||
| ## Checking and formatting the code | ||||
|  | ||||
| To run a set of checks against the code, run `./scripts/check` in the repo's | ||||
| root directory. This script will run a few tools installed by `./scripts/setup` | ||||
| against the entire project. | ||||
|  | ||||
| To format the code, run `./scripts/format` in the repo's root directory. | ||||
|  | ||||
| Before committing changes, please make sure the checks return no warnings and | ||||
| the code is formatted. | ||||
|  | ||||
| ## Contributing | ||||
|  | ||||
| When submitting a PR that adds, changes or removes a feature, please ensure | ||||
| that the corresponding documentation is updated as well. Also, please ensure | ||||
| that `./scripts/check` returns no warnings and the code has been run through | ||||
| `./scripts/format`. | ||||
|  | ||||
| In your first PR, please add your name to the `LICENSE` file. | ||||
|  | ||||
| ## Releasing a new version | ||||
|  | ||||
| This section describes the steps required to release a new version of PFERD. | ||||
| Usually, they don't need to be performed manually and `scripts/bump-version` can be | ||||
| used instead. | ||||
|  | ||||
| 1. Update the version number in `PFERD/version.py` | ||||
| 2. Update `CHANGELOG.md` | ||||
| 3. Commit changes to `master` with message `Bump version to <version>` (e. g. `Bump version to 3.2.5`) | ||||
| 4. Create annotated tag named `v<version>` (e. g. `v3.2.5`) | ||||
|     - Copy changes from changelog | ||||
|     - Remove `#` symbols (which git would interpret as comments) | ||||
|     - As the first line, add `Version <version> - <date>` (e. g. `Version 3.2.5 - 2021-05-24`) | ||||
|     - Leave the second line empty | ||||
| 5. Fast-forward `latest` to `master` | ||||
| 6. Push `master`, `latest` and the new tag | ||||
|  | ||||
| Example tag annotation: | ||||
| ``` | ||||
| Version 3.2.5 - 2021-05-24 | ||||
|  | ||||
| Added | ||||
| - Support for concurrent downloads | ||||
| - Support for proper config files | ||||
| - This changelog | ||||
|  | ||||
| Changed | ||||
| - Rewrote almost everything | ||||
| - Redesigned CLI | ||||
|  | ||||
| Removed | ||||
| - Backwards compatibility with 2.x | ||||
| ``` | ||||
							
								
								
									
										19
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | ||||
| Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, | ||||
|                     TheChristophe, Scriptim, thelukasprobst, Toorero | ||||
|  | ||||
| Permission is hereby granted, free of charge, to any person obtaining a copy of | ||||
| this software and associated documentation files (the "Software"), to deal in | ||||
| the Software without restriction, including without limitation the rights to | ||||
| use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of | ||||
| the Software, and to permit persons to whom the Software is furnished to do so, | ||||
| subject to the following conditions: | ||||
|  | ||||
| The above copyright notice and this permission notice shall be included in all | ||||
| copies or substantial portions of the Software. | ||||
|  | ||||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | ||||
| FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR | ||||
| COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER | ||||
| IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||||
| CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||||
| @@ -1,37 +0,0 @@ | ||||
| import logging | ||||
|  | ||||
| from .ffm import * | ||||
| from .ilias import * | ||||
| from .norbert import * | ||||
| from .ti import * | ||||
| from .utils import * | ||||
|  | ||||
# Public names of this module, followed by the public names of each of the
# star-imported submodules.
__all__ = [
    "STYLE",
    "FORMAT",
    "DATE_FORMAT",
    "FORMATTER",
    "enable_logging",
    *ffm.__all__,
    *ilias.__all__,
    *norbert.__all__,
    *ti.__all__,
    *utils.__all__,
]

# Settings for the shared log formatter: "{"-style format strings, with the
# level name left-aligned in a column of width 7.
STYLE = "{"
FORMAT = "[{levelname:<7}] {message}"
DATE_FORMAT = "%F %T"

FORMATTER = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT, style=STYLE)
|  | ||||
def enable_logging(name="PFERD", level=logging.INFO):
    """
    Attach a stream handler that uses FORMATTER to the logger called `name`.

    The logger's level is set to `level` and propagation to the root logger is
    switched off. Calling this function repeatedly for the same logger does
    not stack up additional handlers, so each message is emitted only once.
    """

    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Install our handler only once. A second handler on the same logger would
    # print every message twice.
    if not any(h.formatter is FORMATTER for h in logger.handlers):
        handler = logging.StreamHandler()
        handler.setFormatter(FORMATTER)
        logger.addHandler(handler)

    # This should be logged by our own handler, and not the root logger's
    # default handler, so we don't pass it on to the root logger.
    logger.propagate = False
|   | ||||
							
								
								
									
										172
									
								
								PFERD/__main__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										172
									
								
								PFERD/__main__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,172 @@ | ||||
| import argparse | ||||
| import asyncio | ||||
| import configparser | ||||
| import os | ||||
| import sys | ||||
| from pathlib import Path | ||||
|  | ||||
| from PFERD.update import check_for_updates | ||||
|  | ||||
| from .auth import AuthLoadError | ||||
| from .cli import PARSER, ParserLoadError, load_default_section | ||||
| from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError | ||||
| from .logging import log | ||||
| from .pferd import Pferd, PferdLoadError | ||||
| from .transformer import RuleParseError | ||||
|  | ||||
|  | ||||
def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser:
    """
    Create the ConfigParser that the Config is built from.

    Without a CLI command, the parser is filled from the config file at
    `args.config`. With a CLI command, the command itself fills the parser
    from its arguments. In both cases the default section is loaded from the
    CLI arguments afterwards.
    """
    log.explain_topic("Loading config")
    parser = configparser.ConfigParser(interpolation=None)

    command = args.command
    if command is not None:
        log.explain("CLI command specified, loading config from its arguments")
        if command:
            command(args, parser)
    else:
        log.explain("No CLI command specified, loading config from file")
        Config.load_parser(parser, path=args.config)

    load_default_section(args, parser)

    return parser
|  | ||||
|  | ||||
def load_config(args: argparse.Namespace) -> Config:
    """
    Build the Config for this invocation, or terminate the process.

    A ConfigLoadError or ParserLoadError is reported via `log` and ends the
    process with exit status 1. For a ConfigLoadError, its `reason` is logged
    as well.
    """
    try:
        parser = load_config_parser(args)
        return Config(parser)
    except (ConfigLoadError, ParserLoadError) as err:
        log.error(str(err))
        if isinstance(err, ConfigLoadError):
            log.error_contd(err.reason)
        sys.exit(1)
|  | ||||
|  | ||||
def configure_logging_from_args(args: argparse.Namespace) -> None:
    """
    Apply the logging switches given on the command line to `log`.

    Only switches that were actually specified (i.e. are not None) are
    applied. When the config is dumped to stdout, all three outputs are
    turned off regardless.
    """
    for option in ("explain", "status", "report"):
        value = getattr(args, option)
        if value is not None:
            setattr(log, f"output_{option}", value)

    # Anything else printed alongside a config dumped to stdout would make
    # the output an invalid config file, so silence everything in that case.
    if args.dump_config_to == "-":
        log.output_explain = log.output_status = log.output_report = False
|  | ||||
|  | ||||
def configure_logging_from_config(args: argparse.Namespace, config: Config) -> None:
    """
    Apply the config's logging settings where the CLI did not specify any.

    A ConfigOptionError raised while reading the default section is reported
    via `log` and ends the process with exit status 1.
    """
    # When the config is dumped to stdout, configure_logging_from_args() has
    # already switched off all normal logging. The config file must not undo
    # that decision.
    if args.dump_config_to == "-":
        return

    try:
        for option in ("explain", "status", "report"):
            if getattr(args, option) is None:
                read_option = getattr(config.default_section, option)
                setattr(log, f"output_{option}", read_option())
    except ConfigOptionError as err:
        log.error(str(err))
        sys.exit(1)
|  | ||||
|  | ||||
def dump_config(args: argparse.Namespace, config: Config) -> None:
    """
    Dump the config as requested by --dump-config or --dump-config-to.

    `--dump-config` calls `config.dump()` without a path, a target of "-"
    dumps to stdout, and any other target is treated as a file path.
    Specifying both options at once, or a ConfigDumpError during the dump, is
    reported via `log` and ends the process with exit status 1.
    """
    log.explain_topic("Dumping config")

    use_default = args.dump_config
    target = args.dump_config_to

    if use_default and target is not None:
        log.error("--dump-config and --dump-config-to can't be specified at the same time")
        sys.exit(1)

    try:
        if use_default:
            config.dump()
        elif target == "-":
            config.dump_to_stdout()
        else:
            config.dump(Path(target))
    except ConfigDumpError as err:
        log.error(str(err))
        log.error_contd(err.reason)
        sys.exit(1)
|  | ||||
|  | ||||
def main() -> None:
    """
    Entry point of the command line interface.

    Parses the CLI arguments, loads the config, optionally dumps it and exits,
    and otherwise runs the update check (unless skipped) followed by
    `Pferd.run()`. Errors handled here are reported via `log` and end the
    process with exit status 1.
    """
    args = PARSER.parse_args()

    # Configuring logging happens in two stages because CLI args have
    # precedence over config file options and loading the config already
    # produces some kinds of log messages (usually only explain()-s).
    configure_logging_from_args(args)

    config = load_config(args)

    # Now, after loading the config file, we can apply its logging settings in
    # all places that were not already covered by CLI args.
    configure_logging_from_config(args, config)

    if args.dump_config or args.dump_config_to is not None:
        dump_config(args, config)
        sys.exit()

    try:
        pferd = Pferd(config, args.crawler, args.skip)
    except PferdLoadError as e:
        log.unlock()
        log.error(str(e))
        sys.exit(1)

    try:
        # The update check is independent of the platform, so it is decided
        # here once instead of inside only one of the branches below.
        log.explain_topic("Checking for updates")
        run_update_check = not args.skip_update_check
        if not run_update_check:
            log.explain("Update check skipped due to configuration option")

        if os.name == "nt":
            # A "workaround" for the windows event loop somehow crashing after
            # asyncio.run() completes. See:
            # https://bugs.python.org/issue39232
            # https://github.com/encode/httpx/issues/914#issuecomment-780023632
            # TODO Fix this properly
            #
            # The loop is created explicitly because relying on
            # asyncio.get_event_loop() to create one implicitly is deprecated.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            if run_update_check:
                loop.run_until_complete(check_for_updates())
            loop.run_until_complete(pferd.run(args.debug_transforms))
            loop.run_until_complete(asyncio.sleep(1))
            loop.close()
        else:
            if run_update_check:
                asyncio.run(check_for_updates())
            asyncio.run(pferd.run(args.debug_transforms))
    except (ConfigOptionError, AuthLoadError) as e:
        log.unlock()
        log.error(str(e))
        sys.exit(1)
    except RuleParseError as e:
        log.unlock()
        e.pretty_print()
        sys.exit(1)
    except KeyboardInterrupt:
        log.unlock()
        log.explain_topic("Interrupted, exiting immediately")
        log.explain("Open files and connections are left for the OS to clean up")
        pferd.print_report()
        # TODO Clean up tmp files
        # And when those files *do* actually get cleaned up properly,
        # reconsider if this should really exit with 1
        sys.exit(1)
    except Exception:
        # Top-level boundary: report anything unexpected instead of letting a
        # raw traceback escape, but still print the report gathered so far.
        log.unlock()
        log.unexpected_exception()
        pferd.print_report()
        sys.exit(1)
    else:
        pferd.print_report()


if __name__ == "__main__":
    main()
							
								
								
									
										29
									
								
								PFERD/auth/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								PFERD/auth/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| from configparser import SectionProxy | ||||
| from typing import Callable, Dict | ||||
|  | ||||
| from ..config import Config | ||||
| from .authenticator import Authenticator, AuthError, AuthLoadError, AuthSection  # noqa: F401 | ||||
| from .credential_file import CredentialFileAuthenticator, CredentialFileAuthSection | ||||
| from .keyring import KeyringAuthenticator, KeyringAuthSection | ||||
| from .pass_ import PassAuthenticator, PassAuthSection | ||||
| from .simple import SimpleAuthenticator, SimpleAuthSection | ||||
| from .tfa import TfaAuthenticator | ||||
|  | ||||
# Signature shared by every factory stored in AUTHENTICATORS.
AuthConstructor = Callable[
    [
        str,           # Name (without the "auth:" prefix)
        SectionProxy,  # Authenticator's section of global config
        Config,        # Global config
    ],
    Authenticator,
]

# Factories for the available authenticators, keyed by authenticator type
# name. Each factory takes the full AuthConstructor argument list and passes
# on only what the respective authenticator's constructor takes.
AUTHENTICATORS: Dict[str, AuthConstructor] = {
    "credential-file": lambda name, section, config: CredentialFileAuthenticator(
        name, CredentialFileAuthSection(section), config
    ),
    "keyring": lambda name, section, config: KeyringAuthenticator(
        name, KeyringAuthSection(section)
    ),
    "pass": lambda name, section, config: PassAuthenticator(
        name, PassAuthSection(section)
    ),
    "simple": lambda name, section, config: SimpleAuthenticator(
        name, SimpleAuthSection(section)
    ),
    "tfa": lambda name, section, config: TfaAuthenticator(name),
}
							
								
								
									
										80
									
								
								PFERD/auth/authenticator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								PFERD/auth/authenticator.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | ||||
| from abc import ABC, abstractmethod | ||||
| from typing import Tuple | ||||
|  | ||||
| from ..config import Section | ||||
|  | ||||
|  | ||||
| class AuthLoadError(Exception): | ||||
|     """Raised when an authenticator cannot be set up, e.g. by its constructor.""" | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class AuthError(Exception): | ||||
|     """Raised when credentials cannot be obtained or are reported as invalid.""" | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class AuthSection(Section): | ||||
|     """Accessors for the options of an authenticator's config section.""" | ||||
|  | ||||
|     def type(self) -> str: | ||||
|         """Return the value of the section's "type" option.""" | ||||
|         value = self.s.get("type") | ||||
|         if value is None: | ||||
|             # NOTE(review): presumably missing_value() raises and never | ||||
|             # returns - confirm against Section in ..config | ||||
|             self.missing_value("type") | ||||
|         return value | ||||
|  | ||||
|  | ||||
| class Authenticator(ABC): | ||||
|     """ | ||||
|     Base class for everything that can supply a username and a password. | ||||
|  | ||||
|     Subclasses must implement credentials(). Unless a subclass overrides | ||||
|     them, all invalidate_* methods raise an AuthError. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, name: str) -> None: | ||||
|         """ | ||||
|         Initialize an authenticator from its name. | ||||
|  | ||||
|         If you are writing your own constructor for your own authenticator, | ||||
|         make sure to call this constructor first (via super().__init__). | ||||
|  | ||||
|         Constructors of subclasses may throw an AuthLoadError. | ||||
|         """ | ||||
|  | ||||
|         self.name = name | ||||
|  | ||||
|     @abstractmethod | ||||
|     async def credentials(self) -> Tuple[str, str]: | ||||
|         """Return the credentials as a (username, password) tuple.""" | ||||
|         pass | ||||
|  | ||||
|     async def username(self) -> str: | ||||
|         """Return only the username part of credentials().""" | ||||
|         username, _ = await self.credentials() | ||||
|         return username | ||||
|  | ||||
|     async def password(self) -> str: | ||||
|         """Return only the password part of credentials().""" | ||||
|         _, password = await self.credentials() | ||||
|         return password | ||||
|  | ||||
|     def invalidate_credentials(self) -> None: | ||||
|         """ | ||||
|         Tell the authenticator that some or all of its credentials are invalid. | ||||
|  | ||||
|         Authenticators should override this function if they have a way to | ||||
|         deal with this issue that is likely to result in valid credentials | ||||
|         (e. g. prompting the user). | ||||
|  | ||||
|         The default implementation always raises an AuthError. | ||||
|         """ | ||||
|  | ||||
|         raise AuthError("Invalid credentials") | ||||
|  | ||||
|     def invalidate_username(self) -> None: | ||||
|         """ | ||||
|         Tell the authenticator that specifically its username is invalid. | ||||
|  | ||||
|         Authenticators should override this function if they have a way to | ||||
|         deal with this issue that is likely to result in valid credentials | ||||
|         (e. g. prompting the user). | ||||
|  | ||||
|         The default implementation always raises an AuthError. | ||||
|         """ | ||||
|  | ||||
|         raise AuthError("Invalid username") | ||||
|  | ||||
|     def invalidate_password(self) -> None: | ||||
|         """ | ||||
|         Tell the authenticator that specifically its password is invalid. | ||||
|  | ||||
|         Authenticators should override this function if they have a way to | ||||
|         deal with this issue that is likely to result in valid credentials | ||||
|         (e. g. prompting the user). | ||||
|  | ||||
|         The default implementation always raises an AuthError. | ||||
|         """ | ||||
|  | ||||
|         raise AuthError("Invalid password") | ||||
							
								
								
									
										46
									
								
								PFERD/auth/credential_file.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								PFERD/auth/credential_file.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,46 @@ | ||||
| from pathlib import Path | ||||
| from typing import Tuple | ||||
|  | ||||
| from ..config import Config | ||||
| from ..utils import fmt_real_path | ||||
| from .authenticator import Authenticator, AuthLoadError, AuthSection | ||||
|  | ||||
|  | ||||
| class CredentialFileAuthSection(AuthSection): | ||||
|     """Config section of a "credential-file" authenticator.""" | ||||
|  | ||||
|     def path(self) -> Path: | ||||
|         """Return the section's "path" option as a Path.""" | ||||
|         value = self.s.get("path") | ||||
|         if value is None: | ||||
|             # NOTE(review): presumably missing_value() raises and never | ||||
|             # returns - confirm against Section in ..config | ||||
|             self.missing_value("path") | ||||
|         return Path(value) | ||||
|  | ||||
|  | ||||
| class CredentialFileAuthenticator(Authenticator): | ||||
|     def __init__(self, name: str, section: CredentialFileAuthSection, config: Config) -> None: | ||||
|         super().__init__(name) | ||||
|  | ||||
|         path = config.default_section.working_dir() / section.path() | ||||
|         try: | ||||
|             with open(path, encoding="utf-8") as f: | ||||
|                 lines = list(f) | ||||
|         except UnicodeDecodeError: | ||||
|             raise AuthLoadError(f"Credential file at {fmt_real_path(path)} is not encoded using UTF-8") | ||||
|         except OSError as e: | ||||
|             raise AuthLoadError(f"No credential file at {fmt_real_path(path)}") from e | ||||
|  | ||||
|         if len(lines) != 2: | ||||
|             raise AuthLoadError("Credential file must be two lines long") | ||||
|         [uline, pline] = lines | ||||
|         uline = uline[:-1]  # Remove trailing newline | ||||
|         if pline.endswith("\n"): | ||||
|             pline = pline[:-1] | ||||
|  | ||||
|         if not uline.startswith("username="): | ||||
|             raise AuthLoadError("First line must start with 'username='") | ||||
|         if not pline.startswith("password="): | ||||
|             raise AuthLoadError("Second line must start with 'password='") | ||||
|  | ||||
|         self._username = uline[9:] | ||||
|         self._password = pline[9:] | ||||
|  | ||||
|     async def credentials(self) -> Tuple[str, str]: | ||||
|         return self._username, self._password | ||||
							
								
								
									
										65
									
								
								PFERD/auth/keyring.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										65
									
								
								PFERD/auth/keyring.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,65 @@ | ||||
| from typing import Optional, Tuple | ||||
|  | ||||
| import keyring | ||||
|  | ||||
| from ..logging import log | ||||
| from ..utils import agetpass, ainput | ||||
| from ..version import NAME | ||||
| from .authenticator import Authenticator, AuthError, AuthSection | ||||
|  | ||||
|  | ||||
| class KeyringAuthSection(AuthSection): | ||||
|     """Config section of a "keyring" authenticator.""" | ||||
|  | ||||
|     def username(self) -> Optional[str]: | ||||
|         """Return the "username" option, or None if it is not set.""" | ||||
|         return self.s.get("username") | ||||
|  | ||||
|     def keyring_name(self) -> str: | ||||
|         """Return the "keyring_name" option, falling back to NAME.""" | ||||
|         return self.s.get("keyring_name", fallback=NAME) | ||||
|  | ||||
|  | ||||
| class KeyringAuthenticator(Authenticator): | ||||
|  | ||||
|     def __init__(self, name: str, section: KeyringAuthSection) -> None: | ||||
|         super().__init__(name) | ||||
|  | ||||
|         self._username = section.username() | ||||
|         self._password: Optional[str] = None | ||||
|         self._keyring_name = section.keyring_name() | ||||
|  | ||||
|         self._password_invalidated = False | ||||
|         self._username_fixed = section.username() is not None | ||||
|  | ||||
|     async def credentials(self) -> Tuple[str, str]: | ||||
|         # Request the username | ||||
|         if self._username is None: | ||||
|             async with log.exclusive_output(): | ||||
|                 self._username = await ainput("Username: ") | ||||
|  | ||||
|         # First try looking it up in the keyring. | ||||
|         # Do not look it up if it was invalidated - we want to re-prompt in this case | ||||
|         if self._password is None and not self._password_invalidated: | ||||
|             self._password = keyring.get_password(self._keyring_name, self._username) | ||||
|  | ||||
|         # If that fails it wasn't saved in the keyring - we need to | ||||
|         # read it from the user and store it | ||||
|         if self._password is None: | ||||
|             async with log.exclusive_output(): | ||||
|                 self._password = await agetpass("Password: ") | ||||
|                 keyring.set_password(self._keyring_name, self._username, self._password) | ||||
|  | ||||
|         self._password_invalidated = False | ||||
|         return self._username, self._password | ||||
|  | ||||
|     def invalidate_credentials(self) -> None: | ||||
|         if not self._username_fixed: | ||||
|             self.invalidate_username() | ||||
|         self.invalidate_password() | ||||
|  | ||||
|     def invalidate_username(self) -> None: | ||||
|         if self._username_fixed: | ||||
|             raise AuthError("Configured username is invalid") | ||||
|         else: | ||||
|             self._username = None | ||||
|  | ||||
|     def invalidate_password(self) -> None: | ||||
|         self._password = None | ||||
|         self._password_invalidated = True | ||||
							
								
								
									
										98
									
								
								PFERD/auth/pass_.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								PFERD/auth/pass_.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,98 @@ | ||||
| import re | ||||
| import subprocess | ||||
| from typing import List, Tuple | ||||
|  | ||||
| from ..logging import log | ||||
| from .authenticator import Authenticator, AuthError, AuthSection | ||||
|  | ||||
|  | ||||
| class PassAuthSection(AuthSection): | ||||
|     def passname(self) -> str: | ||||
|         if (value := self.s.get("passname")) is None: | ||||
|             self.missing_value("passname") | ||||
|         return value | ||||
|  | ||||
|     def username_prefixes(self) -> List[str]: | ||||
|         value = self.s.get("username_prefixes", "login,username,user") | ||||
|         return [prefix.lower() for prefix in value.split(",")] | ||||
|  | ||||
|     def password_prefixes(self) -> List[str]: | ||||
|         value = self.s.get("password_prefixes", "password,pass,secret") | ||||
|         return [prefix.lower() for prefix in value.split(",")] | ||||
|  | ||||
|  | ||||
| class PassAuthenticator(Authenticator): | ||||
|     PREFIXED_LINE_RE = r"([a-zA-Z]+):\s?(.*)"  # to be used with fullmatch | ||||
|  | ||||
|     def __init__(self, name: str, section: PassAuthSection) -> None: | ||||
|         super().__init__(name) | ||||
|  | ||||
|         self._passname = section.passname() | ||||
|         self._username_prefixes = section.username_prefixes() | ||||
|         self._password_prefixes = section.password_prefixes() | ||||
|  | ||||
|     async def credentials(self) -> Tuple[str, str]: | ||||
|         log.explain_topic("Obtaining credentials from pass") | ||||
|  | ||||
|         try: | ||||
|             log.explain(f"Calling 'pass show {self._passname}'") | ||||
|             result = subprocess.check_output(["pass", "show", self._passname], text=True) | ||||
|         except subprocess.CalledProcessError as e: | ||||
|             raise AuthError(f"Failed to get password info from {self._passname}: {e}") | ||||
|  | ||||
|         prefixed = {} | ||||
|         unprefixed = [] | ||||
|         for line in result.strip().splitlines(): | ||||
|             if match := re.fullmatch(self.PREFIXED_LINE_RE, line): | ||||
|                 prefix = match.group(1).lower() | ||||
|                 value = match.group(2) | ||||
|                 log.explain(f"Found prefixed line {line!r} with prefix {prefix!r}, value {value!r}") | ||||
|                 if prefix in prefixed: | ||||
|                     raise AuthError(f"Prefix {prefix} specified multiple times") | ||||
|                 prefixed[prefix] = value | ||||
|             else: | ||||
|                 log.explain(f"Found unprefixed line {line!r}") | ||||
|                 unprefixed.append(line) | ||||
|  | ||||
|         username = None | ||||
|         for prefix in self._username_prefixes: | ||||
|             log.explain(f"Looking for username at prefix {prefix!r}") | ||||
|             if prefix in prefixed: | ||||
|                 username = prefixed[prefix] | ||||
|                 log.explain(f"Found username {username!r}") | ||||
|                 break | ||||
|  | ||||
|         password = None | ||||
|         for prefix in self._password_prefixes: | ||||
|             log.explain(f"Looking for password at prefix {prefix!r}") | ||||
|             if prefix in prefixed: | ||||
|                 password = prefixed[prefix] | ||||
|                 log.explain(f"Found password {password!r}") | ||||
|                 break | ||||
|  | ||||
|         if password is None and username is None: | ||||
|             log.explain("No username and password found so far") | ||||
|             log.explain("Using first unprefixed line as password") | ||||
|             log.explain("Using second unprefixed line as username") | ||||
|         elif password is None: | ||||
|             log.explain("No password found so far") | ||||
|             log.explain("Using first unprefixed line as password") | ||||
|         elif username is None: | ||||
|             log.explain("No username found so far") | ||||
|             log.explain("Using first unprefixed line as username") | ||||
|  | ||||
|         if password is None: | ||||
|             if not unprefixed: | ||||
|                 log.explain("Not enough unprefixed lines left") | ||||
|                 raise AuthError("Password could not be determined") | ||||
|             password = unprefixed.pop(0) | ||||
|             log.explain(f"Found password {password!r}") | ||||
|  | ||||
|         if username is None: | ||||
|             if not unprefixed: | ||||
|                 log.explain("Not enough unprefixed lines left") | ||||
|                 raise AuthError("Username could not be determined") | ||||
|             username = unprefixed.pop(0) | ||||
|             log.explain(f"Found username {username!r}") | ||||
|  | ||||
|         return username, password | ||||
							
								
								
									
										62
									
								
								PFERD/auth/simple.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								PFERD/auth/simple.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,62 @@ | ||||
| from typing import Optional, Tuple | ||||
|  | ||||
| from ..logging import log | ||||
| from ..utils import agetpass, ainput | ||||
| from .authenticator import Authenticator, AuthError, AuthSection | ||||
|  | ||||
|  | ||||
| class SimpleAuthSection(AuthSection): | ||||
|     """Config section of a "simple" authenticator.""" | ||||
|  | ||||
|     def username(self) -> Optional[str]: | ||||
|         """Return the "username" option, or None if it is not set.""" | ||||
|         return self.s.get("username") | ||||
|  | ||||
|     def password(self) -> Optional[str]: | ||||
|         """Return the "password" option, or None if it is not set.""" | ||||
|         return self.s.get("password") | ||||
|  | ||||
|  | ||||
| class SimpleAuthenticator(Authenticator): | ||||
|     def __init__(self, name: str, section: SimpleAuthSection) -> None: | ||||
|         super().__init__(name) | ||||
|  | ||||
|         self._username = section.username() | ||||
|         self._password = section.password() | ||||
|  | ||||
|         self._username_fixed = self.username is not None | ||||
|         self._password_fixed = self.password is not None | ||||
|  | ||||
|     async def credentials(self) -> Tuple[str, str]: | ||||
|         if self._username is not None and self._password is not None: | ||||
|             return self._username, self._password | ||||
|  | ||||
|         async with log.exclusive_output(): | ||||
|             if self._username is None: | ||||
|                 self._username = await ainput("Username: ") | ||||
|             else: | ||||
|                 print(f"Username: {self._username}") | ||||
|  | ||||
|             if self._password is None: | ||||
|                 self._password = await agetpass("Password: ") | ||||
|  | ||||
|             # Intentionally returned inside the context manager so we know | ||||
|             # they're both not None | ||||
|             return self._username, self._password | ||||
|  | ||||
|     def invalidate_credentials(self) -> None: | ||||
|         if self._username_fixed and self._password_fixed: | ||||
|             raise AuthError("Configured credentials are invalid") | ||||
|  | ||||
|         if not self._username_fixed: | ||||
|             self._username = None | ||||
|         if not self._password_fixed: | ||||
|             self._password = None | ||||
|  | ||||
|     def invalidate_username(self) -> None: | ||||
|         if self._username_fixed: | ||||
|             raise AuthError("Configured username is invalid") | ||||
|         else: | ||||
|             self._username = None | ||||
|  | ||||
|     def invalidate_password(self) -> None: | ||||
|         if self._password_fixed: | ||||
|             raise AuthError("Configured password is invalid") | ||||
|         else: | ||||
|             self._password = None | ||||
							
								
								
									
										30
									
								
								PFERD/auth/tfa.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								PFERD/auth/tfa.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | ||||
| from typing import Tuple | ||||
|  | ||||
| from ..logging import log | ||||
| from ..utils import ainput | ||||
| from .authenticator import Authenticator, AuthError | ||||
|  | ||||
|  | ||||
| class TfaAuthenticator(Authenticator): | ||||
|     """ | ||||
|     Authenticator that asks the user for a TFA code whenever password() is | ||||
|     awaited. It has no notion of a username. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, name: str) -> None: | ||||
|         super().__init__(name) | ||||
|  | ||||
|     async def username(self) -> str: | ||||
|         """Always raises an AuthError.""" | ||||
|         raise AuthError("TFA authenticator does not support usernames") | ||||
|  | ||||
|     async def password(self) -> str: | ||||
|         """Prompt the user for a TFA code and return it.""" | ||||
|         async with log.exclusive_output(): | ||||
|             code = await ainput("TFA code: ") | ||||
|             return code | ||||
|  | ||||
|     async def credentials(self) -> Tuple[str, str]: | ||||
|         """Always raises an AuthError since there is no username to return.""" | ||||
|         raise AuthError("TFA authenticator does not support usernames") | ||||
|  | ||||
|     def invalidate_username(self) -> None: | ||||
|         """Always raises an AuthError.""" | ||||
|         raise AuthError("TFA authenticator does not support usernames") | ||||
|  | ||||
|     def invalidate_password(self) -> None: | ||||
|         # No code is stored, so there is nothing to forget: the next | ||||
|         # password() call prompts again anyways. | ||||
|         pass | ||||
|  | ||||
|     def invalidate_credentials(self) -> None: | ||||
|         # See invalidate_password: nothing is stored | ||||
|         pass | ||||
							
								
								
									
										13
									
								
								PFERD/cli/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								PFERD/cli/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,13 @@ | ||||
| # isort: skip_file | ||||
|  | ||||
| # The order of imports matters because each command module registers itself | ||||
| # with the parser from ".parser" and the import order affects the order in | ||||
| # which they appear in the help. Because of this, isort is disabled for this | ||||
| # file. Also, since we're reexporting or just using the side effect of | ||||
| # importing itself, we get a few linting warnings, which we're disabling as | ||||
| # well. | ||||
|  | ||||
| from . import command_local  # noqa: F401 imported but unused | ||||
| from . import command_kit_ilias_web  # noqa: F401 imported but unused | ||||
| from . import command_kit_ipd  # noqa: F401 imported but unused | ||||
| from .parser import PARSER, ParserLoadError, load_default_section  # noqa: F401 imported but unused | ||||
							
								
								
									
										120
									
								
								PFERD/cli/command_kit_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										120
									
								
								PFERD/cli/command_kit_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,120 @@ | ||||
| import argparse | ||||
| import configparser | ||||
| from pathlib import Path | ||||
|  | ||||
| from ..crawl.ilias.file_templates import Links | ||||
| from ..logging import log | ||||
| from .parser import (CRAWLER_PARSER, SUBPARSERS, BooleanOptionalAction, ParserLoadError, load_crawler, | ||||
|                      show_value_error) | ||||
|  | ||||
| SUBPARSER = SUBPARSERS.add_parser( | ||||
|     "kit-ilias-web", | ||||
|     parents=[CRAWLER_PARSER], | ||||
| ) | ||||
|  | ||||
| GROUP = SUBPARSER.add_argument_group( | ||||
|     title="kit-ilias-web crawler arguments", | ||||
|     description="arguments for the 'kit-ilias-web' crawler", | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "target", | ||||
|     type=str, | ||||
|     metavar="TARGET", | ||||
|     help="course id, 'desktop', or ILIAS URL to crawl" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "output", | ||||
|     type=Path, | ||||
|     metavar="OUTPUT", | ||||
|     help="output directory" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--username", "-u", | ||||
|     type=str, | ||||
|     metavar="USERNAME", | ||||
|     help="user name for authentication" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--keyring", | ||||
|     action=BooleanOptionalAction, | ||||
|     help="use the system keyring to store and retrieve passwords" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--credential-file", | ||||
|     type=Path, | ||||
|     metavar="PATH", | ||||
|     help="read username and password from a credential file" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--links", | ||||
|     type=show_value_error(Links.from_string), | ||||
|     metavar="OPTION", | ||||
|     help="how to represent external links" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--link-redirect-delay", | ||||
|     type=int, | ||||
|     metavar="SECONDS", | ||||
|     help="time before 'fancy' links redirect to to their target (-1 to disable)" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--videos", | ||||
|     action=BooleanOptionalAction, | ||||
|     help="crawl and download videos" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--forums", | ||||
|     action=BooleanOptionalAction, | ||||
|     help="crawl and download forum posts" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--http-timeout", "-t", | ||||
|     type=float, | ||||
|     metavar="SECONDS", | ||||
|     help="timeout for all HTTP requests" | ||||
| ) | ||||
|  | ||||
|  | ||||
| def load( | ||||
|         args: argparse.Namespace, | ||||
|         parser: configparser.ConfigParser, | ||||
| ) -> None: | ||||
|     log.explain("Creating config for command 'kit-ilias-web'") | ||||
|  | ||||
|     parser["crawl:ilias"] = {} | ||||
|     section = parser["crawl:ilias"] | ||||
|     load_crawler(args, section) | ||||
|  | ||||
|     section["type"] = "kit-ilias-web" | ||||
|     section["target"] = str(args.target) | ||||
|     section["output_dir"] = str(args.output) | ||||
|     section["auth"] = "auth:ilias" | ||||
|     if args.links is not None: | ||||
|         section["links"] = str(args.links.value) | ||||
|     if args.link_redirect_delay is not None: | ||||
|         section["link_redirect_delay"] = str(args.link_redirect_delay) | ||||
|     if args.videos is not None: | ||||
|         section["videos"] = "yes" if args.videos else "no" | ||||
|     if args.forums is not None: | ||||
|         section["forums"] = "yes" if args.forums else "no" | ||||
|     if args.http_timeout is not None: | ||||
|         section["http_timeout"] = str(args.http_timeout) | ||||
|  | ||||
|     parser["auth:ilias"] = {} | ||||
|     auth_section = parser["auth:ilias"] | ||||
|     if args.credential_file is not None: | ||||
|         if args.username is not None: | ||||
|             raise ParserLoadError("--credential-file and --username can't be used together") | ||||
|         if args.keyring: | ||||
|             raise ParserLoadError("--credential-file and --keyring can't be used together") | ||||
|         auth_section["type"] = "credential-file" | ||||
|         auth_section["path"] = str(args.credential_file) | ||||
|     elif args.keyring: | ||||
|         auth_section["type"] = "keyring" | ||||
|     else: | ||||
|         auth_section["type"] = "simple" | ||||
|     if args.username is not None: | ||||
|         auth_section["username"] = args.username | ||||
|  | ||||
|  | ||||
| SUBPARSER.set_defaults(command=load) | ||||
							
								
								
									
										54
									
								
								PFERD/cli/command_kit_ipd.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								PFERD/cli/command_kit_ipd.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,54 @@ | ||||
| import argparse | ||||
| import configparser | ||||
| from pathlib import Path | ||||
|  | ||||
| from ..logging import log | ||||
| from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||
|  | ||||
| SUBPARSER = SUBPARSERS.add_parser( | ||||
|     "kit-ipd", | ||||
|     parents=[CRAWLER_PARSER], | ||||
| ) | ||||
|  | ||||
| GROUP = SUBPARSER.add_argument_group( | ||||
|     title="kit ipd crawler arguments", | ||||
|     description="arguments for the 'kit-ipd' crawler", | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--link-regex", | ||||
|     type=str, | ||||
|     metavar="REGEX", | ||||
|     help="href-matching regex to identify downloadable files" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "target", | ||||
|     type=str, | ||||
|     metavar="TARGET", | ||||
|     help="url to crawl" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "output", | ||||
|     type=Path, | ||||
|     metavar="OUTPUT", | ||||
|     help="output directory" | ||||
| ) | ||||
|  | ||||
|  | ||||
| def load( | ||||
|         args: argparse.Namespace, | ||||
|         parser: configparser.ConfigParser, | ||||
| ) -> None: | ||||
|     log.explain("Creating config for command 'kit-ipd'") | ||||
|  | ||||
|     parser["crawl:kit-ipd"] = {} | ||||
|     section = parser["crawl:kit-ipd"] | ||||
|     load_crawler(args, section) | ||||
|  | ||||
|     section["type"] = "kit-ipd" | ||||
|     section["target"] = str(args.target) | ||||
|     section["output_dir"] = str(args.output) | ||||
|     if args.link_regex: | ||||
|         section["link_regex"] = str(args.link_regex) | ||||
|  | ||||
|  | ||||
| SUBPARSER.set_defaults(command=load) | ||||
							
								
								
									
										70
									
								
								PFERD/cli/command_local.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										70
									
								
								PFERD/cli/command_local.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,70 @@ | ||||
| import argparse | ||||
| import configparser | ||||
| from pathlib import Path | ||||
|  | ||||
| from ..logging import log | ||||
| from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||
|  | ||||
| SUBPARSER = SUBPARSERS.add_parser( | ||||
|     "local", | ||||
|     parents=[CRAWLER_PARSER], | ||||
| ) | ||||
|  | ||||
| GROUP = SUBPARSER.add_argument_group( | ||||
|     title="local crawler arguments", | ||||
|     description="arguments for the 'local' crawler", | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "target", | ||||
|     type=Path, | ||||
|     metavar="TARGET", | ||||
|     help="directory to crawl" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "output", | ||||
|     type=Path, | ||||
|     metavar="OUTPUT", | ||||
|     help="output directory" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--crawl-delay", | ||||
|     type=float, | ||||
|     metavar="SECONDS", | ||||
|     help="artificial delay to simulate for crawl requests" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--download-delay", | ||||
|     type=float, | ||||
|     metavar="SECONDS", | ||||
|     help="artificial delay to simulate for download requests" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--download-speed", | ||||
|     type=int, | ||||
|     metavar="BYTES_PER_SECOND", | ||||
|     help="download speed to simulate" | ||||
| ) | ||||
|  | ||||
|  | ||||
| def load( | ||||
|         args: argparse.Namespace, | ||||
|         parser: configparser.ConfigParser, | ||||
| ) -> None: | ||||
|     log.explain("Creating config for command 'local'") | ||||
|  | ||||
|     parser["crawl:local"] = {} | ||||
|     section = parser["crawl:local"] | ||||
|     load_crawler(args, section) | ||||
|  | ||||
|     section["type"] = "local" | ||||
|     section["target"] = str(args.target) | ||||
|     section["output_dir"] = str(args.output) | ||||
|     if args.crawl_delay is not None: | ||||
|         section["crawl_delay"] = str(args.crawl_delay) | ||||
|     if args.download_delay is not None: | ||||
|         section["download_delay"] = str(args.download_delay) | ||||
|     if args.download_speed is not None: | ||||
|         section["download_speed"] = str(args.download_speed) | ||||
|  | ||||
|  | ||||
| SUBPARSER.set_defaults(command=load) | ||||
							
								
								
									
										243
									
								
								PFERD/cli/parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										243
									
								
								PFERD/cli/parser.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,243 @@ | ||||
| import argparse | ||||
| import configparser | ||||
| from argparse import ArgumentTypeError | ||||
| from pathlib import Path | ||||
| from typing import Any, Callable, List, Optional, Sequence, Union | ||||
|  | ||||
| from ..output_dir import OnConflict, Redownload | ||||
| from ..version import NAME, VERSION | ||||
|  | ||||
|  | ||||
| class ParserLoadError(Exception): | ||||
|     """Raised when command line arguments can't be converted into a config.""" | ||||
|     pass | ||||
|  | ||||
|  | ||||
| # TODO Replace with argparse version when updating to 3.9? | ||||
| class BooleanOptionalAction(argparse.Action): | ||||
|     """ | ||||
|     Argparse action that turns a single "--name" option into the pair | ||||
|     "--name" and "--no-name", storing True or False respectively. | ||||
|     """ | ||||
|  | ||||
|     def __init__( | ||||
|             self, | ||||
|             option_strings: List[str], | ||||
|             dest: Any, | ||||
|             default: Any = None, | ||||
|             type: Any = None, | ||||
|             choices: Any = None, | ||||
|             required: Any = False, | ||||
|             help: Any = None, | ||||
|             metavar: Any = None, | ||||
|     ): | ||||
|         """ | ||||
|         Raises a ValueError unless exactly one option string is given that | ||||
|         starts with "--" but not with "--no-". | ||||
|         """ | ||||
|         if len(option_strings) != 1: | ||||
|             raise ValueError("There must be exactly one option string") | ||||
|         [self.name] = option_strings | ||||
|         if not self.name.startswith("--"): | ||||
|             raise ValueError(f"{self.name!r} doesn't start with '--'") | ||||
|         if self.name.startswith("--no-"): | ||||
|             raise ValueError(f"{self.name!r} starts with '--no-'") | ||||
|  | ||||
|         # Register the negated variant alongside the original option | ||||
|         options = [self.name, "--no-" + self.name[2:]] | ||||
|  | ||||
|         super().__init__( | ||||
|             options, | ||||
|             dest, | ||||
|             nargs=0,  # Neither variant takes a value | ||||
|             default=default, | ||||
|             type=type, | ||||
|             choices=choices, | ||||
|             required=required, | ||||
|             help=help, | ||||
|             metavar=metavar, | ||||
|         ) | ||||
|  | ||||
|     def __call__( | ||||
|             self, | ||||
|             parser: argparse.ArgumentParser, | ||||
|             namespace: argparse.Namespace, | ||||
|             values: Union[str, Sequence[Any], None], | ||||
|             option_string: Optional[str] = None, | ||||
|     ) -> None: | ||||
|         """Store False for the "--no-" variant and True otherwise.""" | ||||
|         if option_string and option_string in self.option_strings: | ||||
|             value = not option_string.startswith("--no-") | ||||
|             setattr(namespace, self.dest, value) | ||||
|  | ||||
|     def format_usage(self) -> str: | ||||
|         """Return the combined "--[no-]name" spelling of the option.""" | ||||
|         return "--[no-]" + self.name[2:] | ||||
|  | ||||
|  | ||||
def show_value_error(inner: Callable[[str], Any]) -> Callable[[str], Any]:
    """
    Wrap a validation function so its ValueErrors become ArgumentTypeErrors.

    Some validation functions (like the from_string in our enums) raise a ValueError.
    Argparse only shows the exception's own message for ArgumentTypeErrors though,
    so we need to wrap our ValueErrors.

    The returned callable passes its argument to inner and returns inner's
    result unchanged. A ValueError raised by inner is re-raised as an
    ArgumentTypeError carrying the same message, with the original error
    attached as its cause. Other exceptions propagate untouched.
    """
    def wrapper(value: str) -> Any:
        try:
            return inner(value)
        except ValueError as e:
            # Chain explicitly so the original error stays visible in tracebacks
            raise ArgumentTypeError(str(e)) from e
    return wrapper
|  | ||||
|  | ||||
# Parser holding the options described as "common to all crawlers". It is
# created without its own -h/--help (add_help=False); presumably it is meant
# to be used as a parent of the crawler subparsers - verify against callers.
#
# None of the arguments below declares a default, so an option that was not
# given on the command line ends up as None in the namespace. load_crawler
# relies on this to tell "not specified" apart from an explicit value.
CRAWLER_PARSER = argparse.ArgumentParser(add_help=False)
CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group(
    title="general crawler arguments",
    description="arguments common to all crawlers",
)
# Enum-valued options: from_string raises ValueError on unknown input, which
# show_value_error turns into an ArgumentTypeError.
CRAWLER_PARSER_GROUP.add_argument(
    "--redownload", "-r",
    type=show_value_error(Redownload.from_string),
    metavar="OPTION",
    help="when to download a file that's already present locally"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--on-conflict",
    type=show_value_error(OnConflict.from_string),
    metavar="OPTION",
    help="what to do when local and remote files or directories differ"
)
# action="append": every occurrence adds one rule to a list
CRAWLER_PARSER_GROUP.add_argument(
    "--transform", "-T",
    action="append",
    type=str,
    metavar="RULE",
    help="add a single transformation rule. Can be specified multiple times"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--tasks", "-n",
    type=int,
    metavar="N",
    help="maximum number of concurrent tasks (crawling, downloading)"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--downloads", "-N",
    type=int,
    metavar="N",
    help="maximum number of tasks that may download data at the same time"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--task-delay", "-d",
    type=float,
    metavar="SECONDS",
    help="time the crawler should wait between subsequent tasks"
)
# Registers both --windows-paths and --no-windows-paths
CRAWLER_PARSER_GROUP.add_argument(
    "--windows-paths",
    action=BooleanOptionalAction,
    help="whether to repair invalid paths on windows"
)
|  | ||||
|  | ||||
def load_crawler(
        args: argparse.Namespace,
        section: configparser.SectionProxy,
) -> None:
    """
    Copy the general crawler options from the parsed arguments into a config
    section.

    Only options that are not None are written; everything else in the
    section is left as it is. Values are stored as strings: enum options via
    their .value, transform rules as one multi-line value, numbers via str()
    and the windows_paths flag as "yes" or "no".
    """
    # Enum-valued options
    for key in ("redownload", "on_conflict"):
        choice = getattr(args, key)
        if choice is not None:
            section[key] = choice.value

    # Each rule goes on its own line, starting with a leading newline
    rules = args.transform
    if rules is not None:
        section["transform"] = "\n" + "\n".join(rules)

    # Numeric options
    for key in ("tasks", "downloads", "task_delay"):
        number = getattr(args, key)
        if number is not None:
            section[key] = str(number)

    repair_paths = args.windows_paths
    if repair_paths is not None:
        section["windows_paths"] = "yes" if repair_paths else "no"
|  | ||||
|  | ||||
# Top-level command line parser. "command" defaults to None here; a subparser
# can override it through its own set_defaults call.
PARSER = argparse.ArgumentParser()
PARSER.set_defaults(command=None)
PARSER.add_argument(
    "--version",
    action="version",
    version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
)
PARSER.add_argument(
    "--skip-update-check",
    action="store_true",
    help="disable automatic update checks at startup"
)
PARSER.add_argument(
    "--config", "-c",
    type=Path,
    metavar="PATH",
    help="custom config file"
)
PARSER.add_argument(
    "--dump-config",
    action="store_true",
    help="dump current configuration to the default config path and exit"
)
# Kept as a plain string (no type=Path) since '-' is a special value here
PARSER.add_argument(
    "--dump-config-to",
    metavar="PATH",
    help="dump current configuration to a file and exit."
    " Use '-' as path to print to stdout instead"
)
PARSER.add_argument(
    "--debug-transforms",
    action="store_true",
    help="apply transform rules to files of previous run"
)
# --crawler and --skip collect names into lists (action="append") and stay
# None when not given
PARSER.add_argument(
    "--crawler", "-C",
    action="append",
    type=str,
    metavar="NAME",
    help="only execute a single crawler."
    " Can be specified multiple times to execute multiple crawlers"
)
PARSER.add_argument(
    "--skip", "-S",
    action="append",
    type=str,
    metavar="NAME",
    help="don't execute this particular crawler."
    " Can be specified multiple times to skip multiple crawlers"
)
# The options below are the ones load_default_section copies into the
# config's default section. They declare no default, so they are None unless
# given explicitly.
PARSER.add_argument(
    "--working-dir",
    type=Path,
    metavar="PATH",
    help="custom working directory"
)
PARSER.add_argument(
    "--explain",
    action=BooleanOptionalAction,
    help="log and explain in detail what PFERD is doing"
)
PARSER.add_argument(
    "--status",
    action=BooleanOptionalAction,
    help="print status updates while PFERD is crawling"
)
PARSER.add_argument(
    "--report",
    action=BooleanOptionalAction,
    help="print a report of all local changes before exiting"
)
PARSER.add_argument(
    "--share-cookies",
    action=BooleanOptionalAction,
    help="whether crawlers should share cookies where applicable"
)
|  | ||||
|  | ||||
def load_default_section(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """
    Copy the global options from the parsed arguments into the default
    section of the config parser.

    Options that are None are skipped. The working directory is stored via
    str(), the boolean flags as "yes" or "no".
    """
    section = parser[parser.default_section]

    working_dir = args.working_dir
    if working_dir is not None:
        section["working_dir"] = str(working_dir)

    for key in ("explain", "status", "report", "share_cookies"):
        flag = getattr(args, key)
        if flag is not None:
            section[key] = "yes" if flag else "no"
|  | ||||
|  | ||||
# Subparser collection of the top-level parser, listed under the "crawlers"
# heading in the help output.
SUBPARSERS = PARSER.add_subparsers(title="crawlers")
							
								
								
									
										190
									
								
								PFERD/config.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										190
									
								
								PFERD/config.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,190 @@ | ||||
| import asyncio | ||||
| import os | ||||
| import sys | ||||
| from configparser import ConfigParser, SectionProxy | ||||
| from pathlib import Path | ||||
| from typing import Any, List, NoReturn, Optional, Tuple | ||||
|  | ||||
| from rich.markup import escape | ||||
|  | ||||
| from .logging import log | ||||
| from .utils import fmt_real_path, prompt_yes_no | ||||
|  | ||||
|  | ||||
class ConfigLoadError(Exception):
    """
    Something went wrong while loading the config from a file.

    The exception message only names the file; the path and the reason are
    available as the attributes of the same name.
    """

    def __init__(self, path: Path, reason: str):
        message = f"Failed to load config from {fmt_real_path(path)}"
        super().__init__(message)
        self.path, self.reason = path, reason
|  | ||||
|  | ||||
class ConfigOptionError(Exception):
    """
    An option in the config file has an invalid or missing value.

    The section name, the key and the description are kept as attributes in
    addition to being part of the message.
    """

    def __init__(self, section: str, key: str, desc: str):
        message = f"Section {section!r}, key {key!r}: {desc}"
        super().__init__(message)
        self.section, self.key, self.desc = section, key, desc
|  | ||||
|  | ||||
class ConfigDumpError(Exception):
    """
    Something went wrong while dumping the config to a file.

    The exception message only names the file; the path and the reason are
    available as the attributes of the same name.
    """

    def __init__(self, path: Path, reason: str):
        message = f"Failed to dump config to {fmt_real_path(path)}"
        super().__init__(message)
        self.path, self.reason = path, reason
|  | ||||
|  | ||||
class Section:
    """
    Base class for the crawler and auth section classes.

    Wraps a config section (available as self.s) and offers helpers that
    report problems with individual keys by raising a ConfigOptionError.
    """

    def __init__(self, section: SectionProxy):
        self.s = section

    def error(self, key: str, desc: str) -> NoReturn:
        """
        Raise a ConfigOptionError for the given key of this section.
        """
        raise ConfigOptionError(self.s.name, key, desc)

    def invalid_value(
            self,
            key: str,
            value: Any,
            reason: Optional[str],
    ) -> NoReturn:
        """
        Report an invalid value for a key, appending the reason if there is one.
        """
        message = f"Invalid value {value!r}"
        if reason is not None:
            message = f"{message}: {reason}"
        self.error(key, message)

    def missing_value(self, key: str) -> NoReturn:
        """
        Report that a required key has no value.
        """
        self.error(key, "Missing value")
|  | ||||
|  | ||||
class DefaultSection(Section):
    """
    Typed accessors for the options of the config's default section.
    """

    def _flag(self, key: str, default: bool) -> bool:
        # Boolean option lookup with a fallback for absent keys
        return self.s.getboolean(key, fallback=default)

    def working_dir(self) -> Path:
        """
        The configured working directory with "~" expanded, "." if unset.
        """
        # TODO Change to working dir instead of manually prepending it to paths
        return Path(self.s.get("working_dir", ".")).expanduser()

    def explain(self) -> bool:
        """Value of the "explain" option, False if unset."""
        return self._flag("explain", False)

    def status(self) -> bool:
        """Value of the "status" option, True if unset."""
        return self._flag("status", True)

    def report(self) -> bool:
        """Value of the "report" option, True if unset."""
        return self._flag("report", True)

    def share_cookies(self) -> bool:
        """Value of the "share_cookies" option, True if unset."""
        return self._flag("share_cookies", True)
|  | ||||
|  | ||||
class Config:
    """
    Wraps a ConfigParser and provides loading, dumping and section lookup.
    """

    @staticmethod
    def _default_path() -> Path:
        """
        Return the default config file location, which depends on os.name.
        """
        known_locations = {
            "posix": "~/.config/PFERD/pferd.cfg",
            "nt": "~/AppData/Roaming/PFERD/pferd.cfg",
        }
        location = known_locations.get(os.name, "~/.pferd.cfg")
        return Path(location).expanduser()

    def __init__(self, parser: ConfigParser):
        self._parser = parser
        self._default_section = DefaultSection(parser[parser.default_section])

    @property
    def default_section(self) -> DefaultSection:
        return self._default_section

    @staticmethod
    def load_parser(parser: ConfigParser, path: Optional[Path] = None) -> None:
        """
        Read the config file at path (or at the default path if no path is
        given) into the parser.

        May throw a ConfigLoadError.
        """

        if not path:
            log.explain("Using default path")
            path = Config._default_path()
        else:
            log.explain("Path specified on CLI")
        log.explain(f"Loading {fmt_real_path(path)}")

        # Using config.read_file instead of config.read because config.read
        # would just ignore a missing file and carry on.
        try:
            with open(path, encoding="utf-8") as config_file:
                parser.read_file(config_file, source=str(path))
        except FileNotFoundError:
            raise ConfigLoadError(path, "File does not exist")
        except IsADirectoryError:
            raise ConfigLoadError(path, "That's a directory, not a file")
        except PermissionError:
            raise ConfigLoadError(path, "Insufficient permissions")
        except UnicodeDecodeError:
            raise ConfigLoadError(path, "File is not encoded using UTF-8")

    def _write_to(self, path: Path, mode: str) -> None:
        # Serialize the parser into the file at path, opened with the given mode
        with open(path, mode, encoding="utf-8") as config_file:
            self._parser.write(config_file)

    def dump(self, path: Optional[Path] = None) -> None:
        """
        Write the config to path (or to the default path if no path is given),
        creating parent directories as needed. An existing file is only
        overwritten after the user confirmed it.

        May throw a ConfigDumpError.
        """

        if not path:
            log.explain("Using default path")
            path = self._default_path()
        else:
            log.explain("Using custom path")

        log.explain(f"Dumping to {fmt_real_path(path)}")
        log.print(f"[bold bright_cyan]Dumping[/] to {escape(fmt_real_path(path))}")

        try:
            path.parent.mkdir(parents=True, exist_ok=True)
        except PermissionError:
            raise ConfigDumpError(path, "Could not create parent directory")

        try:
            # Ensuring we don't accidentally overwrite any existing files by
            # always asking before overwriting a file.
            try:
                # x = open for exclusive creation, failing if the file already
                # exists
                self._write_to(path, "x")
            except FileExistsError:
                print("That file already exists.")
                if not asyncio.run(prompt_yes_no("Overwrite it?", default=False)):
                    raise ConfigDumpError(path, "File already exists")
                self._write_to(path, "w")
        except IsADirectoryError:
            raise ConfigDumpError(path, "That's a directory, not a file")
        except PermissionError:
            raise ConfigDumpError(path, "Insufficient permissions")

    def dump_to_stdout(self) -> None:
        """
        Write the config to stdout.
        """
        self._parser.write(sys.stdout)

    def _sections_with_prefix(self, prefix: str) -> List[Tuple[str, SectionProxy]]:
        # All (name, section) pairs whose name starts with prefix, in parser order
        return [
            (name, proxy)
            for name, proxy in self._parser.items()
            if name.startswith(prefix)
        ]

    def crawl_sections(self) -> List[Tuple[str, SectionProxy]]:
        """
        Return all sections whose name starts with "crawl:".
        """
        return self._sections_with_prefix("crawl:")

    def auth_sections(self) -> List[Tuple[str, SectionProxy]]:
        """
        Return all sections whose name starts with "auth:".
        """
        return self._sections_with_prefix("auth:")
							
								
								
									
										25
									
								
								PFERD/crawl/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								PFERD/crawl/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | ||||
| from configparser import SectionProxy | ||||
| from typing import Callable, Dict | ||||
|  | ||||
| from ..auth import Authenticator | ||||
| from ..config import Config | ||||
| from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401 | ||||
| from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection | ||||
| from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection | ||||
| from .local_crawler import LocalCrawler, LocalCrawlerSection | ||||
|  | ||||
# Signature shared by all entries of CRAWLERS: a callable that builds a
# crawler from its name, its config section, the global config and the
# loaded authenticators.
CrawlerConstructor = Callable[[
    str,                       # Name (without the "crawl:" prefix)
    SectionProxy,              # Crawler's section of global config
    Config,                    # Global config
    Dict[str, Authenticator],  # Loaded authenticators by name
], Crawler]

# Maps a crawler type name to its constructor (presumably the value of a
# section's "type" option - verify against the caller). Each lambda wraps the
# raw section in the matching section class. Only the kit-ilias-web crawler
# receives the authenticators; the other two ignore that argument.
CRAWLERS: Dict[str, CrawlerConstructor] = {
    "local": lambda n, s, c, a:
        LocalCrawler(n, LocalCrawlerSection(s), c),
    "kit-ilias-web": lambda n, s, c, a:
        KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
    "kit-ipd": lambda n, s, c, a:
        KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
}
							
								
								
									
										369
									
								
								PFERD/crawl/crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										369
									
								
								PFERD/crawl/crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,369 @@ | ||||
| import asyncio | ||||
| import os | ||||
| from abc import ABC, abstractmethod | ||||
| from collections.abc import Awaitable, Coroutine | ||||
| from datetime import datetime | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar | ||||
|  | ||||
| from ..auth import Authenticator | ||||
| from ..config import Config, Section | ||||
| from ..deduplicator import Deduplicator | ||||
| from ..limiter import Limiter | ||||
| from ..logging import ProgressBar, log | ||||
| from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload | ||||
| from ..report import MarkConflictError, MarkDuplicateError, Report | ||||
| from ..transformer import Transformer | ||||
| from ..utils import ReusableAsyncContextManager, fmt_path | ||||
|  | ||||
|  | ||||
class CrawlWarning(Exception):
    """
    A noncritical problem that occurred while crawling.

    The @noncritical and @anoncritical decorators catch this exception and
    log it as a warning instead of letting it propagate.
    """
|  | ||||
|  | ||||
class CrawlError(Exception):
    """
    An error that occurred while crawling.

    Unlike CrawlWarning, the @noncritical and @anoncritical decorators do not
    swallow this exception: it is recorded and then re-raised.
    """
|  | ||||
|  | ||||
Wrapped = TypeVar("Wrapped", bound=Callable[..., None])


def noncritical(f: Wrapped) -> Wrapped:
    """
    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    CrawlWarning, OutputDirError, MarkDuplicateError and MarkConflictError are
    added to the crawler's report as warnings, logged and then swallowed. Any
    other Exception is added to the report as an error and re-raised.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    The wrapper keeps the name and docstring of the wrapped function and
    always returns None.

    Warning: Must only be applied to member functions of the Crawler class!
    Calling the wrapper with anything but a Crawler as first argument raises
    a RuntimeError.
    """
    # Local import so this block does not depend on the module's import list
    from functools import wraps

    @wraps(f)
    def wrapper(*args: Any, **kwargs: Any) -> None:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@noncritical must only be applied to Crawler methods")

        crawler = args[0]

        try:
            f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            crawler.report.add_warning(str(e))
            log.warn(str(e))
            crawler.error_free = False
        except Exception as e:
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

    return wrapper  # type: ignore
|  | ||||
|  | ||||
AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]])


def anoncritical(f: AWrapped) -> AWrapped:
    """
    An async version of @noncritical.

    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    CrawlWarning, OutputDirError, MarkDuplicateError and MarkConflictError are
    logged, added to the crawler's report as warnings and then swallowed, in
    which case the wrapper returns None. Any other Exception is added to the
    report as an error and re-raised. Without an exception, the wrapped
    coroutine's result is returned.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    The wrapper keeps the name and docstring of the wrapped function.

    Warning: Must only be applied to member functions of the Crawler class!
    Calling the wrapper with anything but a Crawler as first argument raises
    a RuntimeError.
    """
    # Local import so this block does not depend on the module's import list
    from functools import wraps

    @wraps(f)
    async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@anoncritical must only be applied to Crawler methods")

        crawler = args[0]

        try:
            return await f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            log.warn(str(e))
            crawler.error_free = False
            crawler.report.add_warning(str(e))
        except Exception as e:
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

        return None

    return wrapper  # type: ignore
|  | ||||
|  | ||||
class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
    """
    Async context manager handed out by Crawler.crawl for a path that should
    be crawled. Entering it yields the progress bar for that path.
    """

    def __init__(self, limiter: Limiter, path: PurePath):
        super().__init__()

        self._limiter = limiter
        self._path = path

    @property
    def path(self) -> PurePath:
        return self._path

    async def _on_aenter(self) -> ProgressBar:
        # The order of these three statements matters. NOTE(review):
        # self._stack comes from ReusableAsyncContextManager and looks like an
        # (Async)ExitStack - if so, exit handlers run in reverse order, i.e.
        # the bar is closed first, then the crawl limit is released and the
        # "Crawled" status line is printed last. Confirm against the base class.
        self._stack.callback(lambda: log.status("[bold cyan]", "Crawled", fmt_path(self._path)))
        await self._stack.enter_async_context(self._limiter.limit_crawl())
        bar = self._stack.enter_context(log.crawl_bar("[bold bright_cyan]", "Crawling", fmt_path(self._path)))

        return bar
|  | ||||
|  | ||||
class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
    """
    Async context manager handed out by Crawler.download for a file that
    should be downloaded. Entering it yields the progress bar for the path
    and the sink obtained from the file sink token.
    """

    def __init__(self, limiter: Limiter, fs_token: FileSinkToken, path: PurePath):
        super().__init__()

        self._limiter = limiter
        self._fs_token = fs_token
        self._path = path

    @property
    def path(self) -> PurePath:
        return self._path

    async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
        # Acquire the download limit before entering the file sink token and
        # creating the bar. NOTE(review): self._stack comes from
        # ReusableAsyncContextManager and looks like an (Async)ExitStack - if
        # so, these are released in reverse order on exit. Confirm against
        # the base class.
        await self._stack.enter_async_context(self._limiter.limit_download())
        sink = await self._stack.enter_async_context(self._fs_token)
        # The "Downloaded ..." message is printed in the output dir, not here
        bar = self._stack.enter_context(log.download_bar("[bold bright_cyan]", "Downloading",
                                                         fmt_path(self._path)))

        return bar, sink
|  | ||||
|  | ||||
class CrawlerSection(Section):
    """
    Typed and validated accessors for the options of a crawler's config
    section. Invalid or missing values are reported through the helpers
    inherited from Section.
    """

    def type(self) -> str:
        """The required "type" option."""
        crawler_type = self.s.get("type")
        if crawler_type is not None:
            return crawler_type
        self.missing_value("type")

    def skip(self) -> bool:
        """The "skip" option, False if unset."""
        return self.s.getboolean("skip", fallback=False)

    def output_dir(self, name: str) -> Path:
        """
        The "output_dir" option with "~" expanded. Falls back to the given
        name with a leading "crawl:" removed.
        """
        # TODO Use removeprefix() after switching to 3.9
        prefix = "crawl:"
        fallback = name[len(prefix):] if name.startswith(prefix) else name
        return Path(self.s.get("output_dir", fallback)).expanduser()

    def redownload(self) -> Redownload:
        """The "redownload" option, parsed from "never-smart" if unset."""
        raw = self.s.get("redownload", "never-smart")
        try:
            return Redownload.from_string(raw)
        except ValueError as e:
            self.invalid_value("redownload", raw, str(e).capitalize())

    def on_conflict(self) -> OnConflict:
        """The "on_conflict" option, parsed from "prompt" if unset."""
        raw = self.s.get("on_conflict", "prompt")
        try:
            return OnConflict.from_string(raw)
        except ValueError as e:
            self.invalid_value("on_conflict", raw, str(e).capitalize())

    def transform(self) -> str:
        """The raw "transform" option, an empty string if unset."""
        return self.s.get("transform", "")

    def tasks(self) -> int:
        """The "tasks" option, 1 if unset. Must be greater than 0."""
        count = self.s.getint("tasks", fallback=1)
        if count > 0:
            return count
        self.invalid_value("tasks", count, "Must be greater than 0")

    def downloads(self) -> int:
        """
        The "downloads" option, which must lie between 1 and the value of
        tasks(). Equal to tasks() if unset.
        """
        task_limit = self.tasks()
        count = self.s.getint("downloads", fallback=None)
        if count is None:
            return task_limit
        if count <= 0:
            self.invalid_value("downloads", count, "Must be greater than 0")
        if count > task_limit:
            self.invalid_value("downloads", count, "Must not be greater than tasks")
        return count

    def task_delay(self) -> float:
        """The "task_delay" option, 0.0 if unset. Must not be negative."""
        delay = self.s.getfloat("task_delay", fallback=0.0)
        if delay < 0:
            self.invalid_value("task_delay", delay, "Must not be negative")
        return delay

    def windows_paths(self) -> bool:
        """The "windows_paths" option. If unset, True exactly when os.name is "nt"."""
        return self.s.getboolean("windows_paths", fallback=(os.name == "nt"))

    def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
        """
        Look up the authenticator named by the required "auth" option in the
        given authenticators.
        """
        auth_name = self.s.get("auth")
        if auth_name is None:
            self.missing_value("auth")
        authenticator = authenticators.get(auth_name)
        if authenticator is None:
            self.invalid_value("auth", auth_name, "No such auth section exists")
        return authenticator
|  | ||||
|  | ||||
| class Crawler(ABC): | ||||
|     def __init__( | ||||
|             self, | ||||
|             name: str, | ||||
|             section: CrawlerSection, | ||||
|             config: Config, | ||||
|     ) -> None: | ||||
|         """ | ||||
|         Initialize a crawler from its name and its section in the config file. | ||||
|  | ||||
|         If you are writing your own constructor for your own crawler, make sure | ||||
|         to call this constructor first (via super().__init__). | ||||
|  | ||||
|         May throw a CrawlerLoadException. | ||||
|         """ | ||||
|  | ||||
|         self.name = name | ||||
|         self.error_free = True | ||||
|  | ||||
|         self._limiter = Limiter( | ||||
|             task_limit=section.tasks(), | ||||
|             download_limit=section.downloads(), | ||||
|             task_delay=section.task_delay(), | ||||
|         ) | ||||
|  | ||||
|         self._deduplicator = Deduplicator(section.windows_paths()) | ||||
|         self._transformer = Transformer(section.transform()) | ||||
|  | ||||
|         self._output_dir = OutputDirectory( | ||||
|             config.default_section.working_dir() / section.output_dir(name), | ||||
|             section.redownload(), | ||||
|             section.on_conflict(), | ||||
|         ) | ||||
|  | ||||
|     @property | ||||
|     def report(self) -> Report: | ||||
|         return self._output_dir.report | ||||
|  | ||||
|     @property | ||||
|     def prev_report(self) -> Optional[Report]: | ||||
|         return self._output_dir.prev_report | ||||
|  | ||||
|     @staticmethod | ||||
|     async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]: | ||||
|         """ | ||||
|         Similar to asyncio.gather. However, in the case of an exception, all | ||||
|         still running tasks are cancelled and the exception is rethrown. | ||||
|  | ||||
|         This should always be preferred over asyncio.gather in crawler code so | ||||
|         that an exception like CrawlError may actually stop the crawler. | ||||
|         """ | ||||
|  | ||||
|         tasks = [asyncio.ensure_future(aw) for aw in awaitables] | ||||
|         result = asyncio.gather(*tasks) | ||||
|         try: | ||||
|             return await result | ||||
|         except:  # noqa: E722 | ||||
|             for task in tasks: | ||||
|                 task.cancel() | ||||
|             raise | ||||
|  | ||||
|     async def crawl(self, path: PurePath) -> Optional[CrawlToken]: | ||||
|         log.explain_topic(f"Decision: Crawl {fmt_path(path)}") | ||||
|         path = self._deduplicator.mark(path) | ||||
|         self._output_dir.report.found(path) | ||||
|  | ||||
|         if self._transformer.transform(path) is None: | ||||
|             log.explain("Answer: No") | ||||
|             log.status("[bold bright_black]", "Ignored", fmt_path(path)) | ||||
|             return None | ||||
|  | ||||
|         log.explain("Answer: Yes") | ||||
|         return CrawlToken(self._limiter, path) | ||||
|  | ||||
|     async def download( | ||||
|             self, | ||||
|             path: PurePath, | ||||
|             mtime: Optional[datetime] = None, | ||||
|             redownload: Optional[Redownload] = None, | ||||
|             on_conflict: Optional[OnConflict] = None, | ||||
|     ) -> Optional[DownloadToken]: | ||||
|         log.explain_topic(f"Decision: Download {fmt_path(path)}") | ||||
|         path = self._deduplicator.mark(path) | ||||
|         self._output_dir.report.found(path) | ||||
|  | ||||
|         transformed_path = self._transformer.transform(path) | ||||
|         if transformed_path is None: | ||||
|             log.explain("Answer: No") | ||||
|             log.status("[bold bright_black]", "Ignored", fmt_path(path)) | ||||
|             return None | ||||
|  | ||||
|         fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict) | ||||
|         if fs_token is None: | ||||
|             log.explain("Answer: No") | ||||
|             return None | ||||
|  | ||||
|         log.explain("Answer: Yes") | ||||
|         return DownloadToken(self._limiter, fs_token, path) | ||||
|  | ||||
|     async def _cleanup(self) -> None: | ||||
|         log.explain_topic("Decision: Clean up files") | ||||
|         if self.error_free: | ||||
|             log.explain("No warnings or errors occurred during this run") | ||||
|             log.explain("Answer: Yes") | ||||
|             await self._output_dir.cleanup() | ||||
|         else: | ||||
|             log.explain("Warnings or errors occurred during this run") | ||||
|             log.explain("Answer: No") | ||||
|  | ||||
    @anoncritical
    async def run(self) -> None:
        """
        Start the crawling process. Call this function if you want to use a
        crawler.

        While the log's progress display is active, this prepares the output
        directory, loads the previous report, awaits the crawler-specific
        _run(), then _cleanup(), and finally stores the report.
        """

        with log.show_progress():
            self._output_dir.prepare()
            self._output_dir.load_prev_report()
            await self._run()
            # _cleanup() itself decides whether anything is actually cleaned up
            await self._cleanup()
            self._output_dir.store_report()
|  | ||||
|     @abstractmethod | ||||
|     async def _run(self) -> None: | ||||
|         """ | ||||
|         Overwrite this function if you are writing a crawler. | ||||
|  | ||||
|         This function must not return before all crawling is complete. To crawl | ||||
|         multiple things concurrently, asyncio.gather can be used. | ||||
|         """ | ||||
|  | ||||
|         pass | ||||
|  | ||||
|     def debug_transforms(self) -> None: | ||||
|         self._output_dir.load_prev_report() | ||||
|  | ||||
|         if not self.prev_report: | ||||
|             log.warn("Couldn't find or load old report") | ||||
|             return | ||||
|  | ||||
|         seen: Set[PurePath] = set() | ||||
|         for known in sorted(self.prev_report.found_paths): | ||||
|             looking_at = list(reversed(known.parents)) + [known] | ||||
|             for path in looking_at: | ||||
|                 if path in seen: | ||||
|                     continue | ||||
|  | ||||
|                 log.explain_topic(f"Transforming {fmt_path(path)}") | ||||
|                 self._transformer.transform(path) | ||||
|                 seen.add(path) | ||||
							
								
								
									
										199
									
								
								PFERD/crawl/http_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										199
									
								
								PFERD/crawl/http_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,199 @@ | ||||
| import asyncio | ||||
| import http.cookies | ||||
| import ssl | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Any, Dict, List, Optional | ||||
|  | ||||
| import aiohttp | ||||
| import certifi | ||||
| from aiohttp.client import ClientTimeout | ||||
|  | ||||
| from ..auth import Authenticator | ||||
| from ..config import Config | ||||
| from ..logging import log | ||||
| from ..utils import fmt_real_path | ||||
| from ..version import NAME, VERSION | ||||
| from .crawler import Crawler, CrawlerSection | ||||
|  | ||||
|  | ||||
class HttpCrawlerSection(CrawlerSection):
    """Crawler config section with the options shared by HTTP crawlers."""

    def http_timeout(self) -> float:
        """Return the "http_timeout" option as a float, falling back to 20."""
        timeout = self.s.getfloat("http_timeout", fallback=20)
        return timeout
|  | ||||
|  | ||||
class HttpCrawler(Crawler):
    """
    Crawler base class that performs its work through one aiohttp session.

    In addition to the generic crawler behaviour, this class

    - keeps a cookie jar that is loaded from and saved to a reserved
      ".cookies" file inside the output directory,
    - can load the most recently modified cookie file among all crawlers
      registered under the same authenticator (see share_cookies), and
    - serialises authentication attempts behind a lock and tracks them with
      an authentication id, so concurrent requests do not authenticate more
      often than necessary.

    Subclasses that need authentication overwrite _authenticate.
    """

    COOKIE_FILE = PurePath(".cookies")

    def __init__(
            self,
            name: str,
            section: HttpCrawlerSection,
            config: Config,
            shared_auth: Optional[Authenticator] = None,
    ) -> None:
        """
        Set up authentication bookkeeping and the cookie file location.

        If shared_auth is given, share_cookies will register this crawler's
        cookie file under that authenticator.
        """
        super().__init__(name, section, config)

        self._authentication_id = 0
        self._authentication_lock = asyncio.Lock()
        self._request_count = 0
        self._http_timeout = section.http_timeout()

        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
        # None means "not sharing cookies"; set to a list by share_cookies
        self._shared_cookie_jar_paths: Optional[List[Path]] = None
        self._shared_auth = shared_auth

        self._output_dir.register_reserved(self.COOKIE_FILE)

    async def _current_auth_id(self) -> int:
        """
        Returns the id for the current authentication, i.e. an identifier for the last
        successful call to [authenticate].

        This method must be called before any request that might authenticate is made, so the
        HttpCrawler can properly track when [authenticate] can return early and when actual
        authentication is necessary.

        As a side effect, the request counter is incremented.
        """
        # We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
        # This should reduce the amount of requests we make: If an authentication is in progress
        # all future requests wait for authentication to complete.
        async with self._authentication_lock:
            self._request_count += 1
            return self._authentication_id

    async def authenticate(self, caller_auth_id: int) -> None:
        """
        Starts the authentication process. The main work is offloaded to _authenticate, which
        you should overwrite in a subclass if needed. This method should *NOT* be overwritten.

        The [caller_auth_id] should be the result of a [_current_auth_id] call made *before*
        the request was made. This ensures that authentication is not performed needlessly.
        """
        async with self._authentication_lock:
            log.explain_topic("Authenticating")
            # Another task successfully called authenticate in-between.
            # We do not want to perform auth again, so we return here. We can
            # assume the other task succeeded as authenticate will throw an error
            # if it failed and aborts the crawl process.
            if caller_auth_id != self._authentication_id:
                log.explain(
                    "Authentication skipped due to auth id mismatch. "
                    "A previous authentication beat us to the race."
                )
                return
            log.explain("Calling crawler-specific authenticate")
            await self._authenticate()
            self._authentication_id += 1
            # Saving the cookies after the first auth ensures we won't need to re-authenticate
            # on the next run, should this one be aborted or crash
            self._save_cookies()

    async def _authenticate(self) -> None:
        """
        Performs authentication. This method must only return normally if authentication succeeded.
        In all other cases it must either retry internally or throw a terminal exception.

        The default implementation always raises a RuntimeError.
        """
        raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")

    def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None:
        """
        Register this crawler's cookie file in the shared mapping.

        Does nothing if the crawler has no shared authenticator. Otherwise the
        list stored under that authenticator is reused (or created and stored
        if missing) and this crawler's cookie jar path is appended to it. The
        very same list object is kept by the crawler, so paths appended by
        other crawlers later on are visible here as well.
        """
        if not self._shared_auth:
            return

        if self._shared_auth in shared:
            self._shared_cookie_jar_paths = shared[self._shared_auth]
        else:
            self._shared_cookie_jar_paths = []
            shared[self._shared_auth] = self._shared_cookie_jar_paths

        self._shared_cookie_jar_paths.append(self._cookie_jar_path)

    def _load_cookies_from_file(self, path: Path) -> None:
        """
        Read "Set-Cookie:" lines from the file and add them to the cookie jar.

        Lines that don't start with that header name are skipped and mentioned
        in the explain log with their 1-based line number.
        """
        jar: Any = http.cookies.SimpleCookie()
        with open(path, encoding="utf-8") as f:
            for i, line in enumerate(f, start=1):
                # Names of headers are case insensitive
                if line[:11].lower() == "set-cookie:":
                    jar.load(line[11:])
                else:
                    log.explain(f"Line {i} doesn't start with 'Set-Cookie:', ignoring it")
        self._cookie_jar.update_cookies(jar)

    def _save_cookies_to_file(self, path: Path) -> None:
        """
        Write all cookies of the cookie jar to the file, one header per line.
        """
        jar: Any = http.cookies.SimpleCookie()
        for morsel in self._cookie_jar:
            jar[morsel.key] = morsel
        with open(path, "w", encoding="utf-8") as f:
            f.write(jar.output(sep="\n"))
            f.write("\n")  # A trailing newline is just common courtesy

    def _load_cookies(self) -> None:
        """
        Pick a cookie file and load it into the cookie jar.

        Without cookie sharing the crawler's own cookie file is used. With
        sharing, the existing shared cookie file with the newest mtime wins.
        Failures while loading are only written to the explain log.
        """
        log.explain_topic("Loading cookies")

        cookie_jar_path: Optional[Path] = None

        if self._shared_cookie_jar_paths is None:
            log.explain("Not sharing any cookies")
            cookie_jar_path = self._cookie_jar_path
        else:
            log.explain("Sharing cookies")
            max_mtime: Optional[float] = None
            for path in self._shared_cookie_jar_paths:
                if not path.is_file():
                    log.explain(f"{fmt_real_path(path)} is not a file")
                    continue
                mtime = path.stat().st_mtime
                if max_mtime is None or mtime > max_mtime:
                    log.explain(f"{fmt_real_path(path)} has newest mtime so far")
                    max_mtime = mtime
                    cookie_jar_path = path
                else:
                    log.explain(f"{fmt_real_path(path)} has older mtime")

        if cookie_jar_path is None:
            log.explain("Couldn't find a suitable cookie file")
            return

        log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
        try:
            self._load_cookies_from_file(cookie_jar_path)
        except Exception as e:
            # Deliberately best-effort: a missing or broken cookie file must not abort the run
            log.explain("Failed to load cookies")
            log.explain(str(e))

    def _save_cookies(self) -> None:
        """
        Save the cookie jar to this crawler's own cookie file.

        Failures are reported as warnings and otherwise ignored.
        """
        log.explain_topic("Saving cookies")

        try:
            log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
            self._save_cookies_to_file(self._cookie_jar_path)
        except Exception as e:
            log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
            log.warn(str(e))

    async def run(self) -> None:
        """
        Run the crawler inside a freshly created aiohttp session.

        The request counter is reset, cookies are loaded into a new cookie jar
        and the session is made available as self.session for the duration of
        the parent class's run(). Afterwards the request count is logged and
        the cookies are saved once more.
        """
        self._request_count = 0
        self._cookie_jar = aiohttp.CookieJar()
        self._load_cookies()

        async with aiohttp.ClientSession(
                headers={"User-Agent": f"{NAME}/{VERSION}"},
                cookie_jar=self._cookie_jar,
                connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())),
                timeout=ClientTimeout(
                    # Hard cap of 15 minutes for a whole request, including the transfer.
                    # NOTE(review): this comment used to promise 30 minutes ("enough to transfer
                    # a 600 MB file over a 3 Mib/s connection") while the value below is
                    # 15 * 60 seconds. Confirm which of the two was intended.
                    # Allowing an arbitrary value could be annoying for overnight batch jobs
                    total=15 * 60,
                    connect=self._http_timeout,
                    sock_connect=self._http_timeout,
                    sock_read=self._http_timeout,
                )
        ) as session:
            self.session = session
            try:
                await super().run()
            finally:
                # The session is closed when the block exits, so don't keep it around
                del self.session
        log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")

        # They are saved in authenticate, but a final save won't hurt
        self._save_cookies()
							
								
								
									
										3
									
								
								PFERD/crawl/ilias/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								PFERD/crawl/ilias/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | ||||
| from .kit_ilias_web_crawler import KitIliasWebCrawler, KitIliasWebCrawlerSection | ||||
|  | ||||
| __all__ = ["KitIliasWebCrawler", "KitIliasWebCrawlerSection"] | ||||
							
								
								
									
										132
									
								
								PFERD/crawl/ilias/file_templates.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										132
									
								
								PFERD/crawl/ilias/file_templates.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,132 @@ | ||||
| from enum import Enum | ||||
| from typing import Optional | ||||
|  | ||||
# Template for plain-text link files: nothing but the {{link}} placeholder.
# NOTE(review): the placeholders are presumably substituted by the crawler that
# writes the link file - that code is not part of this module.
_link_template_plain = "{{link}}"
| _link_template_fancy = """ | ||||
| <!DOCTYPE html> | ||||
| <html lang="en"> | ||||
|     <head> | ||||
|         <meta charset="UTF-8"> | ||||
|         <title>ILIAS - Link: {{name}}</title> | ||||
|         <meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" /> | ||||
|     </head> | ||||
|  | ||||
|     <style> | ||||
|     * { | ||||
|         box-sizing: border-box; | ||||
|     } | ||||
|     .center-flex { | ||||
|         display: flex; | ||||
|         align-items: center; | ||||
|         justify-content: center; | ||||
|     } | ||||
|     body { | ||||
|         padding: 0; | ||||
|         margin: 0; | ||||
|         background-color: #f0f0f0; | ||||
|         font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif; | ||||
|         height: 100vh; | ||||
|     } | ||||
|     .row { | ||||
|         background-color: white; | ||||
|         min-width: 500px; | ||||
|         max-width: 90vw; | ||||
|         display: flex; | ||||
|         padding: 1em; | ||||
|     } | ||||
|     .logo { | ||||
|         flex: 0 1; | ||||
|         margin-right: 1em; | ||||
|         fill: #009682; | ||||
|     } | ||||
|     .tile { | ||||
|         flex: 1 0; | ||||
|         display: flex; | ||||
|         flex-direction: column; | ||||
|         justify-content: center; | ||||
|     } | ||||
|     .top-row { | ||||
|         padding-bottom: 5px; | ||||
|         font-size: 15px; | ||||
|     } | ||||
|     a { | ||||
|         color: #009682; | ||||
|         text-decoration: none; | ||||
|     } | ||||
|     a:hover { | ||||
|         text-decoration: underline; | ||||
|     } | ||||
|     .bottom-row { | ||||
|         font-size: 13px; | ||||
|     } | ||||
|     .menu-button { | ||||
|         border: 1px solid black; | ||||
|         margin-left: 4em; | ||||
|         width: 25px; | ||||
|         height: 25px; | ||||
|         flex: 0 0 25px; | ||||
|         background-color: #b3e0da; | ||||
|         font-size: 13px; | ||||
|         color: #222; | ||||
|     } | ||||
|     </style> | ||||
|     <body class="center-flex"> | ||||
|         <div class="row"> | ||||
|             <div class="logo center-flex"> | ||||
|                 <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24"> | ||||
|                     <path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 
.225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/> | ||||
|                 </svg> | ||||
|             </div> | ||||
|             <div class="tile"> | ||||
|                 <div class="top-row"> | ||||
|                     <a href="{{link}}">{{name}}</a> | ||||
|                 </div> | ||||
|                 <div class="bottom-row">{{description}}</div> | ||||
|             </div> | ||||
|             <div class="menu-button center-flex"> ⯆ </div> | ||||
|         </div> | ||||
|     </body> | ||||
| </html> | ||||
| """.strip()  # noqa: E501 line too long | ||||
|  | ||||
# Template for ".url" link files: an [InternetShortcut] section whose URL
# entry is the {{link}} placeholder. Surrounding whitespace is stripped.
_link_template_internet_shortcut = """
[InternetShortcut]
URL={{link}}
""".strip()
|  | ||||
|  | ||||
class Links(Enum):
    """
    The ways in which a link can be represented on disk.

    Each member's value is the string accepted by from_string. Every member
    except IGNORE has a file template and a file extension.
    """

    IGNORE = "ignore"
    PLAINTEXT = "plaintext"
    FANCY = "fancy"
    INTERNET_SHORTCUT = "internet-shortcut"

    def template(self) -> Optional[str]:
        """
        Return the file template for this link type, or None for IGNORE.
        """
        if self is Links.FANCY:
            return _link_template_fancy
        elif self is Links.PLAINTEXT:
            return _link_template_plain
        elif self is Links.INTERNET_SHORTCUT:
            return _link_template_internet_shortcut
        elif self is Links.IGNORE:
            return None
        raise ValueError("Missing switch case")

    def extension(self) -> Optional[str]:
        """
        Return the file extension (including the dot) for this link type, or
        None for IGNORE.
        """
        if self is Links.FANCY:
            return ".html"
        elif self is Links.PLAINTEXT:
            return ".txt"
        elif self is Links.INTERNET_SHORTCUT:
            return ".url"
        elif self is Links.IGNORE:
            return None
        raise ValueError("Missing switch case")

    @staticmethod
    def from_string(string: str) -> "Links":
        """
        Return the member whose value equals the given string.

        Raises a ValueError listing all accepted values if there is none.
        """
        try:
            return Links(string)
        except ValueError:
            # Build the list from the members so it can't get out of sync with
            # them (it used to advertise 'html' instead of 'fancy').
            options = ", ".join(f"'{link.value}'" for link in Links)
            raise ValueError(f"must be one of {options}") from None
							
								
								
									
										91
									
								
								PFERD/crawl/ilias/ilias_html_cleaner.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										91
									
								
								PFERD/crawl/ilias/ilias_html_cleaner.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,91 @@ | ||||
| from bs4 import BeautifulSoup, Comment, Tag | ||||
|  | ||||
# CSS that insert_base_markup() places into a <style> tag in the document
# head. It styles the ILIAS classes kept in the page as well as the
# "accordion-head" class and the tags introduced by clean().
_STYLE_TAG_CONTENT = """
    .ilc_text_block_Information {
      background-color: #f5f7fa;
    }
    div.ilc_text_block_Standard {
      margin-bottom: 10px;
      margin-top: 10px;
    }
    span.ilc_text_inline_Strong {
      font-weight: bold;
    }

    .accordion-head {
      background-color: #f5f7fa;
      padding: 0.5rem 0;
    }

    h3 {
      margin-top: 0.5rem;
      margin-bottom: 1rem;
    }

    br.visible-break {
      margin-bottom: 1rem;
    }

    article {
      margin: 0.5rem 0;
    }

    body {
      padding: 1em;
      grid-template-columns: 1fr min(60rem, 90%) 1fr;
      line-height: 1.2;
    }
"""

# Elements carrying one of these classes are turned into <article> tags by
# clean().
_ARTICLE_WORTHY_CLASSES = [
    "ilc_text_block_Information",
    "ilc_section_Attention",
    "ilc_section_Link",
]
|  | ||||
|  | ||||
def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Prepend a <head> to the soup that pulls in simple.css and some extra CSS.

    The head contains a stylesheet link to https://cdn.simplecss.org/simple.css
    followed by a <style> tag holding _STYLE_TAG_CONTENT. The soup is modified
    in place and returned.
    """
    # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
    stylesheet: Tag = soup.new_tag("link")
    stylesheet["rel"] = "stylesheet"
    stylesheet["href"] = "https://cdn.simplecss.org/simple.css"

    # Basic style tags for compat
    extra_css: Tag = soup.new_tag("style")
    extra_css.append(_STYLE_TAG_CONTENT)

    head = soup.new_tag("head")
    soup.insert(0, head)
    for child in (stylesheet, extra_css):
        head.append(child)

    return soup
|  | ||||
|  | ||||
def clean(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Simplify the markup of an ILIAS page in place and return the soup.

    - Elements with a class from _ARTICLE_WORTHY_CLASSES become <article>.
    - <h3> becomes <div>, then <h1> becomes <h3>.
    - Accordion captions become <h3> and gain the "accordion-head" class.
    - Standard paragraphs that are empty or contain only a single comment are
      removed.
    - Separator sections get an <hr> inserted as their first child.
    """
    for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES):
        block.name = "article"

    # Demote the existing h3 tags *before* h1 tags are renamed to h3, otherwise
    # the former h1 tags would be turned into divs as well.
    for block in soup.find_all("h3"):
        block.name = "div"

    for block in soup.find_all("h1"):
        block.name = "h3"

    for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"):
        block.name = "h3"
        block["class"] += ["accordion-head"]

    for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"):
        children = list(dummy.children)
        if not children:
            dummy.decompose()
            # Nothing left to inspect; without this, children[0] below would
            # raise an IndexError for empty paragraphs.
            continue
        if len(children) > 1:
            continue
        if isinstance(children[0], Comment):
            dummy.decompose()

    for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
        hrule_imposter.insert(0, soup.new_tag("hr"))

    return soup
							
								
								
									
										997
									
								
								PFERD/crawl/ilias/kit_ilias_html.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										997
									
								
								PFERD/crawl/ilias/kit_ilias_html.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,997 @@ | ||||
| import json | ||||
| import re | ||||
| from dataclasses import dataclass | ||||
| from datetime import date, datetime, timedelta | ||||
| from enum import Enum | ||||
| from typing import Dict, List, Optional, Union | ||||
| from urllib.parse import urljoin, urlparse | ||||
|  | ||||
| from bs4 import BeautifulSoup, Tag | ||||
|  | ||||
| from PFERD.logging import log | ||||
| from PFERD.utils import url_set_query_params | ||||
|  | ||||
# Alias for a value that is either a string or an integer.
# NOTE(review): not referenced in the part of the file reviewed here - confirm
# where it is used before changing it.
TargetType = Union[str, int]
|  | ||||
|  | ||||
class IliasElementType(Enum):
    """
    The kinds of elements an IliasPageElement can represent.
    """

    EXERCISE = "exercise"
    EXERCISE_FILES = "exercise_files"  # own submitted files
    TEST = "test"                      # an online test. Will be ignored currently.
    FILE = "file"
    FOLDER = "folder"
    FORUM = "forum"
    LINK = "link"
    BOOKING = "booking"
    MEETING = "meeting"
    VIDEO = "video"
    VIDEO_PLAYER = "video_player"
    VIDEO_FOLDER = "video_folder"
    # A video folder whose listing still has to be unwrapped, see
    # IliasPage.get_next_stage_element
    VIDEO_FOLDER_MAYBE_PAGINATED = "video_folder_maybe_paginated"
|  | ||||
|  | ||||
@dataclass
class IliasPageElement:
    """
    A single element found on an ILIAS page: its type, URL and name plus an
    optional modification time and description.
    """

    type: IliasElementType
    url: str
    name: str
    mtime: Optional[datetime] = None
    description: Optional[str] = None

    def id(self) -> str:
        """
        Return an identifier for this element, extracted from its URL.

        The patterns are tried in order and the first one found in the URL
        wins. If none matches, a warning is logged and the whole URL is
        returned instead.
        """
        patterns = (
            r"eid=(?P<id>[0-9a-z\-]+)",
            r"file_(?P<id>\d+)",
            r"ref_id=(?P<id>\d+)",
            r"target=[a-z]+_(?P<id>\d+)",
        )

        for pattern in patterns:
            found = re.search(pattern, self.url)
            if found is not None:
                return found.group("id")

        # Fall back to URL
        log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.")
        return self.url
|  | ||||
|  | ||||
@dataclass
class IliasDownloadForumData:
    """
    The data needed to request a forum's threads, as assembled by
    IliasPage.get_download_forum_data.
    """

    # Absolute URL taken from the forum form's "action" attribute
    url: str
    # Form fields to submit, including the list of thread ids
    form_data: Dict[str, Union[str, List[str]]]
    # True if no thread ids were found in the form
    empty: bool
|  | ||||
|  | ||||
@dataclass
class IliasForumThread:
    """
    A forum thread: its title, two tags from the parsed page and an optional
    modification time.
    """

    title: str
    # presumably the tag holding the thread's title - confirm against the code creating it
    title_tag: Tag
    # presumably the tag holding the thread's content - confirm against the code creating it
    content_tag: Tag
    mtime: Optional[datetime]
|  | ||||
|  | ||||
| class IliasPage: | ||||
|  | ||||
|     def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): | ||||
|         self._soup = soup | ||||
|         self._page_url = _page_url | ||||
|         self._page_type = source_element.type if source_element else None | ||||
|         self._source_name = source_element.name if source_element else "" | ||||
|  | ||||
|     def get_child_elements(self) -> List[IliasPageElement]: | ||||
|         """ | ||||
|         Return all child page elements you can find here. | ||||
|         """ | ||||
|         if self._is_video_player(): | ||||
|             log.explain("Page is a video player, extracting URL") | ||||
|             return self._player_to_video() | ||||
|         if self._is_video_listing(): | ||||
|             log.explain("Page is a video listing, searching for elements") | ||||
|             return self._find_video_entries() | ||||
|         if self._is_exercise_file(): | ||||
|             log.explain("Page is an exercise, searching for elements") | ||||
|             return self._find_exercise_entries() | ||||
|         if self._is_personal_desktop(): | ||||
|             log.explain("Page is the personal desktop, searching for elements") | ||||
|             return self._find_personal_desktop_entries() | ||||
|         if self._is_content_page(): | ||||
|             log.explain("Page is a content page, searching for elements") | ||||
|             return self._find_copa_entries() | ||||
|         log.explain("Page is a normal folder, searching for elements") | ||||
|         return self._find_normal_entries() | ||||
|  | ||||
|     def get_description(self) -> Optional[BeautifulSoup]: | ||||
|         def is_interesting_class(name: str) -> bool: | ||||
|             return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] | ||||
|  | ||||
|         paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class) | ||||
|         if not paragraphs: | ||||
|             return None | ||||
|  | ||||
|         # Extract bits and pieces into a string and parse it again. | ||||
|         # This ensures we don't miss anything and weird structures are resolved | ||||
|         # somewhat gracefully. | ||||
|         raw_html = "" | ||||
|         for p in paragraphs: | ||||
|             if p.find_parent(class_=is_interesting_class): | ||||
|                 continue | ||||
|  | ||||
|             # Ignore special listings (like folder groupings) | ||||
|             if "ilc_section_Special" in p["class"]: | ||||
|                 continue | ||||
|  | ||||
|             raw_html += str(p) + "\n" | ||||
|         raw_html = f"<body>\n{raw_html}\n</body>" | ||||
|  | ||||
|         return BeautifulSoup(raw_html, "html.parser") | ||||
|  | ||||
|     def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: | ||||
|         form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) | ||||
|         if not form: | ||||
|             return None | ||||
|         post_url = self._abs_url_from_relative(form["action"]) | ||||
|  | ||||
|         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] | ||||
|  | ||||
|         form_data: Dict[str, Union[str, List[ſtr]]] = { | ||||
|             "thread_ids[]": thread_ids, | ||||
|             "selected_cmd2": "html", | ||||
|             "select_cmd2": "Ausführen", | ||||
|             "selected_cmd": "", | ||||
|         } | ||||
|  | ||||
|         return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0) | ||||
|  | ||||
|     def get_next_stage_element(self) -> Optional[IliasPageElement]: | ||||
|         """ | ||||
|         Returns the element to crawl next if this page is only an intermediate stage, | ||||
|         or None if none of the known intermediate stages is detected. | ||||
|  | ||||
|         The checks are performed in order and the first one that matches wins. | ||||
|         """ | ||||
|         if self._is_forum_page(): | ||||
|             # The url already asks for 800 threads per page, so there is nothing to expand | ||||
|             if "trows=800" in self._page_url: | ||||
|                 return None | ||||
|             log.explain("Requesting *all* forum threads") | ||||
|             return self._get_show_max_forum_entries_per_page_url() | ||||
|         if self._is_ilias_opencast_embedding(): | ||||
|             log.explain("Unwrapping opencast embedding") | ||||
|             # Only the first child element is followed | ||||
|             return self.get_child_elements()[0] | ||||
|         if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: | ||||
|             log.explain("Unwrapping video pagination") | ||||
|             return self._find_video_entries_paginated()[0] | ||||
|         if self._contains_collapsed_future_meetings(): | ||||
|             log.explain("Requesting *all* future meetings") | ||||
|             return self._uncollapse_future_meetings_url() | ||||
|         return None | ||||
|  | ||||
|     def _is_forum_page(self) -> bool: | ||||
|         read_more_btn = self._soup.find( | ||||
|             "button", | ||||
|             attrs={"onclick": lambda x: x and "cmdClass=ilobjforumgui&cmd=markAllRead" in x} | ||||
|         ) | ||||
|         return read_more_btn is not None | ||||
|  | ||||
|     def _is_video_player(self) -> bool: | ||||
|         return "paella_config_file" in str(self._soup) | ||||
|  | ||||
|     def _is_video_listing(self) -> bool: | ||||
|         if self._is_ilias_opencast_embedding(): | ||||
|             return True | ||||
|  | ||||
|         # Raw listing without ILIAS fluff | ||||
|         video_element_table: Tag = self._soup.find( | ||||
|             name="table", id=re.compile(r"tbl_xoct_.+") | ||||
|         ) | ||||
|         return video_element_table is not None | ||||
|  | ||||
|     def _is_ilias_opencast_embedding(self) -> bool: | ||||
|         # ILIAS fluff around the real opencast html | ||||
|         if self._soup.find(id="headerimage"): | ||||
|             element: Tag = self._soup.find(id="headerimage") | ||||
|             if "opencast" in element.attrs["src"].lower(): | ||||
|                 return True | ||||
|         return False | ||||
|  | ||||
|     def _is_exercise_file(self) -> bool: | ||||
|         # we know it from before | ||||
|         if self._page_type == IliasElementType.EXERCISE: | ||||
|             return True | ||||
|  | ||||
|         # We have no suitable parent - let's guesss | ||||
|         if self._soup.find(id="headerimage"): | ||||
|             element: Tag = self._soup.find(id="headerimage") | ||||
|             if "exc" in element.attrs["src"].lower(): | ||||
|                 return True | ||||
|  | ||||
|         return False | ||||
|  | ||||
|     def _is_personal_desktop(self) -> bool: | ||||
|         return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) | ||||
|  | ||||
|     def _is_content_page(self) -> bool: | ||||
|         link = self._soup.find(id="current_perma_link") | ||||
|         if not link: | ||||
|             return False | ||||
|         return "target=copa_" in link.get("value") | ||||
|  | ||||
|     def _contains_collapsed_future_meetings(self) -> bool: | ||||
|         return self._uncollapse_future_meetings_url() is not None | ||||
|  | ||||
|     def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: | ||||
|         element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x}) | ||||
|         if not element: | ||||
|             return None | ||||
|         link = self._abs_url_from_link(element) | ||||
|         return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") | ||||
|  | ||||
|     def _player_to_video(self) -> List[IliasPageElement]: | ||||
|         # Fetch the actual video page. This is a small wrapper page initializing a javscript | ||||
|         # player. Sadly we can not execute that JS. The actual video stream url is nowhere | ||||
|         # on the page, but defined in a JS object inside a script tag, passed to the player | ||||
|         # library. | ||||
|         # We do the impossible and RegEx the stream JSON object out of the page's HTML source | ||||
|         regex = re.compile( | ||||
|             r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE | ||||
|         ) | ||||
|         json_match = regex.search(str(self._soup)) | ||||
|  | ||||
|         if json_match is None: | ||||
|             log.warn("Could not find JSON stream info in video player. Ignoring video.") | ||||
|             return [] | ||||
|         json_str = json_match.group(1) | ||||
|  | ||||
|         # parse it | ||||
|         json_object = json.loads(json_str) | ||||
|         streams = [stream for stream in json_object["streams"]] | ||||
|  | ||||
|         # and just fetch the lone video url! | ||||
|         if len(streams) == 1: | ||||
|             video_url = streams[0]["sources"]["mp4"][0]["src"] | ||||
|             return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] | ||||
|  | ||||
|         log.explain(f"Found multiple videos for stream at {self._source_name}") | ||||
|         items = [] | ||||
|         for stream in sorted(streams, key=lambda stream: stream["content"]): | ||||
|             full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" | ||||
|             video_url = stream["sources"]["mp4"][0]["src"] | ||||
|             items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name)) | ||||
|  | ||||
|         return items | ||||
|  | ||||
|     def _get_show_max_forum_entries_per_page_url(self) -> Optional[IliasPageElement]: | ||||
|         correct_link = self._soup.find( | ||||
|             "a", | ||||
|             attrs={"href": lambda x: x and "trows=800" in x and "cmd=showThreads" in x} | ||||
|         ) | ||||
|  | ||||
|         if not correct_link: | ||||
|             return None | ||||
|  | ||||
|         link = self._abs_url_from_link(correct_link) | ||||
|  | ||||
|         return IliasPageElement(IliasElementType.FORUM, link, "show all forum threads") | ||||
|  | ||||
|     def _find_personal_desktop_entries(self) -> List[IliasPageElement]: | ||||
|         items: List[IliasPageElement] = [] | ||||
|  | ||||
|         titles: List[Tag] = self._soup.select(".il-item-title") | ||||
|         for title in titles: | ||||
|             link = title.find("a") | ||||
|             name = _sanitize_path_name(link.text.strip()) | ||||
|             url = self._abs_url_from_link(link) | ||||
|  | ||||
|             type = self._find_type_from_link(name, link, url) | ||||
|             if not type: | ||||
|                 _unexpected_html_warning() | ||||
|                 log.warn_contd(f"Could not extract type for {link}") | ||||
|                 continue | ||||
|  | ||||
|             log.explain(f"Found {name!r}") | ||||
|  | ||||
|             if type == IliasElementType.FILE and "_download" not in url: | ||||
|                 url = re.sub(r"(target=file_\d+)", r"\1_download", url) | ||||
|                 log.explain("Rewired file URL to include download part") | ||||
|  | ||||
|             items.append(IliasPageElement(type, url, name)) | ||||
|  | ||||
|         return items | ||||
|  | ||||
|     def _find_copa_entries(self) -> List[IliasPageElement]: | ||||
|         items: List[IliasPageElement] = [] | ||||
|         links: List[Tag] = self._soup.findAll(class_="ilc_flist_a_FileListItemLink") | ||||
|  | ||||
|         for link in links: | ||||
|             url = self._abs_url_from_link(link) | ||||
|             name = _sanitize_path_name(link.getText().strip().replace("\t", "")) | ||||
|  | ||||
|             if "file_id" not in url: | ||||
|                 _unexpected_html_warning() | ||||
|                 log.warn_contd(f"Found unknown content page item {name!r} with url {url!r}") | ||||
|                 continue | ||||
|  | ||||
|             items.append(IliasPageElement(IliasElementType.FILE, url, name)) | ||||
|  | ||||
|         return items | ||||
|  | ||||
|     def _find_video_entries(self) -> List[IliasPageElement]: | ||||
|         # ILIAS has three stages for video pages | ||||
|         # 1. The initial dummy page without any videos. This page contains the link to the listing | ||||
|         # 2. The video listing which might be paginated | ||||
|         # 3. An unpaginated video listing (or at least one that includes 800 videos) | ||||
|         # | ||||
|         # We need to figure out where we are. | ||||
|  | ||||
|         video_element_table: Tag = self._soup.find( | ||||
|             name="table", id=re.compile(r"tbl_xoct_.+") | ||||
|         ) | ||||
|  | ||||
|         if video_element_table is None: | ||||
|             # We are in stage 1 | ||||
|             # The page is actually emtpy but contains the link to stage 2 | ||||
|             content_link: Tag = self._soup.select_one("#tab_series a") | ||||
|             url: str = self._abs_url_from_link(content_link) | ||||
|             query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} | ||||
|             url = url_set_query_params(url, query_params) | ||||
|             log.explain("Found ILIAS video frame page, fetching actual content next") | ||||
|             return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] | ||||
|  | ||||
|         is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None | ||||
|  | ||||
|         if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER: | ||||
|             # We are in stage 2 - try to break pagination | ||||
|             return self._find_video_entries_paginated() | ||||
|  | ||||
|         return self._find_video_entries_no_paging() | ||||
|  | ||||
|     def _find_video_entries_paginated(self) -> List[IliasPageElement]: | ||||
|         table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) | ||||
|  | ||||
|         if table_element is None: | ||||
|             log.warn("Couldn't increase elements per page (table not found). I might miss elements.") | ||||
|             return self._find_video_entries_no_paging() | ||||
|  | ||||
|         id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) | ||||
|         if id_match is None: | ||||
|             log.warn("Couldn't increase elements per page (table id not found). I might miss elements.") | ||||
|             return self._find_video_entries_no_paging() | ||||
|  | ||||
|         table_id = id_match.group(1) | ||||
|  | ||||
|         query_params = {f"tbl_xoct_{table_id}_trows": "800", | ||||
|                         "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} | ||||
|         url = url_set_query_params(self._page_url, query_params) | ||||
|  | ||||
|         log.explain("Disabled pagination, retrying folder as a new entry") | ||||
|         return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] | ||||
|  | ||||
|     def _find_video_entries_no_paging(self) -> List[IliasPageElement]: | ||||
|         """ | ||||
|         Crawls the "second stage" video page. This page contains the actual video urls. | ||||
|         """ | ||||
|         # Video start links are marked with an "Abspielen" link | ||||
|         video_links: List[Tag] = self._soup.findAll( | ||||
|             name="a", text=re.compile(r"\s*Abspielen\s*") | ||||
|         ) | ||||
|  | ||||
|         results: List[IliasPageElement] = [] | ||||
|  | ||||
|         for link in video_links: | ||||
|             results.append(self._listed_video_to_element(link)) | ||||
|  | ||||
|         return results | ||||
|  | ||||
|     def _listed_video_to_element(self, link: Tag) -> IliasPageElement: | ||||
|         # The link is part of a table with multiple columns, describing metadata. | ||||
|         # 6th or 7th child (1 indexed) is the modification time string. Try to find it | ||||
|         # by parsing backwards from the end and finding something that looks like a date | ||||
|         modification_time = None | ||||
|         row: Tag = link.parent.parent.parent | ||||
|         column_count = len(row.select("td.std")) | ||||
|         for index in range(column_count, 0, -1): | ||||
|             modification_string = link.parent.parent.parent.select_one( | ||||
|                 f"td.std:nth-child({index})" | ||||
|             ).getText().strip() | ||||
|             if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string): | ||||
|                 modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") | ||||
|                 break | ||||
|  | ||||
|         if modification_time is None: | ||||
|             log.warn(f"Could not determine upload time for {link}") | ||||
|             modification_time = datetime.now() | ||||
|  | ||||
|         title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip() | ||||
|         title += ".mp4" | ||||
|  | ||||
|         video_name: str = _sanitize_path_name(title) | ||||
|  | ||||
|         video_url = self._abs_url_from_link(link) | ||||
|  | ||||
|         log.explain(f"Found video {video_name!r} at {video_url}") | ||||
|         return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) | ||||
|  | ||||
|     def _find_exercise_entries(self) -> List[IliasPageElement]: | ||||
|         if self._soup.find(id="tab_submission"): | ||||
|             log.explain("Found submission tab. This is an exercise detail page") | ||||
|             return self._find_exercise_entries_detail_page() | ||||
|         log.explain("Found no submission tab. This is an exercise root page") | ||||
|         return self._find_exercise_entries_root_page() | ||||
|  | ||||
|     def _find_exercise_entries_detail_page(self) -> List[IliasPageElement]: | ||||
|         results: List[IliasPageElement] = [] | ||||
|  | ||||
|         # Find all download links in the container (this will contain all the files) | ||||
|         download_links: List[Tag] = self._soup.findAll( | ||||
|             name="a", | ||||
|             # download links contain the given command class | ||||
|             attrs={"href": lambda x: x and "cmd=download" in x}, | ||||
|             text="Download" | ||||
|         ) | ||||
|  | ||||
|         for link in download_links: | ||||
|             parent_row: Tag = link.findParent("tr") | ||||
|             children: List[Tag] = parent_row.findChildren("td") | ||||
|  | ||||
|             name = _sanitize_path_name(children[1].getText().strip()) | ||||
|             log.explain(f"Found exercise detail entry {name!r}") | ||||
|  | ||||
|             for child in reversed(children): | ||||
|                 date = demangle_date(child.getText().strip(), fail_silently=True) | ||||
|                 if date is not None: | ||||
|                     break | ||||
|             if date is None: | ||||
|                 log.warn(f"Date parsing failed for exercise entry {name!r}") | ||||
|  | ||||
|             results.append(IliasPageElement( | ||||
|                 IliasElementType.FILE, | ||||
|                 self._abs_url_from_link(link), | ||||
|                 name, | ||||
|                 date | ||||
|             )) | ||||
|  | ||||
|         return results | ||||
|  | ||||
|     def _find_exercise_entries_root_page(self) -> List[IliasPageElement]: | ||||
|         """ | ||||
|         Collects the entries of an exercise root page. | ||||
|  | ||||
|         Every assignment container contributes its directly downloadable files as well | ||||
|         as links to further file listings. All names are prefixed with the name of the | ||||
|         container they were found in. | ||||
|         """ | ||||
|         results: List[IliasPageElement] = [] | ||||
|  | ||||
|         # Each assignment is in an accordion container | ||||
|         assignment_containers: List[Tag] = self._soup.select(".il_VAccordionInnerContainer") | ||||
|  | ||||
|         for container in assignment_containers: | ||||
|             # Fetch the container name out of the header to use it in the path | ||||
|             container_name = container.select_one(".ilAssignmentHeader").getText().strip() | ||||
|             log.explain(f"Found exercise container {container_name!r}") | ||||
|  | ||||
|             # Find all download links in the container (this will contain all the files) | ||||
|             files: List[Tag] = container.findAll( | ||||
|                 name="a", | ||||
|                 # download links contain the given command class | ||||
|                 attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, | ||||
|                 text="Download" | ||||
|             ) | ||||
|  | ||||
|             # Grab each file as you now have the link | ||||
|             for file_link in files: | ||||
|                 # Two divs, side by side. Left is the name, right is the link ==> get left | ||||
|                 # sibling | ||||
|                 file_name = file_link.parent.findPrevious(name="div").getText().strip() | ||||
|                 file_name = _sanitize_path_name(file_name) | ||||
|                 url = self._abs_url_from_link(file_link) | ||||
|  | ||||
|                 log.explain(f"Found exercise entry {file_name!r}") | ||||
|                 results.append(IliasPageElement( | ||||
|                     IliasElementType.FILE, | ||||
|                     url, | ||||
|                     container_name + "/" + file_name, | ||||
|                     None  # We do not have any timestamp | ||||
|                 )) | ||||
|  | ||||
|             # Find all links to file listings (e.g. "Submitted Files" for groups) | ||||
|             file_listings: List[Tag] = container.findAll( | ||||
|                 name="a", | ||||
|                 # listing links contain the given command class | ||||
|                 attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x} | ||||
|             ) | ||||
|  | ||||
|             # Add each listing as a new element that is named after the label of its form group | ||||
|             for listing in file_listings: | ||||
|                 parent_container: Tag = listing.findParent( | ||||
|                     "div", attrs={"class": lambda x: x and "form-group" in x} | ||||
|                 ) | ||||
|                 label_container: Tag = parent_container.find( | ||||
|                     attrs={"class": lambda x: x and "control-label" in x} | ||||
|                 ) | ||||
|                 file_name = _sanitize_path_name(label_container.getText().strip()) | ||||
|                 url = self._abs_url_from_link(listing) | ||||
|                 log.explain(f"Found exercise detail {file_name!r} at {url}") | ||||
|                 results.append(IliasPageElement( | ||||
|                     IliasElementType.EXERCISE_FILES, | ||||
|                     url, | ||||
|                     container_name + "/" + file_name, | ||||
|                     None  # we do not have any timestamp | ||||
|                 )) | ||||
|  | ||||
|         return results | ||||
|  | ||||
|     def _find_normal_entries(self) -> List[IliasPageElement]: | ||||
|         result: List[IliasPageElement] = [] | ||||
|  | ||||
|         # Fetch all links and throw them to the general interpreter | ||||
|         links: List[Tag] = self._soup.select("a.il_ContainerItemTitle") | ||||
|  | ||||
|         for link in links: | ||||
|             abs_url = self._abs_url_from_link(link) | ||||
|             parents = self._find_upwards_folder_hierarchy(link) | ||||
|  | ||||
|             if parents: | ||||
|                 element_name = "/".join(parents) + "/" + _sanitize_path_name(link.getText()) | ||||
|             else: | ||||
|                 element_name = _sanitize_path_name(link.getText()) | ||||
|  | ||||
|             element_type = self._find_type_from_link(element_name, link, abs_url) | ||||
|             description = self._find_link_description(link) | ||||
|  | ||||
|             # The last meeting on every page is expanded by default. | ||||
|             # Its content is then shown inline *and* in the meeting page itself. | ||||
|             # We should skip the inline content. | ||||
|             if element_type != IliasElementType.MEETING and self._is_in_expanded_meeting(link): | ||||
|                 continue | ||||
|  | ||||
|             if not element_type: | ||||
|                 continue | ||||
|             if element_type == IliasElementType.MEETING: | ||||
|                 normalized = _sanitize_path_name(self._normalize_meeting_name(element_name)) | ||||
|                 log.explain(f"Normalized meeting name from {element_name!r} to {normalized!r}") | ||||
|                 element_name = normalized | ||||
|             elif element_type == IliasElementType.FILE: | ||||
|                 result.append(self._file_to_element(element_name, abs_url, link)) | ||||
|                 continue | ||||
|  | ||||
|             log.explain(f"Found {element_name!r}") | ||||
|             result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) | ||||
|  | ||||
|         result += self._find_cards() | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     def _is_in_expanded_meeting(self, tag: Tag) -> bool: | ||||
|         """ | ||||
|         Returns whether a file is part of an expanded meeting. | ||||
|         Has false positives for meetings themselves as their title is also "in the expanded meeting content". | ||||
|         It is in the same general div and this whole thing is guesswork. | ||||
|         Therefore, you should check for meetings before passing them in this function. | ||||
|         """ | ||||
|         parents: List[Tag] = list(tag.parents) | ||||
|         for parent in parents: | ||||
|             if not parent.get("class"): | ||||
|                 continue | ||||
|  | ||||
|             # We should not crawl files under meetings | ||||
|             if "ilContainerListItemContentCB" in parent.get("class"): | ||||
|                 link: Tag = parent.parent.find("a") | ||||
|                 type = IliasPage._find_type_from_folder_like(link, self._page_url) | ||||
|                 return type == IliasElementType.MEETING | ||||
|  | ||||
|         return False | ||||
|  | ||||
|     def _find_upwards_folder_hierarchy(self, tag: Tag) -> List[str]: | ||||
|         """ | ||||
|         Interprets accordions and expandable blocks as virtual folders and returns them | ||||
|         in order. This allows us to find a file named "Test" in an accordion "Acc" as "Acc/Test" | ||||
|  | ||||
|         The returned names are sanitized and ordered from the outermost to the innermost | ||||
|         virtual folder. | ||||
|         """ | ||||
|         found_titles = [] | ||||
|  | ||||
|         outer_accordion_content: Optional[Tag] = None | ||||
|  | ||||
|         # Ancestors are visited from the closest one outwards | ||||
|         parents: List[Tag] = list(tag.parents) | ||||
|         for parent in parents: | ||||
|             if not parent.get("class"): | ||||
|                 continue | ||||
|  | ||||
|             # ILIAS has proper accordions and weird blocks that look like normal headings, | ||||
|             # but some JS later transforms them into an accordion. | ||||
|  | ||||
|             # This is for these weird JS-y blocks | ||||
|             if "ilContainerItemsContainer" in parent.get("class"): | ||||
|                 # I am currently under the impression that *only* those JS blocks have an | ||||
|                 # ilNoDisplay class. | ||||
|                 if "ilNoDisplay" not in parent.get("class"): | ||||
|                     continue | ||||
|                 # The title of the block is located in the header div preceding it | ||||
|                 prev: Tag = parent.findPreviousSibling("div") | ||||
|                 if "ilContainerBlockHeader" in prev.get("class"): | ||||
|                     if prev.find("h3"): | ||||
|                         found_titles.append(prev.find("h3").getText().strip()) | ||||
|                     else: | ||||
|                         found_titles.append(prev.find("h2").getText().strip()) | ||||
|  | ||||
|             # And this for real accordions | ||||
|             if "il_VAccordionContentDef" in parent.get("class"): | ||||
|                 # The search stops at the first accordion content, ancestors further | ||||
|                 # out are not inspected | ||||
|                 outer_accordion_content = parent | ||||
|                 break | ||||
|  | ||||
|         if outer_accordion_content: | ||||
|             # The accordion head is looked up within the parent of the content | ||||
|             accordion_tag: Tag = outer_accordion_content.parent | ||||
|             head_tag: Tag = accordion_tag.find(attrs={ | ||||
|                 "class": lambda x: x and "ilc_va_ihead_VAccordIHead" in x | ||||
|             }) | ||||
|             found_titles.append(head_tag.getText().strip()) | ||||
|  | ||||
|         # Titles were collected from the inside out, so flip them to get a path | ||||
|         return [_sanitize_path_name(x) for x in reversed(found_titles)] | ||||
|  | ||||
|     def _find_link_description(self, link: Tag) -> Optional[str]: | ||||
|         tile: Tag = link.findParent("div", {"class": lambda x: x and "il_ContainerListItem" in x}) | ||||
|         if not tile: | ||||
|             return None | ||||
|         description_element: Tag = tile.find("div", {"class": lambda x: x and "il_Description" in x}) | ||||
|         if not description_element: | ||||
|             return None | ||||
|         return description_element.getText().strip() | ||||
|  | ||||
|     def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement: | ||||
|         # Files have a list of properties (type, modification date, size, etc.) | ||||
|         # In a series of divs. | ||||
|         # Find the parent containing all those divs, so we can filter our what we need | ||||
|         properties_parent: Tag = link_element.findParent( | ||||
|             "div", {"class": lambda x: "il_ContainerListItem" in x} | ||||
|         ).select_one(".il_ItemProperties") | ||||
|         # The first one is always the filetype | ||||
|         file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() | ||||
|  | ||||
|         # The rest does not have a stable order. Grab the whole text and reg-ex the date | ||||
|         # out of it | ||||
|         all_properties_text = properties_parent.getText().strip() | ||||
|         modification_date_match = re.search( | ||||
|             r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)", | ||||
|             all_properties_text | ||||
|         ) | ||||
|         if modification_date_match is None: | ||||
|             modification_date = None | ||||
|             log.explain(f"Element {name} at {url} has no date.") | ||||
|         else: | ||||
|             modification_date_str = modification_date_match.group(1) | ||||
|             modification_date = demangle_date(modification_date_str) | ||||
|  | ||||
|         # Grab the name from the link text | ||||
|         full_path = name + "." + file_type | ||||
|  | ||||
|         log.explain(f"Found file {full_path!r}") | ||||
|         return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date) | ||||
|  | ||||
|     def _find_cards(self) -> List[IliasPageElement]: | ||||
|         result: List[IliasPageElement] = [] | ||||
|  | ||||
|         card_titles: List[Tag] = self._soup.select(".card-title a") | ||||
|  | ||||
|         for title in card_titles: | ||||
|             url = self._abs_url_from_link(title) | ||||
|             name = _sanitize_path_name(title.getText().strip()) | ||||
|             type = self._find_type_from_card(title) | ||||
|  | ||||
|             if not type: | ||||
|                 _unexpected_html_warning() | ||||
|                 log.warn_contd(f"Could not extract type for {title}") | ||||
|                 continue | ||||
|  | ||||
|             result.append(IliasPageElement(type, url, name)) | ||||
|  | ||||
|         card_button_tiles: List[Tag] = self._soup.select(".card-title button") | ||||
|  | ||||
|         for button in card_button_tiles: | ||||
|             regex = re.compile(button["id"] + r".*window.open\(['\"](.+?)['\"]") | ||||
|             res = regex.search(str(self._soup)) | ||||
|             if not res: | ||||
|                 _unexpected_html_warning() | ||||
|                 log.warn_contd(f"Could not find click handler target for {button}") | ||||
|                 continue | ||||
|             url = self._abs_url_from_relative(res.group(1)) | ||||
|             name = _sanitize_path_name(button.getText().strip()) | ||||
|             type = self._find_type_from_card(button) | ||||
|             caption_parent = button.findParent( | ||||
|                 "div", | ||||
|                 attrs={"class": lambda x: x and "caption" in x}, | ||||
|             ) | ||||
|             description = caption_parent.find_next_sibling("div").getText().strip() | ||||
|  | ||||
|             if not type: | ||||
|                 _unexpected_html_warning() | ||||
|                 log.warn_contd(f"Could not extract type for {button}") | ||||
|                 continue | ||||
|  | ||||
|             result.append(IliasPageElement(type, url, name, description=description)) | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     def _find_type_from_card(self, card_title: Tag) -> Optional[IliasElementType]: | ||||
|         def is_card_root(element: Tag) -> bool: | ||||
|             return "il-card" in element["class"] and "thumbnail" in element["class"] | ||||
|  | ||||
|         card_root: Optional[Tag] = None | ||||
|  | ||||
|         # We look for the card root | ||||
|         for parent in card_title.parents: | ||||
|             if is_card_root(parent): | ||||
|                 card_root = parent | ||||
|                 break | ||||
|  | ||||
|         if card_root is None: | ||||
|             _unexpected_html_warning() | ||||
|             log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}") | ||||
|             return None | ||||
|  | ||||
|         icon: Tag = card_root.select_one(".il-card-repository-head .icon") | ||||
|  | ||||
|         if "opencast" in icon["class"]: | ||||
|             return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED | ||||
|         if "exc" in icon["class"]: | ||||
|             return IliasElementType.EXERCISE | ||||
|         if "webr" in icon["class"]: | ||||
|             return IliasElementType.LINK | ||||
|         if "book" in icon["class"]: | ||||
|             return IliasElementType.BOOKING | ||||
|         if "frm" in icon["class"]: | ||||
|             return IliasElementType.FORUM | ||||
|         if "sess" in icon["class"]: | ||||
|             return IliasElementType.MEETING | ||||
|         if "tst" in icon["class"]: | ||||
|             return IliasElementType.TEST | ||||
|         if "fold" in icon["class"]: | ||||
|             return IliasElementType.FOLDER | ||||
|  | ||||
|         _unexpected_html_warning() | ||||
|         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") | ||||
|         return None | ||||
|  | ||||
|     @staticmethod | ||||
|     def _find_type_from_link( | ||||
|             element_name: str, | ||||
|             link_element: Tag, | ||||
|             url: str | ||||
|     ) -> Optional[IliasElementType]: | ||||
|         """ | ||||
|         Decides which sub crawler to use for a given top level element. | ||||
|         """ | ||||
|         parsed_url = urlparse(url) | ||||
|  | ||||
|         # file URLs contain "target=file" | ||||
|         if "target=file_" in parsed_url.query: | ||||
|             return IliasElementType.FILE | ||||
|  | ||||
|         if "target=grp_" in parsed_url.query: | ||||
|             return IliasElementType.FOLDER | ||||
|  | ||||
|         if "target=crs_" in parsed_url.query: | ||||
|             return IliasElementType.FOLDER | ||||
|  | ||||
|         if "baseClass=ilExerciseHandlerGUI" in parsed_url.query: | ||||
|             return IliasElementType.EXERCISE | ||||
|  | ||||
|         if "baseClass=ilLinkResourceHandlerGUI" in parsed_url.query and "calldirectlink" in parsed_url.query: | ||||
|             return IliasElementType.LINK | ||||
|  | ||||
|         if "cmd=showThreads" in parsed_url.query or "target=frm_" in parsed_url.query: | ||||
|             return IliasElementType.FORUM | ||||
|  | ||||
|         if "cmdClass=ilobjtestgui" in parsed_url.query: | ||||
|             return IliasElementType.TEST | ||||
|  | ||||
|         # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so | ||||
|         # try to guess it from the image. | ||||
|  | ||||
|         # Everything with a ref_id can *probably* be opened to reveal nested things | ||||
|         # video groups, directories, exercises, etc | ||||
|         if "ref_id=" in parsed_url.query or "goto.php" in parsed_url.path: | ||||
|             return IliasPage._find_type_from_folder_like(link_element, url) | ||||
|  | ||||
|         _unexpected_html_warning() | ||||
|         log.warn_contd( | ||||
|             f"Tried to figure out element type, but failed for {element_name!r} / {link_element!r})" | ||||
|         ) | ||||
|         return None | ||||
|  | ||||
|     @staticmethod | ||||
|     def _find_type_from_folder_like(link_element: Tag, url: str) -> Optional[IliasElementType]: | ||||
|         """ | ||||
|         Try crawling something that looks like a folder. | ||||
|         """ | ||||
|         # pylint: disable=too-many-return-statements | ||||
|  | ||||
|         found_parent: Optional[Tag] = None | ||||
|  | ||||
|         # We look for the outer div of our inner link, to find information around it | ||||
|         # (mostly the icon) | ||||
|         for parent in link_element.parents: | ||||
|             if "ilContainerListItemOuter" in parent["class"] or "il-std-item" in parent["class"]: | ||||
|                 found_parent = parent | ||||
|                 break | ||||
|  | ||||
|         if found_parent is None: | ||||
|             _unexpected_html_warning() | ||||
|             log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url}") | ||||
|             return None | ||||
|  | ||||
|         # Find the small descriptive icon to figure out the type | ||||
|         img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon") | ||||
|  | ||||
|         if img_tag is None: | ||||
|             img_tag = found_parent.select_one("img.icon") | ||||
|  | ||||
|         if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}): | ||||
|             log.explain("Found session expansion button, skipping it as it has no content") | ||||
|             return None | ||||
|  | ||||
|         if img_tag is None: | ||||
|             _unexpected_html_warning() | ||||
|             log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") | ||||
|             return None | ||||
|  | ||||
|         if "opencast" in str(img_tag["alt"]).lower(): | ||||
|             return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("icon_exc.svg"): | ||||
|             return IliasElementType.EXERCISE | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("icon_webr.svg"): | ||||
|             return IliasElementType.LINK | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("icon_book.svg"): | ||||
|             return IliasElementType.BOOKING | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("frm.svg"): | ||||
|             return IliasElementType.FORUM | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("sess.svg"): | ||||
|             return IliasElementType.MEETING | ||||
|  | ||||
|         if str(img_tag["src"]).endswith("icon_tst.svg"): | ||||
|             return IliasElementType.TEST | ||||
|  | ||||
|         return IliasElementType.FOLDER | ||||
|  | ||||
|     @staticmethod | ||||
|     def _normalize_meeting_name(meeting_name: str) -> str: | ||||
|         """ | ||||
|         Normalizes meeting names, which have a relative time as their first part, | ||||
|         to their date in ISO format. | ||||
|         """ | ||||
|  | ||||
|         # This checks whether we can reach a `:` without passing a `-` | ||||
|         if re.search(r"^[^-]+: ", meeting_name): | ||||
|             # Meeting name only contains date: "05. Jan 2000:" | ||||
|             split_delimiter = ":" | ||||
|         else: | ||||
|             # Meeting name contains date and start/end times: "05. Jan 2000, 16:00 - 17:30:" | ||||
|             split_delimiter = ", " | ||||
|  | ||||
|         # We have a meeting day without time | ||||
|         date_portion_str = meeting_name.split(split_delimiter)[0] | ||||
|         date_portion = demangle_date(date_portion_str) | ||||
|  | ||||
|         # We failed to parse the date, bail out | ||||
|         if not date_portion: | ||||
|             return meeting_name | ||||
|  | ||||
|         # Replace the first section with the absolute date | ||||
|         rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) | ||||
|         return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name | ||||
|  | ||||
|     def _abs_url_from_link(self, link_tag: Tag) -> str: | ||||
|         """ | ||||
|         Create an absolute url from an <a> tag. | ||||
|         """ | ||||
|         return self._abs_url_from_relative(link_tag.get("href")) | ||||
|  | ||||
|     def _abs_url_from_relative(self, relative_url: str) -> str: | ||||
|         """ | ||||
|         Create an absolute url from a relative URL. | ||||
|         """ | ||||
|         return urljoin(self._page_url, relative_url) | ||||
|  | ||||
|  | ||||
| def _unexpected_html_warning() -> None: | ||||
|     log.warn("Encountered unexpected HTML structure, ignoring element.") | ||||
|  | ||||
|  | ||||
| german_months = ['Jan', 'Feb', 'Mär', 'Apr', 'Mai', 'Jun', 'Jul', 'Aug', 'Sep', 'Okt', 'Nov', 'Dez'] | ||||
| english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] | ||||
|  | ||||
|  | ||||
| def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[datetime]: | ||||
|     """ | ||||
|     Demangle a given date in one of the following formats (hour/minute part is optional): | ||||
|     "Gestern, HH:MM" | ||||
|     "Heute, HH:MM" | ||||
|     "Morgen, HH:MM" | ||||
|     "dd. mon yyyy, HH:MM | ||||
|     """ | ||||
|     try: | ||||
|         # Normalize whitespace because users | ||||
|         date_str = re.sub(r"\s+", " ", date_str) | ||||
|  | ||||
|         date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) | ||||
|         date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) | ||||
|         date_str = re.sub("Morgen|Tomorrow",  _format_date_english(_tomorrow()), date_str, re.I) | ||||
|         date_str = date_str.strip() | ||||
|         for german, english in zip(german_months, english_months): | ||||
|             date_str = date_str.replace(german, english) | ||||
|             # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" | ||||
|             date_str = date_str.replace(english + ".", english) | ||||
|  | ||||
|         # We now have a nice english String in the format: "dd. mmm yyyy, hh:mm" or "dd. mmm yyyy" | ||||
|  | ||||
|         # Check if we have a time as well | ||||
|         if ", " in date_str: | ||||
|             day_part, time_part = date_str.split(",") | ||||
|         else: | ||||
|             day_part = date_str.split(",")[0] | ||||
|             time_part = None | ||||
|  | ||||
|         day_str, month_str, year_str = day_part.split(" ") | ||||
|  | ||||
|         day = int(day_str.strip().replace(".", "")) | ||||
|         month = english_months.index(month_str.strip()) + 1 | ||||
|         year = int(year_str.strip()) | ||||
|  | ||||
|         if time_part: | ||||
|             hour_str, minute_str = time_part.split(":") | ||||
|             hour = int(hour_str) | ||||
|             minute = int(minute_str) | ||||
|             return datetime(year, month, day, hour, minute) | ||||
|  | ||||
|         return datetime(year, month, day) | ||||
|     except Exception: | ||||
|         if not fail_silently: | ||||
|             log.warn(f"Date parsing failed for {date_str!r}") | ||||
|         return None | ||||
|  | ||||
|  | ||||
| def _format_date_english(date_to_format: date) -> str: | ||||
|     month = english_months[date_to_format.month - 1] | ||||
|     return f"{date_to_format.day:02d}. {month} {date_to_format.year:04d}" | ||||
|  | ||||
|  | ||||
| def _yesterday() -> date: | ||||
|     return date.today() - timedelta(days=1) | ||||
|  | ||||
|  | ||||
| def _tomorrow() -> date: | ||||
|     return date.today() + timedelta(days=1) | ||||
|  | ||||
|  | ||||
| def _sanitize_path_name(name: str) -> str: | ||||
|     return name.replace("/", "-").replace("\\", "-").strip() | ||||
|  | ||||
|  | ||||
| def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThread]: | ||||
|     elements = [] | ||||
|     for p in forum_export.select("body > p"): | ||||
|         title_tag = p | ||||
|         content_tag = p.find_next_sibling("ul") | ||||
|  | ||||
|         if not content_tag: | ||||
|             # ILIAS allows users to delete the initial post while keeping the thread open | ||||
|             # This produces empty threads without *any* content. | ||||
|             # I am not sure why you would want this, but ILIAS makes it easy to do. | ||||
|             continue | ||||
|  | ||||
|         title = p.find("b").text | ||||
|         if ":" in title: | ||||
|             title = title[title.find(":") + 1:] | ||||
|         title = title.strip() | ||||
|         mtime = _guess_timestamp_from_forum_post_content(content_tag) | ||||
|         elements.append(IliasForumThread(title, title_tag, content_tag, mtime)) | ||||
|  | ||||
|     return elements | ||||
|  | ||||
|  | ||||
| def _guess_timestamp_from_forum_post_content(content: Tag) -> Optional[datetime]: | ||||
|     posts: Optional[Tag] = content.select(".ilFrmPostHeader > span.small") | ||||
|     if not posts: | ||||
|         return None | ||||
|  | ||||
|     newest_date: Optional[datetime] = None | ||||
|  | ||||
|     for post in posts: | ||||
|         text = post.text.strip() | ||||
|         text = text[text.rfind("|") + 1:] | ||||
|         date = demangle_date(text, fail_silently=True) | ||||
|         if not date: | ||||
|             continue | ||||
|  | ||||
|         if not newest_date or newest_date < date: | ||||
|             newest_date = date | ||||
|  | ||||
|     return newest_date | ||||
							
								
								
									
										969
									
								
								PFERD/crawl/ilias/kit_ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										969
									
								
								PFERD/crawl/ilias/kit_ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,969 @@ | ||||
| import asyncio | ||||
| import re | ||||
| from collections.abc import Awaitable, Coroutine | ||||
| from pathlib import PurePath | ||||
| from typing import Any, Callable, Dict, List, Optional, Set, Union, cast | ||||
|  | ||||
| import aiohttp | ||||
| import yarl | ||||
| from aiohttp import hdrs | ||||
| from bs4 import BeautifulSoup, Tag | ||||
|  | ||||
| from ...auth import Authenticator, TfaAuthenticator | ||||
| from ...config import Config | ||||
| from ...logging import ProgressBar, log | ||||
| from ...output_dir import FileSink, Redownload | ||||
| from ...utils import fmt_path, soupify, url_set_query_param | ||||
| from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical | ||||
| from ..http_crawler import HttpCrawler, HttpCrawlerSection | ||||
| from .file_templates import Links | ||||
| from .ilias_html_cleaner import clean, insert_base_markup | ||||
| from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, | ||||
|                              _sanitize_path_name, parse_ilias_forum_export) | ||||
|  | ||||
| # A crawl target: a course id (int), or a string that is either "desktop" or an ILIAS URL. | ||||
| TargetType = Union[str, int] | ||||
|  | ||||
| # Base URL of the ILIAS instance; used to validate URL targets and to build crawl URLs. | ||||
| _ILIAS_URL = "https://ilias.studium.kit.edu" | ||||
|  | ||||
|  | ||||
| class KitShibbolethBackgroundLoginSuccessful(): | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class KitIliasWebCrawlerSection(HttpCrawlerSection): | ||||
|     def target(self) -> TargetType: | ||||
|         target = self.s.get("target") | ||||
|         if not target: | ||||
|             self.missing_value("target") | ||||
|  | ||||
|         if re.fullmatch(r"\d+", target): | ||||
|             # Course id | ||||
|             return int(target) | ||||
|         if target == "desktop": | ||||
|             # Full personal desktop | ||||
|             return target | ||||
|         if target.startswith(_ILIAS_URL): | ||||
|             # ILIAS URL | ||||
|             return target | ||||
|  | ||||
|         self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>") | ||||
|  | ||||
|     def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]: | ||||
|         value: Optional[str] = self.s.get("tfa_auth") | ||||
|         if value is None: | ||||
|             return None | ||||
|         auth = authenticators.get(value) | ||||
|         if auth is None: | ||||
|             self.invalid_value("tfa_auth", value, "No such auth section exists") | ||||
|         return auth | ||||
|  | ||||
|     def links(self) -> Links: | ||||
|         type_str: Optional[str] = self.s.get("links") | ||||
|  | ||||
|         if type_str is None: | ||||
|             return Links.FANCY | ||||
|  | ||||
|         try: | ||||
|             return Links.from_string(type_str) | ||||
|         except ValueError as e: | ||||
|             self.invalid_value("links", type_str, str(e).capitalize()) | ||||
|  | ||||
|     def link_redirect_delay(self) -> int: | ||||
|         return self.s.getint("link_redirect_delay", fallback=-1) | ||||
|  | ||||
|     def videos(self) -> bool: | ||||
|         return self.s.getboolean("videos", fallback=False) | ||||
|  | ||||
|     def forums(self) -> bool: | ||||
|         return self.s.getboolean("forums", fallback=False) | ||||
|  | ||||
|  | ||||
| _DIRECTORY_PAGES: Set[IliasElementType] = set([ | ||||
|     IliasElementType.EXERCISE, | ||||
|     IliasElementType.EXERCISE_FILES, | ||||
|     IliasElementType.FOLDER, | ||||
|     IliasElementType.MEETING, | ||||
|     IliasElementType.VIDEO_FOLDER, | ||||
|     IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, | ||||
| ]) | ||||
|  | ||||
| _VIDEO_ELEMENTS: Set[IliasElementType] = set([ | ||||
|     IliasElementType.VIDEO, | ||||
|     IliasElementType.VIDEO_PLAYER, | ||||
|     IliasElementType.VIDEO_FOLDER, | ||||
|     IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, | ||||
| ]) | ||||
|  | ||||
|  | ||||
| def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: | ||||
|     def decorator(f: AWrapped) -> AWrapped: | ||||
|         async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: | ||||
|             last_exception: Optional[BaseException] = None | ||||
|             for round in range(attempts): | ||||
|                 try: | ||||
|                     return await f(*args, **kwargs) | ||||
|                 except aiohttp.ContentTypeError:  # invalid content type | ||||
|                     raise CrawlWarning("ILIAS returned an invalid content type") | ||||
|                 except aiohttp.TooManyRedirects: | ||||
|                     raise CrawlWarning("Got stuck in a redirect loop") | ||||
|                 except aiohttp.ClientPayloadError as e:  # encoding or not enough bytes | ||||
|                     last_exception = e | ||||
|                 except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc. | ||||
|                     last_exception = e | ||||
|                 except asyncio.exceptions.TimeoutError as e:  # explicit http timeouts in HttpCrawler | ||||
|                     last_exception = e | ||||
|                 log.explain_topic(f"Retrying operation {name}. Retries left: {attempts - 1 - round}") | ||||
|  | ||||
|             if last_exception: | ||||
|                 message = f"Error in I/O Operation: {last_exception}" | ||||
|                 if failure_is_error: | ||||
|                     raise CrawlError(message) from last_exception | ||||
|                 else: | ||||
|                     raise CrawlWarning(message) from last_exception | ||||
|             raise CrawlError("Impossible return in ilias _iorepeat") | ||||
|  | ||||
|         return wrapper  # type: ignore | ||||
|     return decorator | ||||
|  | ||||
|  | ||||
| def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]: | ||||
|     """ | ||||
|     Wraps any I/O exception in a CrawlWarning. | ||||
|     """ | ||||
|     return _iorepeat(1, name) | ||||
|  | ||||
|  | ||||
| # Crawler control flow: | ||||
| # | ||||
| #     crawl_desktop -+ | ||||
| #                    | | ||||
| #     crawl_course --+ | ||||
| #                    | | ||||
| #     @_io_repeat    |        # retries internally (before the bar) | ||||
| #  +- crawl_url    <-+ | ||||
| #  | | ||||
| #  | | ||||
| #  |  @_wrap_io_exception     # does not need to retry as children acquire bars | ||||
| #  +> crawl_ilias_element -+ | ||||
| #  ^                       | | ||||
| #  |  @_io_repeat          |  # retries internally (before the bar) | ||||
| #  +- crawl_ilias_page <---+ | ||||
| #  |                       | | ||||
| #  +> get_page             |  # Handles and retries authentication | ||||
| #                          | | ||||
| #     @_io_repeat          |  # retries internally (before the bar) | ||||
| #  +- download_link    <---+ | ||||
| #  |                       | | ||||
| #  +> resolve_target       |  # Handles and retries authentication | ||||
| #                          | | ||||
| #     @_io_repeat          |  # retries internally (before the bar) | ||||
| #  +- download_video   <---+ | ||||
| #  |                       | | ||||
| #  |  @_io_repeat          |  # retries internally (before the bar) | ||||
| #  +- download_file    <---+ | ||||
| #  | | ||||
| #  +> stream_from_url         # Handles and retries authentication | ||||
|  | ||||
| class KitIliasWebCrawler(HttpCrawler): | ||||
|     def __init__( | ||||
|             self, | ||||
|             name: str, | ||||
|             section: KitIliasWebCrawlerSection, | ||||
|             config: Config, | ||||
|             authenticators: Dict[str, Authenticator] | ||||
|     ): | ||||
|         # Setting a main authenticator for cookie sharing | ||||
|         auth = section.auth(authenticators) | ||||
|         super().__init__(name, section, config, shared_auth=auth) | ||||
|  | ||||
|         if section.tasks() > 1: | ||||
|             log.warn(""" | ||||
| Please avoid using too many parallel requests as these are the KIT ILIAS | ||||
| instance's greatest bottleneck. | ||||
|             """.strip()) | ||||
|  | ||||
|         self._shibboleth_login = KitShibbolethLogin( | ||||
|             auth, | ||||
|             section.tfa_auth(authenticators), | ||||
|         ) | ||||
|  | ||||
|         self._base_url = _ILIAS_URL | ||||
|  | ||||
|         self._target = section.target() | ||||
|         self._link_file_redirect_delay = section.link_redirect_delay() | ||||
|         self._links = section.links() | ||||
|         self._videos = section.videos() | ||||
|         self._forums = section.forums() | ||||
|         self._visited_urls: Set[str] = set() | ||||
|  | ||||
|     async def _run(self) -> None: | ||||
|         if isinstance(self._target, int): | ||||
|             log.explain_topic(f"Inferred crawl target: Course with id {self._target}") | ||||
|             await self._crawl_course(self._target) | ||||
|         elif self._target == "desktop": | ||||
|             log.explain_topic("Inferred crawl target: Personal desktop") | ||||
|             await self._crawl_desktop() | ||||
|         else: | ||||
|             log.explain_topic(f"Inferred crawl target: URL {self._target}") | ||||
|             await self._crawl_url(self._target) | ||||
|  | ||||
|     async def _crawl_course(self, course_id: int) -> None: | ||||
|         # Start crawling at the given course | ||||
|         root_url = url_set_query_param( | ||||
|             self._base_url + "/goto.php", "target", f"crs_{course_id}" | ||||
|         ) | ||||
|  | ||||
|         await self._crawl_url(root_url, expected_id=course_id) | ||||
|  | ||||
|     async def _crawl_desktop(self) -> None: | ||||
|         appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items" | ||||
|         appendix = appendix.encode("ASCII").hex() | ||||
|         await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix) | ||||
|  | ||||
|     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: | ||||
|         """ | ||||
|         Crawls the page at the given URL as the root (".") of the crawl. | ||||
|  | ||||
|         Pages are fetched and followed for as long as they report a next stage element. The | ||||
|         child elements and the description of the last page are collected and every child | ||||
|         element is handled afterwards. | ||||
|  | ||||
|         If expected_id is given, the first page must contain a "current_perma_link" element | ||||
|         whose value contains "crs_", otherwise a CrawlError is raised. The id itself is not | ||||
|         compared. | ||||
|         """ | ||||
|         maybe_cl = await self.crawl(PurePath(".")) | ||||
|         if not maybe_cl: | ||||
|             return | ||||
|         cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 | ||||
|  | ||||
|         elements: List[IliasPageElement] = [] | ||||
|         # A list as variable redefinitions are not propagated to outer scopes | ||||
|         description: List[BeautifulSoup] = [] | ||||
|  | ||||
|         @_iorepeat(3, "crawling url") | ||||
|         async def gather_elements() -> None: | ||||
|             # Start from scratch on every attempt, so a retry does not duplicate elements | ||||
|             elements.clear() | ||||
|             async with cl: | ||||
|                 next_stage_url: Optional[str] = url | ||||
|                 current_parent = None | ||||
|  | ||||
|                 # Duplicated code, but the root page is special - we want to avoid fetching it twice! | ||||
|                 while next_stage_url: | ||||
|                     soup = await self._get_page(next_stage_url) | ||||
|  | ||||
|                     # Only the first page (no parent yet) is checked for the course permalink | ||||
|                     if current_parent is None and expected_id is not None: | ||||
|                         perma_link_element: Tag = soup.find(id="current_perma_link") | ||||
|                         if not perma_link_element or "crs_" not in perma_link_element.get("value"): | ||||
|                             raise CrawlError("Invalid course id? Didn't find anything looking like a course") | ||||
|  | ||||
|                     log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") | ||||
|                     log.explain(f"URL: {next_stage_url}") | ||||
|                     page = IliasPage(soup, next_stage_url, current_parent) | ||||
|                     if next_element := page.get_next_stage_element(): | ||||
|                         current_parent = next_element | ||||
|                         next_stage_url = next_element.url | ||||
|                     else: | ||||
|                         next_stage_url = None | ||||
|  | ||||
|                 # `page` is the last page of the chain, i.e. the one without a next stage | ||||
|                 elements.extend(page.get_child_elements()) | ||||
|                 if description_string := page.get_description(): | ||||
|                     description.append(description_string) | ||||
|  | ||||
|         # Fill up our task list with the found elements | ||||
|         await gather_elements() | ||||
|  | ||||
|         # Only the first collected description is written | ||||
|         if description: | ||||
|             await self._download_description(PurePath("."), description[0]) | ||||
|  | ||||
|         elements.sort(key=lambda e: e.id()) | ||||
|  | ||||
|         # The handlers are awaited one after another; only the coroutines they return are | ||||
|         # scheduled as tasks | ||||
|         tasks: List[Awaitable[None]] = [] | ||||
|         for element in elements: | ||||
|             if handle := await self._handle_ilias_element(PurePath("."), element): | ||||
|                 tasks.append(asyncio.create_task(handle)) | ||||
|  | ||||
|         # And execute them | ||||
|         await self.gather(tasks) | ||||
|  | ||||
|     async def _handle_ilias_page( | ||||
|         self, | ||||
|         url: str, | ||||
|         parent: IliasPageElement, | ||||
|         path: PurePath, | ||||
|     ) -> Optional[Coroutine[Any, Any, None]]: | ||||
|         maybe_cl = await self.crawl(path) | ||||
|         if not maybe_cl: | ||||
|             return None | ||||
|         return self._crawl_ilias_page(url, parent, maybe_cl) | ||||
|  | ||||
|     @anoncritical | ||||
|     async def _crawl_ilias_page( | ||||
|         self, | ||||
|         url: str, | ||||
|         parent: IliasPageElement, | ||||
|         cl: CrawlToken, | ||||
|     ) -> None: | ||||
|         """ | ||||
|         Crawls the page at the given URL into the path of the crawl token. | ||||
|  | ||||
|         Pages are fetched and followed for as long as they report a next stage element, | ||||
|         starting with `parent` as the parent element. The child elements and the description | ||||
|         of the last page are collected and every child element is handled afterwards. | ||||
|         """ | ||||
|         elements: List[IliasPageElement] = [] | ||||
|         # A list as variable redefinitions are not propagated to outer scopes | ||||
|         description: List[BeautifulSoup] = [] | ||||
|  | ||||
|         @_iorepeat(3, "crawling folder") | ||||
|         async def gather_elements() -> None: | ||||
|             # Start from scratch on every attempt, so a retry does not duplicate elements | ||||
|             elements.clear() | ||||
|             async with cl: | ||||
|                 next_stage_url: Optional[str] = url | ||||
|                 current_parent = parent | ||||
|  | ||||
|                 while next_stage_url: | ||||
|                     soup = await self._get_page(next_stage_url) | ||||
|                     log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") | ||||
|                     log.explain(f"URL: {next_stage_url}") | ||||
|                     page = IliasPage(soup, next_stage_url, current_parent) | ||||
|                     if next_element := page.get_next_stage_element(): | ||||
|                         current_parent = next_element | ||||
|                         next_stage_url = next_element.url | ||||
|                     else: | ||||
|                         next_stage_url = None | ||||
|  | ||||
|                 # `page` is the last page of the chain, i.e. the one without a next stage | ||||
|                 elements.extend(page.get_child_elements()) | ||||
|                 if description_string := page.get_description(): | ||||
|                     description.append(description_string) | ||||
|  | ||||
|         # Fill up our task list with the found elements | ||||
|         await gather_elements() | ||||
|  | ||||
|         # Only the first collected description is written | ||||
|         if description: | ||||
|             await self._download_description(cl.path, description[0]) | ||||
|  | ||||
|         elements.sort(key=lambda e: e.id()) | ||||
|  | ||||
|         # The handlers are awaited one after another; only the coroutines they return are | ||||
|         # scheduled as tasks | ||||
|         tasks: List[Awaitable[None]] = [] | ||||
|         for element in elements: | ||||
|             if handle := await self._handle_ilias_element(cl.path, element): | ||||
|                 tasks.append(asyncio.create_task(handle)) | ||||
|  | ||||
|         # And execute them | ||||
|         await self.gather(tasks) | ||||
|  | ||||
|     # These decorators only apply *to this method* and *NOT* to the returned | ||||
|     # awaitables! | ||||
|     # This method does not await the handlers but returns them instead. | ||||
|     # This ensures one level is handled at a time and name deduplication | ||||
|     # works correctly. | ||||
|     @anoncritical | ||||
|     async def _handle_ilias_element( | ||||
|         self, | ||||
|         parent_path: PurePath, | ||||
|         element: IliasPageElement, | ||||
|     ) -> Optional[Coroutine[Any, Any, None]]: | ||||
|         if element.url in self._visited_urls: | ||||
|             raise CrawlWarning( | ||||
|                 f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath" | ||||
|             ) | ||||
|         self._visited_urls.add(element.url) | ||||
|  | ||||
|         element_path = PurePath(parent_path, element.name) | ||||
|  | ||||
|         if element.type in _VIDEO_ELEMENTS: | ||||
|             if not self._videos: | ||||
|                 log.status( | ||||
|                     "[bold bright_black]", | ||||
|                     "Ignored", | ||||
|                     fmt_path(element_path), | ||||
|                     "[bright_black](enable with option 'videos')" | ||||
|                 ) | ||||
|                 return None | ||||
|  | ||||
|         if element.type == IliasElementType.FILE: | ||||
|             return await self._handle_file(element, element_path) | ||||
|         elif element.type == IliasElementType.FORUM: | ||||
|             if not self._forums: | ||||
|                 log.status( | ||||
|                     "[bold bright_black]", | ||||
|                     "Ignored", | ||||
|                     fmt_path(element_path), | ||||
|                     "[bright_black](enable with option 'forums')" | ||||
|                 ) | ||||
|                 return None | ||||
|             return await self._handle_forum(element, element_path) | ||||
|         elif element.type == IliasElementType.TEST: | ||||
|             log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") | ||||
|             log.explain("Tests contain no relevant files") | ||||
|             log.explain("Answer: No") | ||||
|             return None | ||||
|         elif element.type == IliasElementType.LINK: | ||||
|             return await self._handle_link(element, element_path) | ||||
|         elif element.type == IliasElementType.BOOKING: | ||||
|             return await self._handle_booking(element, element_path) | ||||
|         elif element.type == IliasElementType.VIDEO: | ||||
|             return await self._handle_file(element, element_path) | ||||
|         elif element.type == IliasElementType.VIDEO_PLAYER: | ||||
|             return await self._handle_video(element, element_path) | ||||
|         elif element.type in _DIRECTORY_PAGES: | ||||
|             return await self._handle_ilias_page(element.url, element, element_path) | ||||
|         else: | ||||
|             # This will retry it a few times, failing everytime. It doesn't make any network | ||||
|             # requests, so that's fine. | ||||
|             raise CrawlWarning(f"Unknown element type: {element.type!r}") | ||||
|  | ||||
|     async def _handle_link( | ||||
|         self, | ||||
|         element: IliasPageElement, | ||||
|         element_path: PurePath, | ||||
|     ) -> Optional[Coroutine[Any, Any, None]]: | ||||
|         log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") | ||||
|         log.explain(f"Links type is {self._links}") | ||||
|  | ||||
|         link_template_maybe = self._links.template() | ||||
|         link_extension = self._links.extension() | ||||
|         if not link_template_maybe or not link_extension: | ||||
|             log.explain("Answer: No") | ||||
|             return None | ||||
|         else: | ||||
|             log.explain("Answer: Yes") | ||||
|         element_path = element_path.with_name(element_path.name + link_extension) | ||||
|  | ||||
|         maybe_dl = await self.download(element_path, mtime=element.mtime) | ||||
|         if not maybe_dl: | ||||
|             return None | ||||
|  | ||||
|         return self._download_link(element, link_template_maybe, maybe_dl) | ||||
|  | ||||
|     @anoncritical | ||||
|     @_iorepeat(3, "resolving link") | ||||
|     async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: | ||||
|         async with dl as (bar, sink): | ||||
|             export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") | ||||
|             real_url = await self._resolve_link_target(export_url) | ||||
|             self._write_link_content(link_template, real_url, element.name, element.description, sink) | ||||
|  | ||||
|     def _write_link_content( | ||||
|         self, | ||||
|         link_template: str, | ||||
|         url: str, | ||||
|         name: str, | ||||
|         description: Optional[str], | ||||
|         sink: FileSink, | ||||
|     ) -> None: | ||||
|         content = link_template | ||||
|         content = content.replace("{{link}}", url) | ||||
|         content = content.replace("{{name}}", name) | ||||
|         content = content.replace("{{description}}", str(description)) | ||||
|         content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) | ||||
|         sink.file.write(content.encode("utf-8")) | ||||
|         sink.done() | ||||
|  | ||||
|     async def _handle_booking( | ||||
|         self, | ||||
|         element: IliasPageElement, | ||||
|         element_path: PurePath, | ||||
|     ) -> Optional[Coroutine[Any, Any, None]]: | ||||
|         log.explain_topic(f"Decision: Crawl Booking Link {fmt_path(element_path)}") | ||||
|         log.explain(f"Links type is {self._links}") | ||||
|  | ||||
|         link_template_maybe = self._links.template() | ||||
|         link_extension = self._links.extension() | ||||
|         if not link_template_maybe or not link_extension: | ||||
|             log.explain("Answer: No") | ||||
|             return None | ||||
|         else: | ||||
|             log.explain("Answer: Yes") | ||||
|         element_path = element_path.with_name(element_path.name + link_extension) | ||||
|  | ||||
|         maybe_dl = await self.download(element_path, mtime=element.mtime) | ||||
|         if not maybe_dl: | ||||
|             return None | ||||
|  | ||||
|         return self._download_booking(element, link_template_maybe, maybe_dl) | ||||
|  | ||||
|     @anoncritical | ||||
|     @_iorepeat(1, "downloading description") | ||||
|     async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None: | ||||
|         path = parent_path / "Description.html" | ||||
|         dl = await self.download(path, redownload=Redownload.ALWAYS) | ||||
|         if not dl: | ||||
|             return | ||||
|  | ||||
|         async with dl as (bar, sink): | ||||
|             description = clean(insert_base_markup(description)) | ||||
|             sink.file.write(description.prettify().encode("utf-8")) | ||||
|             sink.done() | ||||
|  | ||||
|     @anoncritical | ||||
|     @_iorepeat(3, "resolving booking") | ||||
|     async def _download_booking( | ||||
|         self, | ||||
|         element: IliasPageElement, | ||||
|         link_template: str, | ||||
|         dl: DownloadToken, | ||||
|     ) -> None: | ||||
|         async with dl as (bar, sink): | ||||
|             self._write_link_content(link_template, element.url, element.name, element.description, sink) | ||||
|  | ||||
|     async def _resolve_link_target(self, export_url: str) -> str: | ||||
|         async with self.session.get(export_url, allow_redirects=False) as resp: | ||||
|             # No redirect means we were authenticated | ||||
|             if hdrs.LOCATION not in resp.headers: | ||||
|                 return soupify(await resp.read()).select_one("a").get("href").strip() | ||||
|  | ||||
|         await self._authenticate() | ||||
|  | ||||
|         async with self.session.get(export_url, allow_redirects=False) as resp: | ||||
|             # No redirect means we were authenticated | ||||
|             if hdrs.LOCATION not in resp.headers: | ||||
|                 return soupify(await resp.read()).select_one("a").get("href").strip() | ||||
|  | ||||
|         raise CrawlError("resolve_link_target failed even after authenticating") | ||||
|  | ||||
|     async def _handle_video( | ||||
|         self, | ||||
|         element: IliasPageElement, | ||||
|         element_path: PurePath, | ||||
|     ) -> Optional[Coroutine[Any, Any, None]]: | ||||
|         # Copy old mapping as it is likely still relevant | ||||
|         if self.prev_report: | ||||
|             self.report.add_custom_value( | ||||
|                 str(element_path), | ||||
|                 self.prev_report.get_custom_value(str(element_path)) | ||||
|             ) | ||||
|  | ||||
|         # A video might contain other videos, so let's "crawl" the video first | ||||
|         # to ensure rate limits apply. This must be a download as *this token* | ||||
|         # is re-used if the video consists of a single stream. In that case the | ||||
|         # file name is used and *not* the stream name the ilias html parser reported | ||||
|         # to ensure backwards compatibility. | ||||
|         maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.ALWAYS) | ||||
|  | ||||
|         # If we do not want to crawl it (user filter) or we have every file | ||||
|         # from the cached mapping already, we can ignore this and bail | ||||
|         if not maybe_dl or self._all_videos_locally_present(element_path): | ||||
|             # Mark all existing cideos as known so they do not get deleted | ||||
|             # during dleanup. We "downloaded" them, just without actually making | ||||
|             # a network request as we assumed they did not change. | ||||
|             for video in self._previous_contained_videos(element_path): | ||||
|                 await self.download(video) | ||||
|  | ||||
|             return None | ||||
|  | ||||
|         return self._download_video(element_path, element, maybe_dl) | ||||
|  | ||||
|     def _previous_contained_videos(self, video_path: PurePath) -> List[PurePath]: | ||||
|         if not self.prev_report: | ||||
|             return [] | ||||
|         custom_value = self.prev_report.get_custom_value(str(video_path)) | ||||
|         if not custom_value: | ||||
|             return [] | ||||
|         names = cast(List[str], custom_value) | ||||
|         folder = video_path.parent | ||||
|         return [PurePath(folder, name) for name in names] | ||||
|  | ||||
|     def _all_videos_locally_present(self, video_path: PurePath) -> bool: | ||||
|         if contained_videos := self._previous_contained_videos(video_path): | ||||
|             log.explain_topic(f"Checking local cache for video {video_path.name}") | ||||
|             all_found_locally = True | ||||
|             for video in contained_videos: | ||||
|                 transformed_path = self._to_local_video_path(video) | ||||
|                 if transformed_path: | ||||
|                     exists_locally = self._output_dir.resolve(transformed_path).exists() | ||||
|                     all_found_locally = all_found_locally and exists_locally | ||||
|             if all_found_locally: | ||||
|                 log.explain("Found all videos locally, skipping enumeration request") | ||||
|                 return True | ||||
|             log.explain("Missing at least one video, continuing with requests!") | ||||
|         return False | ||||
|  | ||||
|     def _to_local_video_path(self, path: PurePath) -> Optional[PurePath]: | ||||
|         if transformed := self._transformer.transform(path): | ||||
|             return self._deduplicator.fixup_path(transformed) | ||||
|         return None | ||||
|  | ||||
|     @anoncritical | ||||
|     @_iorepeat(3, "downloading video") | ||||
|     async def _download_video( | ||||
|         self, | ||||
|         original_path: PurePath, | ||||
|         element: IliasPageElement, | ||||
|         dl: DownloadToken | ||||
|     ) -> None: | ||||
|         stream_elements: List[IliasPageElement] = [] | ||||
|         async with dl as (bar, sink): | ||||
|             page = IliasPage(await self._get_page(element.url), element.url, element) | ||||
|             stream_elements = page.get_child_elements() | ||||
|  | ||||
|             if len(stream_elements) > 1: | ||||
|                 log.explain(f"Found multiple video streams for {element.name}") | ||||
|             else: | ||||
|                 log.explain(f"Using single video mode for {element.name}") | ||||
|                 stream_element = stream_elements[0] | ||||
|  | ||||
|                 transformed_path = self._to_local_video_path(original_path) | ||||
|                 if not transformed_path: | ||||
|                     raise CrawlError(f"Download returned a path but transform did not for {original_path}") | ||||
|  | ||||
|                 # We do not have a local cache yet | ||||
|                 if self._output_dir.resolve(transformed_path).exists(): | ||||
|                     log.explain(f"Video for {element.name} existed locally") | ||||
|                 else: | ||||
|                     await self._stream_from_url(stream_element.url, sink, bar, is_video=True) | ||||
|                 self.report.add_custom_value(str(original_path), [original_path.name]) | ||||
|                 return | ||||
|  | ||||
|         contained_video_paths: List[str] = [] | ||||
|  | ||||
|         for stream_element in stream_elements: | ||||
|             video_path = original_path.parent / stream_element.name | ||||
|             contained_video_paths.append(str(video_path)) | ||||
|  | ||||
|             maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER) | ||||
|             if not maybe_dl: | ||||
|                 continue | ||||
|             async with maybe_dl as (bar, sink): | ||||
|                 log.explain(f"Streaming video from real url {stream_element.url}") | ||||
|                 await self._stream_from_url(stream_element.url, sink, bar, is_video=True) | ||||
|  | ||||
|         self.report.add_custom_value(str(original_path), contained_video_paths) | ||||
|  | ||||
|     async def _handle_file( | ||||
|         self, | ||||
|         element: IliasPageElement, | ||||
|         element_path: PurePath, | ||||
|     ) -> Optional[Coroutine[Any, Any, None]]: | ||||
|         maybe_dl = await self.download(element_path, mtime=element.mtime) | ||||
|         if not maybe_dl: | ||||
|             return None | ||||
|         return self._download_file(element, maybe_dl) | ||||
|  | ||||
|     @anoncritical | ||||
|     @_iorepeat(3, "downloading file") | ||||
|     async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: | ||||
|         assert dl  # The function is only reached when dl is not None | ||||
|         async with dl as (bar, sink): | ||||
|             await self._stream_from_url(element.url, sink, bar, is_video=False) | ||||
|  | ||||
|     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: | ||||
|         async def try_stream() -> bool: | ||||
|             async with self.session.get(url, allow_redirects=is_video) as resp: | ||||
|                 if not is_video: | ||||
|                     # Redirect means we weren't authenticated | ||||
|                     if hdrs.LOCATION in resp.headers: | ||||
|                         return False | ||||
|                 # we wanted a video but got HTML | ||||
|                 if is_video and "html" in resp.content_type: | ||||
|                     return False | ||||
|  | ||||
|                 if resp.content_length: | ||||
|                     bar.set_total(resp.content_length) | ||||
|  | ||||
|                 async for data in resp.content.iter_chunked(1024): | ||||
|                     sink.file.write(data) | ||||
|                     bar.advance(len(data)) | ||||
|  | ||||
|                 sink.done() | ||||
|             return True | ||||
|  | ||||
|         auth_id = await self._current_auth_id() | ||||
|         if await try_stream(): | ||||
|             return | ||||
|  | ||||
|         await self.authenticate(auth_id) | ||||
|  | ||||
|         if not await try_stream(): | ||||
|             raise CrawlError("File streaming failed after authenticate()") | ||||
|  | ||||
|     async def _handle_forum( | ||||
|         self, | ||||
|         element: IliasPageElement, | ||||
|         element_path: PurePath, | ||||
|     ) -> Optional[Coroutine[Any, Any, None]]: | ||||
|         maybe_cl = await self.crawl(element_path) | ||||
|         if not maybe_cl: | ||||
|             return None | ||||
|         return self._crawl_forum(element, maybe_cl) | ||||
|  | ||||
|     @_iorepeat(3, "crawling forum") | ||||
|     @anoncritical | ||||
|     async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None: | ||||
|         elements: List[IliasForumThread] = [] | ||||
|  | ||||
|         async with cl: | ||||
|             next_stage_url = element.url | ||||
|             while next_stage_url: | ||||
|                 log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") | ||||
|                 log.explain(f"URL: {next_stage_url}") | ||||
|  | ||||
|                 soup = await self._get_page(next_stage_url) | ||||
|                 page = IliasPage(soup, next_stage_url, None) | ||||
|  | ||||
|                 if next := page.get_next_stage_element(): | ||||
|                     next_stage_url = next.url | ||||
|                 else: | ||||
|                     break | ||||
|  | ||||
|             download_data = page.get_download_forum_data() | ||||
|             if not download_data: | ||||
|                 raise CrawlWarning("Failed to extract forum data") | ||||
|             if download_data.empty: | ||||
|                 log.explain("Forum had no threads") | ||||
|                 elements = [] | ||||
|                 return | ||||
|             html = await self._post_authenticated(download_data.url, download_data.form_data) | ||||
|             elements = parse_ilias_forum_export(soupify(html)) | ||||
|  | ||||
|         elements.sort(key=lambda elem: elem.title) | ||||
|  | ||||
|         tasks: List[Awaitable[None]] = [] | ||||
|         for elem in elements: | ||||
|             tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem))) | ||||
|  | ||||
|         # And execute them | ||||
|         await self.gather(tasks) | ||||
|  | ||||
|     @anoncritical | ||||
|     @_iorepeat(3, "saving forum thread") | ||||
|     async def _download_forum_thread( | ||||
|         self, | ||||
|         parent_path: PurePath, | ||||
|         element: IliasForumThread, | ||||
|     ) -> None: | ||||
|         path = parent_path / (_sanitize_path_name(element.title) + ".html") | ||||
|         maybe_dl = await self.download(path, mtime=element.mtime) | ||||
|         if not maybe_dl: | ||||
|             return | ||||
|  | ||||
|         async with maybe_dl as (bar, sink): | ||||
|             content = element.title_tag.prettify() | ||||
|             content += element.content_tag.prettify() | ||||
|             sink.file.write(content.encode("utf-8")) | ||||
|             sink.done() | ||||
|  | ||||
|     async def _get_page(self, url: str) -> BeautifulSoup: | ||||
|         auth_id = await self._current_auth_id() | ||||
|         async with self.session.get(url) as request: | ||||
|             soup = soupify(await request.read()) | ||||
|             if self._is_logged_in(soup): | ||||
|                 return soup | ||||
|  | ||||
|         # We weren't authenticated, so try to do that | ||||
|         await self.authenticate(auth_id) | ||||
|  | ||||
|         # Retry once after authenticating. If this fails, we will die. | ||||
|         async with self.session.get(url) as request: | ||||
|             soup = soupify(await request.read()) | ||||
|             if self._is_logged_in(soup): | ||||
|                 return soup | ||||
|         raise CrawlError("get_page failed even after authenticating") | ||||
|  | ||||
|     async def _post_authenticated( | ||||
|         self, | ||||
|         url: str, | ||||
|         data: dict[str, Union[str, List[str]]] | ||||
|     ) -> BeautifulSoup: | ||||
|         auth_id = await self._current_auth_id() | ||||
|  | ||||
|         form_data = aiohttp.FormData() | ||||
|         for key, val in data.items(): | ||||
|             form_data.add_field(key, val) | ||||
|  | ||||
|         async with self.session.post(url, data=form_data(), allow_redirects=False) as request: | ||||
|             if request.status == 200: | ||||
|                 return await request.read() | ||||
|  | ||||
|         # We weren't authenticated, so try to do that | ||||
|         await self.authenticate(auth_id) | ||||
|  | ||||
|         # Retry once after authenticating. If this fails, we will die. | ||||
|         async with self.session.post(url, data=data, allow_redirects=False) as request: | ||||
|             if request.status == 200: | ||||
|                 return await request.read() | ||||
|         raise CrawlError("post_authenticated failed even after authenticating") | ||||
|  | ||||
|     # We repeat this as the login method in shibboleth doesn't handle I/O errors. | ||||
|     # Shibboleth is quite reliable as well, the repeat is likely not critical here. | ||||
|     @ _iorepeat(3, "Login", failure_is_error=True) | ||||
|     async def _authenticate(self) -> None: | ||||
|         await self._shibboleth_login.login(self.session) | ||||
|  | ||||
|     @ staticmethod | ||||
|     def _is_logged_in(soup: BeautifulSoup) -> bool: | ||||
|         # Normal ILIAS pages | ||||
|         mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") | ||||
|         if mainbar is not None: | ||||
|             login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) | ||||
|             shib_login = soup.find(id="button_shib_login") | ||||
|             return not login_button and not shib_login | ||||
|  | ||||
|         # Personal Desktop | ||||
|         if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): | ||||
|             return True | ||||
|  | ||||
|         # Video listing embeds do not have complete ILIAS html. Try to match them by | ||||
|         # their video listing table | ||||
|         video_table = soup.find( | ||||
|             recursive=True, | ||||
|             name="table", | ||||
|             attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} | ||||
|         ) | ||||
|         if video_table is not None: | ||||
|             return True | ||||
|         # The individual video player wrapper page has nothing of the above. | ||||
|         # Match it by its playerContainer. | ||||
|         if soup.select_one("#playerContainer") is not None: | ||||
|             return True | ||||
|         return False | ||||
|  | ||||
|  | ||||
class KitShibbolethLogin:
    """
    Login via KIT's shibboleth system.
    """

    def __init__(self, authenticator: Authenticator, tfa_authenticator: Optional[Authenticator]) -> None:
        # Source of username and password
        self._auth = authenticator
        # Source of the two-factor token; created lazily if not provided
        self._tfa_auth = tfa_authenticator

    async def login(self, sess: aiohttp.ClientSession) -> None:
        """
        Performs the ILIAS Shibboleth authentication dance and saves the login
        cookies it receives.

        This function should only be called whenever it is detected that you're
        not logged in. The cookies obtained should be good for a few minutes,
        maybe even an hour or two.

        Raises a CrawlError if an expected login form (or its CSRF token) is
        missing, or if the Shibboleth entitlements have to be reviewed
        manually.
        """

        # Equivalent: Click on "Mit KIT-Account anmelden" button in
        # https://ilias.studium.kit.edu/login.php
        url = f"{_ILIAS_URL}/shib_login.php"
        data = {
            "sendLogin": "1",
            "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
            "il_target": "",
            "home_organization_selection": "Weiter",
        }
        soup: Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful] = await _shib_post(sess, url, data)

        if isinstance(soup, KitShibbolethBackgroundLoginSuccessful):
            return

        # Attempt to login using credentials, if necessary
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
            form = soup.find("form", {"class": "full content", "method": "post"})
            action, csrf_token = self._form_action_and_csrf(form)

            # Equivalent: Enter credentials in
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
            url = "https://idp.scc.kit.edu" + action
            username, password = await self._auth.credentials()
            data = {
                "_eventId_proceed": "",
                "j_username": username,
                "j_password": password,
                "csrf_token": csrf_token
            }
            soup = await _post(sess, url, data)

            if soup.find(id="attributeRelease"):
                raise CrawlError(
                    "ILIAS Shibboleth entitlements changed! "
                    "Please log in once in your browser and review them"
                )

            if self._tfa_required(soup):
                soup = await self._authenticate_tfa(sess, soup)

            if not self._login_successful(soup):
                self._auth.invalidate_credentials()

        # Equivalent: Being redirected via JS automatically
        # (or clicking "Continue" if you have JS disabled)
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        url = f"{_ILIAS_URL}/Shibboleth.sso/SAML2/POST"
        data = {  # using the info obtained in the while loop above
            "RelayState": relay_state["value"],
            "SAMLResponse": saml_response["value"],
        }
        # The response body is not needed, but the response has to be released
        # again, which leaving the context manager takes care of.
        async with sess.post(url, data=data):
            pass

    async def _authenticate_tfa(
            self,
            session: aiohttp.ClientSession,
            soup: BeautifulSoup
    ) -> BeautifulSoup:
        """
        Submit a two-factor token to the form on the given page.

        Returns the parsed page the server answered with.
        """
        if not self._tfa_auth:
            self._tfa_auth = TfaAuthenticator("ilias-anon-tfa")

        # Searching the form here so that this fails before asking for
        # the token rather than after asking.
        form = soup.find("form", {"method": "post"})
        action, csrf_token = self._form_action_and_csrf(form)

        tfa_token = await self._tfa_auth.password()

        # Equivalent: Enter token in
        # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
        url = "https://idp.scc.kit.edu" + action
        data = {
            "_eventId_proceed": "",
            "j_tokenNumber": tfa_token,
            "csrf_token": csrf_token
        }
        return await _post(session, url, data)

    @staticmethod
    def _form_action_and_csrf(form: Any) -> tuple[str, str]:
        """
        Extract the action and the CSRF token from a login form.

        Raises a CrawlError if the form or its CSRF token input is missing.
        """
        if form is None:
            raise CrawlError("Could not find the login form on the Shibboleth page")

        csrf_input = form.find("input", {"name": "csrf_token"})
        if csrf_input is None:
            raise CrawlError("Could not find the CSRF token in the Shibboleth login form")

        return form["action"], csrf_input["value"]

    @staticmethod
    def _login_successful(soup: BeautifulSoup) -> bool:
        """
        Return whether the page carries both inputs needed for the final SAML post.
        """
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        return relay_state is not None and saml_response is not None

    @staticmethod
    def _tfa_required(soup: BeautifulSoup) -> bool:
        """
        Return whether the page asks for a two-factor token.
        """
        return soup.find(id="j_tokenNumber") is not None
|  | ||||
|  | ||||
async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
    """
    POST data to a URL and return the parsed response body.
    """
    async with session.post(url, data=data) as response:
        body = await response.read()
        return soupify(body)
|  | ||||
|  | ||||
async def _shib_post(
    session: aiohttp.ClientSession,
    url: str,
    data: Any
) -> Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful]:
    """
    aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected
    by Shibboleth. Thanks a lot. So now we unroll the requests manually, parse location headers and
    build encoded URL objects ourselves... Who thought mangling location header was a good idea??

    The redirect chain is followed by hand for up to two hops after the initial POST. Returns a
    KitShibbolethBackgroundLoginSuccessful marker if the first redirect already points back to
    ILIAS, otherwise the parsed page the chain ends at.

    Raises a CrawlWarning if an expected location header is missing.
    """
    log.explain_topic("Shib login POST")
    # Hop 0: the POST itself. Redirects are never followed automatically in here.
    async with session.post(url, data=data, allow_redirects=False) as response:
        location = response.headers.get("location")
        log.explain(f"Got location {location!r}")
        if not location:
            raise CrawlWarning(f"Login failed (1), no location header present at {url}")
        # encoded=True: take the location header as-is (see docstring)
        correct_url = yarl.URL(location, encoded=True)
        log.explain(f"Corrected location to {correct_url!r}")

        if str(correct_url).startswith(_ILIAS_URL):
            log.explain("ILIAS recognized our shib token and logged us in in the background, returning")
            return KitShibbolethBackgroundLoginSuccessful()

        # Hop 1: follow the first redirect by hand
        async with session.get(correct_url, allow_redirects=False) as response:
            location = response.headers.get("location")
            log.explain(f"Redirected to {location!r} with status {response.status}")
            # If shib still has a valid session, it will directly respond to the request
            if location is None:
                log.explain("Shib recognized us, returning its response directly")
                return soupify(await response.read())

            as_yarl = yarl.URL(response.url)
            # Probably not needed anymore, but might catch a few weird situations with a nicer message
            if not location or not as_yarl.host:
                raise CrawlWarning(f"Login failed (2), no location header present at {correct_url}")

            # The location is used as the path of the next URL, combined with the scheme and
            # host of the URL that was just requested
            correct_url = yarl.URL.build(
                scheme=as_yarl.scheme,
                host=as_yarl.host,
                path=location,
                encoded=True
            )
            log.explain(f"Corrected location to {correct_url!r}")

            # Hop 2: whatever this answers with is returned as the final page
            async with session.get(correct_url, allow_redirects=False) as response:
                return soupify(await response.read())
							
								
								
									
										170
									
								
								PFERD/crawl/kit_ipd_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										170
									
								
								PFERD/crawl/kit_ipd_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,170 @@ | ||||
| import os | ||||
| import re | ||||
| from dataclasses import dataclass | ||||
| from pathlib import PurePath | ||||
| from typing import Awaitable, List, Optional, Pattern, Set, Union | ||||
| from urllib.parse import urljoin | ||||
|  | ||||
| from bs4 import BeautifulSoup, Tag | ||||
|  | ||||
| from ..config import Config | ||||
| from ..logging import ProgressBar, log | ||||
| from ..output_dir import FileSink | ||||
| from ..utils import soupify | ||||
| from .crawler import CrawlError | ||||
| from .http_crawler import HttpCrawler, HttpCrawlerSection | ||||
|  | ||||
|  | ||||
class KitIpdCrawlerSection(HttpCrawlerSection):
    """
    Config section of the KIT IPD crawler.
    """

    def target(self) -> str:
        """
        Return the configured "target" URL.

        A missing value is reported via missing_value, one that does not start
        with "https://" via invalid_value.
        """
        url = self.s.get("target")
        if not url:
            self.missing_value("target")

        if not url.startswith("https://"):
            self.invalid_value("target", url, "Should be a URL")

        return url

    def link_regex(self) -> Pattern[str]:
        """
        Return the compiled "link_regex" option.

        If the option is not set, a default matching links to pdf, zip, c, cpp
        and java files is used.
        """
        default_pattern = r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$"
        configured = self.s.get("link_regex", default_pattern)
        return re.compile(configured)
|  | ||||
|  | ||||
@dataclass(unsafe_hash=True)
class KitIpdFile:
    """
    A downloadable file found on a KIT IPD page.

    unsafe_hash=True makes instances hashable by their fields, so they can be
    collected in a set.
    """
    # File name; the crawler uses the base name of the URL
    name: str
    # URL the file is downloaded from
    url: str
|  | ||||
|  | ||||
@dataclass
class KitIpdFolder:
    """
    A named group of files found on a KIT IPD page.

    Folders are hashed by their name only.
    """
    # Name of the folder
    name: str
    # Files belonging to the folder
    files: List[KitIpdFile]

    def explain(self) -> None:
        """
        Log the folder and each of its files as explanation output.
        """
        log.explain_topic(f"Folder {self.name!r}")
        for entry in self.files:
            log.explain(f"File {entry.name!r} (href={entry.url!r})")

    def __hash__(self) -> int:
        return hash(self.name)
|  | ||||
|  | ||||
class KitIpdCrawler(HttpCrawler):
    """
    Crawler for a single KIT IPD web page.

    Links on the page matching the configured regex are downloaded. Links
    inside a table that is preceded by a heading are grouped into a folder
    named after that heading; all other links end up in the root folder.
    """

    def __init__(
            self,
            name: str,
            section: KitIpdCrawlerSection,
            config: Config,
    ):
        super().__init__(name, section, config)
        # URL of the page to crawl
        self._url = section.target()
        # Pattern a link's href has to match to be treated as a file
        self._file_regex = section.link_regex()

    async def _run(self) -> None:
        """
        Crawl the page and download all folders and orphan files found on it.
        """
        maybe_cl = await self.crawl(PurePath("."))
        if not maybe_cl:
            return

        tasks: List[Awaitable[None]] = []

        async with maybe_cl:
            for item in await self._fetch_items():
                if isinstance(item, KitIpdFolder):
                    tasks.append(self._crawl_folder(item))
                else:
                    # Orphan files are placed in the root folder
                    tasks.append(self._download_file(PurePath("."), item))

        await self.gather(tasks)

    async def _crawl_folder(self, folder: KitIpdFolder) -> None:
        """
        Download all files of a folder, unless crawling the folder is declined.
        """
        path = PurePath(folder.name)
        if not await self.crawl(path):
            return

        tasks = [self._download_file(path, file) for file in folder.files]

        await self.gather(tasks)

    async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
        """
        Download a single file into the given parent folder.
        """
        element_path = parent / file.name
        maybe_dl = await self.download(element_path)
        if not maybe_dl:
            return

        async with maybe_dl as (bar, sink):
            await self._stream_from_url(file.url, sink, bar)

    async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
        """
        Fetch the page and collect the folders and orphan files on it.

        Every matching link is looked at: if a folder label can be found for
        it, the whole folder is extracted (once per folder), otherwise the link
        is treated as an orphan file.
        """
        page = await self.get_page()
        elements: List[Tag] = self._find_file_links(page)
        items: Set[Union[KitIpdFile, KitIpdFolder]] = set()

        for element in elements:
            folder_label = self._find_folder_label(element)
            if folder_label:
                folder = self._extract_folder(folder_label)
                if folder not in items:
                    items.add(folder)
                    folder.explain()
            else:
                file = self._extract_file(element)
                items.add(file)
                log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                log.explain("Attributing it to root folder")

        return items

    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
        """
        Build a folder from its label tag.

        The folder is named after the stripped text of the label and contains
        all matching links of the next sibling table.
        """
        files: List[KitIpdFile] = []
        name = folder_tag.getText().strip()

        container: Tag = folder_tag.findNextSibling(name="table")
        for link in self._find_file_links(container):
            files.append(self._extract_file(link))

        return KitIpdFolder(name, files)

    @staticmethod
    def _find_folder_label(file_link: Tag) -> Optional[Tag]:
        """
        Find the heading labelling the table a link is located in.

        Returns None if the link is not inside a table or the table has no
        preceding sibling heading (h1 to h6).
        """
        enclosing_table: Tag = file_link.findParent(name="table")
        if enclosing_table is None:
            return None
        return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))

    def _extract_file(self, link: Tag) -> KitIpdFile:
        """
        Build a file from a link, named after the base name of its absolute URL.
        """
        url = self._abs_url_from_link(link)
        name = os.path.basename(url)
        return KitIpdFile(name, url)

    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
        """
        Return all anchors below tag whose href matches the file regex.
        """
        return tag.findAll(name="a", attrs={"href": self._file_regex})

    def _abs_url_from_link(self, link_tag: Tag) -> str:
        """
        Resolve a link's href against the crawled page's URL.
        """
        return urljoin(self._url, link_tag.get("href"))

    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
        """
        Stream the content behind a URL into the sink.

        Redirects are not followed.

        Raises a CrawlError if the server answers with an HTTP error status,
        so that the body of an error page is never stored as the file.
        """
        async with self.session.get(url, allow_redirects=False) as resp:
            if resp.status == 403:
                raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
            if resp.status >= 400:
                raise CrawlError(f"Received HTTP {resp.status} when trying to download {url}")
            if resp.content_length:
                bar.set_total(resp.content_length)

            async for data in resp.content.iter_chunked(1024):
                sink.file.write(data)
                bar.advance(len(data))

            sink.done()

    async def get_page(self) -> BeautifulSoup:
        """
        Fetch the crawled page and return it parsed, with HTML comments removed.
        """
        async with self.session.get(self._url) as request:
            # The web page for Algorithmen für Routenplanung contains some
            # weird comments that beautifulsoup doesn't parse correctly. This
            # hack enables those pages to be crawled, and should hopefully not
            # cause issues on other pages.
            content = (await request.read()).decode("utf-8")
            content = re.sub(r"<!--.*?-->", "", content)
            return soupify(content.encode("utf-8"))
							
								
								
									
										117
									
								
								PFERD/crawl/local_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										117
									
								
								PFERD/crawl/local_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,117 @@ | ||||
| import asyncio | ||||
| import datetime | ||||
| import random | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Optional | ||||
|  | ||||
| from ..config import Config | ||||
| from .crawler import Crawler, CrawlerSection, anoncritical | ||||
|  | ||||
|  | ||||
class LocalCrawlerSection(CrawlerSection):
    """Typed accessors for the config options of a local crawler section."""

    def target(self) -> Path:
        """Return the ``target`` option as a path with ``~`` expanded.

        A missing option is reported through ``self.missing_value``.
        """
        raw = self.s.get("target")
        if raw is None:
            self.missing_value("target")
        return Path(raw).expanduser()

    def crawl_delay(self) -> float:
        """Return the ``crawl_delay`` option (default 0.0); negatives are invalid."""
        delay = self.s.getfloat("crawl_delay", fallback=0.0)
        if delay < 0:
            self.invalid_value("crawl_delay", delay, "Must not be negative")
        return delay

    def download_delay(self) -> float:
        """Return the ``download_delay`` option (default 0.0); negatives are invalid."""
        delay = self.s.getfloat("download_delay", fallback=0.0)
        if delay < 0:
            self.invalid_value("download_delay", delay, "Must not be negative")
        return delay

    def download_speed(self) -> Optional[int]:
        """Return the ``download_speed`` option, or ``None`` if it is not set.

        Values that are zero or negative are reported as invalid.
        """
        speed = self.s.getint("download_speed")
        if speed is None:
            return None
        if speed <= 0:
            self.invalid_value("download_speed", speed, "Must be greater than 0")
        return speed
|  | ||||
|  | ||||
class LocalCrawler(Crawler):
    """Crawler that copies a local file or directory tree.

    The configured delays and the optional speed limit are applied while
    listing directories and copying files.
    """

    def __init__(
            self,
            name: str,
            section: LocalCrawlerSection,
            config: Config,
    ):
        super().__init__(name, section, config)

        self._target = config.default_section.working_dir() / section.target()
        self._crawl_delay = section.crawl_delay()
        self._download_delay = section.download_delay()
        self._download_speed = section.download_speed()

        if self._download_speed:
            # One block is a tenth of the per-second budget. The block size
            # must never be 0 (speeds below 10): reading 0 bytes returns b"",
            # which the copy loop treats as end-of-file, so every file would
            # be written empty.
            self._block_size = max(1, self._download_speed // 10)
        else:
            self._block_size = 1024**2  # 1 MiB

    async def _run(self) -> None:
        """Crawl the configured target, mapping it to the empty relative path."""
        await self._crawl_path(self._target, PurePath())

    @anoncritical
    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
        """Dispatch ``path`` to the directory or file handler.

        Paths that are neither a directory nor a regular file are ignored.
        """
        if path.is_dir():
            await self._crawl_dir(path, pure)
        elif path.is_file():
            await self._crawl_file(path, pure)

    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
        """List ``path`` and crawl all of its children."""
        cl = await self.crawl(pure)
        if not cl:
            return

        tasks = []

        async with cl:
            # Sleep for a random time between half the crawl delay and the
            # full crawl delay before listing the directory.
            await asyncio.sleep(random.uniform(
                0.5 * self._crawl_delay,
                self._crawl_delay,
            ))

            for child in path.iterdir():
                pure_child = cl.path / child.name
                tasks.append(self._crawl_path(child, pure_child))

        # The child coroutines are awaited only after leaving the crawl
        # context of this directory.
        await self.gather(tasks)

    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        """Copy ``path`` into the download sink, honouring the speed limit."""
        stat = path.stat()
        mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
        dl = await self.download(pure, mtime=mtime)
        if not dl:
            return

        async with dl as (bar, sink):
            # Sleep for a random time between half the download delay and the
            # full download delay before copying.
            await asyncio.sleep(random.uniform(
                0.5 * self._download_delay,
                self._download_delay,
            ))

            bar.set_total(stat.st_size)

            with open(path, "rb") as f:
                while True:
                    data = f.read(self._block_size)
                    if not data:
                        break

                    sink.file.write(data)
                    bar.advance(len(data))

                    if self._download_speed:
                        # Sleeping block_size / speed seconds per block keeps
                        # the average rate at the configured speed; the
                        # +-20% jitter varies the individual pauses.
                        delay = self._block_size / self._download_speed
                        delay = random.uniform(0.8 * delay, 1.2 * delay)
                        await asyncio.sleep(delay)

                sink.done()
							
								
								
									
										85
									
								
								PFERD/deduplicator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										85
									
								
								PFERD/deduplicator.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,85 @@ | ||||
| from pathlib import PurePath | ||||
| from typing import Iterator, Set | ||||
|  | ||||
| from .logging import log | ||||
| from .utils import fmt_path | ||||
|  | ||||
|  | ||||
def name_variants(path: PurePath) -> Iterator[PurePath]:
    """Yield an endless sequence of numbered alternatives for ``path``.

    The counter starts at 1 and is inserted between stem and suffix. It is
    joined to the stem with a space if the stem already contains a space and
    with an underscore otherwise.
    """
    stem, suffix, parent = path.stem, path.suffix, path.parent
    joiner = "_" if " " not in stem else " "

    counter = 0
    while True:
        counter += 1
        yield parent / f"{stem}{joiner}{counter}{suffix}"
|  | ||||
|  | ||||
class Deduplicator:
    """Hands out unique output paths, optionally made safe for Windows.

    Every path returned by ``mark`` is remembered together with its parent
    directories, so a later request for the same path gets a numbered variant.
    """

    FORBIDDEN_CHARS = '<>:"/\\|?*'
    FORBIDDEN_NAMES = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
    }

    def __init__(self, windows_paths: bool) -> None:
        """
        Args:
            windows_paths: Whether paths are rewritten for Windows
                compatibility before they are used.
        """
        self._windows_paths = windows_paths

        self._known: Set[PurePath] = set()

    def _add(self, path: PurePath) -> None:
        """Remember ``path`` and all of its parents as taken."""
        self._known.add(path)

        # The last parent is just "."
        for parent in list(path.parents)[:-1]:
            self._known.add(parent)

    def _fixup_element(self, name: str) -> str:
        """Return ``name`` (a single path element) adjusted for Windows."""
        # For historical reasons, windows paths have some odd restrictions that
        # we're trying to avoid. See:
        # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file

        for char in self.FORBIDDEN_CHARS:
            name = name.replace(char, "_")

        # Device names are reserved regardless of case and regardless of any
        # extension(s) that follow them: "nul", "NUL.txt" and "NUL.tar.gz" all
        # refer to the device. Hence only the part before the first dot is
        # compared, case-insensitively.
        base, dot, rest = name.partition(".")
        if base.upper() in self.FORBIDDEN_NAMES:
            name = f"{base}_{dot}{rest}"

        # Names must not end in a space or a period.
        if name.endswith((" ", ".")):
            name += "_"

        return name

    def _fixup_for_windows(self, path: PurePath) -> PurePath:
        """Apply ``_fixup_element`` to every part of ``path``, logging changes."""
        new_path = PurePath(*[self._fixup_element(elem) for elem in path.parts])
        if new_path != path:
            log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility")
        return new_path

    def fixup_path(self, path: PurePath) -> PurePath:
        """Fixes up the path for windows, if enabled. Returns the path unchanged otherwise."""
        if self._windows_paths:
            return self._fixup_for_windows(path)
        return path

    def mark(self, path: PurePath) -> PurePath:
        """Reserve ``path`` and return the path that was actually reserved.

        If ``path`` (after the optional Windows fixup) is already taken, the
        first unused numbered variant from ``name_variants`` is reserved and
        returned instead.
        """
        if self._windows_paths:
            path = self._fixup_for_windows(path)

        if path not in self._known:
            self._add(path)
            return path

        log.explain(f"Path {fmt_path(path)} is already taken, finding a new name")

        for variant in name_variants(path):
            if variant in self._known:
                log.explain(f"Path {fmt_path(variant)} is taken as well")
                continue

            log.explain(f"Found unused path {fmt_path(variant)}")
            self._add(variant)
            return variant

        # The "name_variants" iterator returns infinitely many paths
        raise RuntimeError("Unreachable")
							
								
								
									
										61
									
								
								PFERD/ffm.py
									
									
									
									
									
								
							
							
						
						
									
										61
									
								
								PFERD/ffm.py
									
									
									
									
									
								
							| @@ -1,61 +0,0 @@ | ||||
| # Fakultät für Mathematik (FfM) | ||||
|  | ||||
| import logging | ||||
| import pathlib | ||||
| import re | ||||
|  | ||||
| import bs4 | ||||
| import requests | ||||
|  | ||||
| from .organizer import Organizer | ||||
| from .utils import stream_to_path, PrettyLogger | ||||
|  | ||||
| __all__ = ["FfM"] | ||||
| logger = logging.getLogger(__name__) | ||||
| pretty = PrettyLogger(logger) | ||||
|  | ||||
class FfM:
    """Synchronizer for PDF files linked on www.math.kit.edu pages."""

    BASE_URL = "http://www.math.kit.edu/"
    # The dots of the host name are escaped so that they only match literal
    # dots; unescaped, the pattern would also accept hosts such as
    # "wwwxmath.kit.edu". Group 1 is the file name after the last slash.
    LINK_RE = re.compile(r"^https?://www\.math\.kit\.edu/.*/(.*\.pdf)$")

    def __init__(self, base_path):
        """
        Args:
            base_path: Base directory; sync directories are resolved
                relative to it.
        """
        self.base_path = base_path

        self._session = requests.Session()

    def synchronize(self, urlpart, to_dir, transform=lambda x: x):
        """Download all matching PDFs of one page into ``to_dir``.

        Args:
            urlpart: Path appended to ``BASE_URL`` to form the page URL.
            to_dir: Target directory, relative to ``base_path``.
            transform: Maps the found file path to its target path; a result
                of ``None`` skips the file.
        """
        pretty.starting_synchronizer(to_dir, "FfM", urlpart)

        sync_path = pathlib.Path(self.base_path, to_dir)

        orga = Organizer(self.base_path, sync_path)
        orga.clean_temp_dir()

        self._crawl(orga, urlpart, transform)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    def _crawl(self, orga, urlpart, transform):
        """Fetch the page and download every link matching ``LINK_RE``."""
        url = self.BASE_URL + urlpart
        r = self._session.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        for found in soup.find_all("a", href=self.LINK_RE):
            url = found["href"]
            filename = self.LINK_RE.match(url).group(1).replace("/", ".")
            logger.debug(f"Found file {filename} at {url}")

            old_path = pathlib.PurePath(filename)
            new_path = transform(old_path)
            if new_path is None:
                continue
            logger.debug(f"Transformed from {old_path} to {new_path}")

            # Download to a temporary file first, then let the organizer put
            # it at its final path.
            temp_path = orga.temp_file()
            self._download(url, temp_path)
            orga.add_file(temp_path, new_path)

    def _download(self, url, to_path):
        """Stream the response for ``url`` to ``to_path``."""
        with self._session.get(url, stream=True) as r:
            stream_to_path(r, to_path)
							
								
								
									
										109
									
								
								PFERD/ilias.py
									
									
									
									
									
								
							
							
						
						
									
										109
									
								
								PFERD/ilias.py
									
									
									
									
									
								
							| @@ -1,109 +0,0 @@ | ||||
| # ILIAS | ||||
|  | ||||
| import logging | ||||
| import pathlib | ||||
| import re | ||||
|  | ||||
| from .ilias_authenticators import ShibbolethAuthenticator | ||||
| from .organizer import Organizer | ||||
| from .utils import PrettyLogger | ||||
|  | ||||
| __all__ = ["Ilias"] | ||||
| logger = logging.getLogger(__name__) | ||||
| pretty = PrettyLogger(logger) | ||||
|  | ||||
class Ilias:
    """Synchronizer that crawls an ILIAS folder tree and downloads its files."""

    FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)")
    DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)")

    def __init__(self, base_path, cookie_file):
        """
        Args:
            base_path: Base directory; sync directories and the cookie file
                are resolved relative to it.
            cookie_file: Cookie file path, relative to ``base_path``.
        """
        self.base_path = base_path

        self._auth = ShibbolethAuthenticator(base_path / cookie_file)

    def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
        """Crawl the folder ``ref_id`` and download its files into ``to_dir``.

        Args:
            ref_id: ILIAS reference id of the folder to start at.
            to_dir: Target directory, relative to ``base_path``.
            transform: Maps a found file path to its target path; a result of
                ``None`` skips the file.
            filter: Called with each directory path; directories for which it
                returns a falsy value are not searched.
        """
        pretty.starting_synchronizer(to_dir, "ILIAS", f"ref_id {ref_id}")

        organizer = Organizer(self.base_path, pathlib.Path(self.base_path, to_dir))
        organizer.clean_temp_dir()

        found = self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter)
        self._download(organizer, found, transform)

        organizer.clean_sync_dir()
        organizer.clean_temp_dir()

    def _crawl(self, dir_path, dir_id, filter_):
        """Recursively collect ``(path, file_id)`` pairs below ``dir_id``."""
        soup = self._auth.get_webpage(dir_id)

        collected = []

        for name, file_id in self._find_files(soup):
            file_path = dir_path / name
            collected.append((file_path, file_id))
            logger.debug(f"Found file {file_path}")

        for name, sub_id in self._find_dirs(soup):
            sub_path = dir_path / name
            logger.debug(f"Found dir {sub_path}")
            if not filter_(sub_path):
                logger.info(f"Not searching {sub_path}")
                continue
            logger.info(f"Searching {sub_path}")
            collected.extend(self._crawl(sub_path, sub_id, filter_))

        return collected

    def _download(self, orga, files, transform):
        """Download all ``files`` in sorted order, skipping those mapped to ``None``."""
        for path, file_id in sorted(files):
            target = transform(path)
            if target is None:
                continue
            # Download to a temporary file first, then let the organizer put
            # it at its final path.
            temp_path = orga.temp_file()
            self._auth.download_file(file_id, temp_path)
            orga.add_file(temp_path, target)

    def _find_files(self, soup):
        """Return ``(file_name, file_id)`` for every file link on the page.

        The name is built from the link text and the type shown in the item
        properties. Names already used on this page get a
        ``(duplicate N)`` marker.
        """
        results = []
        used_names = set()

        links = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.FILE_RE})
        for link in links:
            stem = link.string.strip().replace("/", ".")
            properties = link.parent.parent.parent.find("div", {"class": "il_ItemProperties"})
            extension = properties.find("span").string.strip()
            file_id = re.search(self.FILE_RE, link.get("href")).group(1)

            candidate = f"{stem}.{extension}"
            counter = 1
            while candidate in used_names:
                candidate = f"{stem} (duplicate {counter}).{extension}"
                counter += 1

            results.append((candidate, file_id))
            used_names.add(candidate)

        return results

    def _find_dirs(self, soup):
        """Return ``(dir_name, dir_id)`` for every folder link on the page.

        If the page contains an alert box, no directories are reported.
        """
        if soup.find_all("div", {"class": "alert", "role": "alert"}):
            return []

        results = []
        for link in soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.DIR_RE}):
            label = link.string.strip().replace("/", ".")
            number = re.search(self.DIR_RE, link.get("href")).group(1)
            results.append((label, f"fold_{number}"))

        return results
| @@ -1,181 +0,0 @@ | ||||
| # This file is called IliasAuthenticators because there are multiple mechanisms | ||||
| # for authenticating with Ilias (even though only the Shibboleth is currently | ||||
| # implemented). Most of what the ShibbolethAuthenticator currently does is | ||||
| # not Shibboleth specific; this mess would have to be cleaned up before | ||||
| # actually implementing any other authentication method. | ||||
| # | ||||
| # I think the only other method is the password prompt when clicking the log in | ||||
| # button. | ||||
|  | ||||
| import getpass | ||||
| import http.cookiejar | ||||
| import logging | ||||
| import time | ||||
|  | ||||
| import bs4 | ||||
| import requests | ||||
|  | ||||
| from .utils import ContentTypeException, stream_to_path | ||||
|  | ||||
| __all__ = ["ShibbolethAuthenticator"] | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
class ShibbolethAuthenticator:
    """Authenticated access to the KIT ILIAS via Shibboleth.

    Wraps a ``requests`` session whose cookies are loaded from and saved to a
    cookie file, and re-authenticates whenever a request shows that the
    session is not logged in.
    """

    ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php"

    # Content types that are accepted as file downloads. The comparison is an
    # exact string match against the response header.
    ALLOWED_CONTENT_TYPES = [
        "application/pdf",
        "application/zip",
        "application/msword",
        "application/vnd.wolfram.nb",
        "application/octet-stream",
        "application/excel",
        "text/xml",
        "text/xml;charset=UTF-8",
        "text/plain",
        "text/plain;charset=UTF-8",
        "image/jpeg",
        "image/png",
    ]

    def __init__(self, cookie_file) -> None:
        """
        Args:
            cookie_file: Path of the cookie file. A missing or unreadable
                file is tolerated and only logged.
        """
        # Because LWPCookieJar insists on the path being str-like instead of
        # Path-like.
        cookie_file = str(cookie_file)

        cookies = http.cookiejar.LWPCookieJar(cookie_file)
        try:
            logger.info(f"Loading old cookies from {cookie_file!r}")
            cookies.load(ignore_discard=True)
        except (FileNotFoundError, http.cookiejar.LoadError):
            # Logger.warn is a deprecated alias; warning() is the supported name.
            logger.warning(f"No (valid) cookie file found at {cookie_file!r}, ignoring...")

        self._session = requests.Session()
        self._session.cookies = cookies

    def _authenticate(self):
        """
        Performs the ILIAS Shibboleth authentication dance and saves the login
        cookies it receives.

        This function should only be called whenever it is detected that you're
        not logged in. The cookies obtained should be good for a few minutes,
        maybe even an hour or two.
        """

        # Equivalent: Click on "Mit KIT-Account anmelden" button in
        # https://ilias.studium.kit.edu/login.php
        logger.debug("Begin authentication process with ILIAS")
        url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
        data = {
            "sendLogin": "1",
            "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
            "target": "/shib_login.php",
            "home_organization_selection": "Mit KIT-Account anmelden",
        }
        r = self._session.post(url, data=data)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        # Attempt to login using credentials, if necessary
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
            form = soup.find("form", {"class": "form2", "method": "post"})
            action = form["action"]

            print("Please enter Shibboleth credentials.")
            username = getpass.getpass(prompt="Username: ")
            password = getpass.getpass(prompt="Password: ")

            # Equivalent: Enter credentials in
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
            logger.debug("Attempt to log in to Shibboleth using credentials")
            url = "https://idp.scc.kit.edu" + action
            data = {
                "_eventId_proceed": "",
                "j_username": username,
                "j_password": password,
            }
            r = self._session.post(url, data=data)
            soup = bs4.BeautifulSoup(r.text, "html.parser")

            if not self._login_successful(soup):
                print("Incorrect credentials.")

        # Saving progress
        logger.info("Saving cookies (successfully authenticated with Shibboleth)")
        self._session.cookies.save(ignore_discard=True)

        # Equivalent: Being redirected via JS automatically
        # (or clicking "Continue" if you have JS disabled)
        logger.debug("Redirect back to ILIAS with login information")
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
        data = {  # using the info obtained in the while loop above
            "RelayState": relay_state["value"],
            "SAMLResponse": saml_response["value"],
        }
        self._session.post(url, data=data)

        # Saving progress
        logger.info("Saving cookies (successfully authenticated with ILIAS)")
        self._session.cookies.save(ignore_discard=True)

    def _login_successful(self, soup):
        """Return whether the page carries both SAML hand-over inputs."""
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        return relay_state is not None and saml_response is not None

    def _is_logged_in(self, soup):
        """Return whether the page contains the ``userlog`` list item."""
        userlog = soup.find("li", {"id": "userlog"})
        return userlog is not None

    def get_webpage(self, object_id):
        """Fetch the ILIAS page for ``object_id`` and return it parsed.

        Authenticates and retries for as long as the fetched page shows that
        the session is not logged in.
        """
        params = {"target": object_id}

        while True:
            logger.debug(f"Getting {self.ILIAS_GOTO} {params}")
            r = self._session.get(self.ILIAS_GOTO, params=params)
            soup = bs4.BeautifulSoup(r.text, "html.parser")

            if self._is_logged_in(soup):
                return soup
            else:
                logger.info("Not logged in, authenticating...")
                self._authenticate()

    def get_webpage_by_refid(self, ref_id):
        """Fetch the page of the folder with reference id ``ref_id``."""
        return self.get_webpage(f"fold_{ref_id}")

    def _download(self, url, params, to_path):
        """Try to stream a file to ``to_path``.

        Returns:
            True if the file was downloaded, False if an HTML page was
            returned while not logged in.

        Raises:
            ContentTypeException: If an HTML page was returned although the
                session is logged in, or if the content type is unknown.
        """
        with self._session.get(url, params=params, stream=True) as r:
            content_type = r.headers["content-type"]

            if content_type in self.ALLOWED_CONTENT_TYPES:
                # Yay, we got the file :)
                stream_to_path(r, to_path)
                return True
            elif content_type == "text/html":
                # Dangit, we're probably not logged in.
                soup = bs4.BeautifulSoup(r.text, "html.parser")
                if self._is_logged_in(soup):
                    raise ContentTypeException(
                            "Attempting to download a web page, not a file")
                return False
            else:
                # What *did* we get?
                raise ContentTypeException(
                        f"Unknown file of type {content_type}")

    def download_file(self, file_id, to_path):
        """Download the file ``file_id`` to ``to_path``, authenticating as needed."""
        params = {"target": file_id}

        while True:
            success = self._download(self.ILIAS_GOTO, params, to_path)

            if success:
                return
            else:
                logger.info("Not logged in, authenticating...")
                self._authenticate()
							
								
								
									
										97
									
								
								PFERD/limiter.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								PFERD/limiter.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,97 @@ | ||||
| import asyncio | ||||
| import time | ||||
| from contextlib import asynccontextmanager | ||||
| from dataclasses import dataclass | ||||
| from typing import AsyncIterator, Optional | ||||
|  | ||||
|  | ||||
@dataclass
class Slot:
    # True while a task occupies this slot.
    active: bool = False
    # time.monotonic() value taken when the slot was last released, or None
    # if the slot has never been released.
    last_left: Optional[float] = None


class Limiter:
    """Limits how many crawl and download tasks run at the same time.

    Every task occupies one of ``task_limit`` slots; downloads additionally
    need one of ``download_limit`` download permits. A slot is only reused
    once ``task_delay`` seconds have passed since it was last released.
    """

    def __init__(
            self,
            task_limit: int,
            download_limit: int,
            task_delay: float
    ):
        """
        Raises:
            ValueError: If a limit is smaller than 1, if the download limit
                exceeds the task limit, or if the delay is negative.
        """
        if task_limit <= 0:
            raise ValueError("task limit must be at least 1")
        if download_limit <= 0:
            raise ValueError("download limit must be at least 1")
        if download_limit > task_limit:
            raise ValueError("download limit can't be greater than task limit")
        if task_delay < 0:
            raise ValueError("Task delay must not be negative")

        self._slots = [Slot() for _ in range(task_limit)]
        self._downloads = download_limit
        self._delay = task_delay

        self._condition = asyncio.Condition()

    def _acquire_slot(self) -> Optional[Slot]:
        """Mark the first inactive slot as active and return it, else ``None``."""
        for slot in self._slots:
            if not slot.active:
                slot.active = True
                return slot

        return None

    async def _wait_for_slot_delay(self, slot: Slot) -> None:
        """Sleep until the task delay since the slot's last release has passed."""
        if slot.last_left is not None:
            # A monotonic clock keeps the delay unaffected by wall-clock
            # adjustments.
            delay = slot.last_left + self._delay - time.monotonic()
            if delay > 0:
                await asyncio.sleep(delay)

    def _release_slot(self, slot: Slot) -> None:
        """Record the release time of ``slot`` and mark it inactive."""
        slot.last_left = time.monotonic()
        slot.active = False

    @asynccontextmanager
    async def limit_crawl(self) -> AsyncIterator[None]:
        """Hold a task slot for the duration of the ``async with`` body."""
        slot: Slot
        async with self._condition:
            while True:
                if found_slot := self._acquire_slot():
                    slot = found_slot
                    break
                await self._condition.wait()

        try:
            # The delay wait is inside the try block so that a task cancelled
            # while waiting still hands its slot back.
            await self._wait_for_slot_delay(slot)
            yield
        finally:
            async with self._condition:
                self._release_slot(slot)
                self._condition.notify_all()

    @asynccontextmanager
    async def limit_download(self) -> AsyncIterator[None]:
        """Hold a task slot and a download permit for the ``async with`` body."""
        slot: Slot
        async with self._condition:
            while True:
                if self._downloads <= 0:
                    await self._condition.wait()
                    continue

                if found_slot := self._acquire_slot():
                    slot = found_slot
                    self._downloads -= 1
                    break

                await self._condition.wait()

        try:
            # The delay wait is inside the try block so that a task cancelled
            # while waiting still hands back its slot and download permit.
            await self._wait_for_slot_delay(slot)
            yield
        finally:
            async with self._condition:
                self._release_slot(slot)
                self._downloads += 1
                self._condition.notify_all()
							
								
								
									
										271
									
								
								PFERD/logging.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										271
									
								
								PFERD/logging.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,271 @@ | ||||
| import asyncio | ||||
| import sys | ||||
| import traceback | ||||
| from contextlib import asynccontextmanager, contextmanager | ||||
| # TODO In Python 3.9 and above, ContextManager is deprecated | ||||
| from typing import AsyncIterator, ContextManager, Iterator, List, Optional | ||||
|  | ||||
| from rich.console import Console, Group | ||||
| from rich.live import Live | ||||
| from rich.markup import escape | ||||
| from rich.panel import Panel | ||||
| from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID, TextColumn, TimeRemainingColumn, | ||||
|                            TransferSpeedColumn) | ||||
| from rich.table import Column | ||||
|  | ||||
|  | ||||
class ProgressBar:
    """Handle for one task of a ``rich`` progress display."""

    def __init__(self, progress: Progress, taskid: TaskID):
        self._progress, self._taskid = progress, taskid

    def advance(self, amount: float = 1) -> None:
        """Advance the task by ``amount``."""
        progress, task = self._progress, self._taskid
        progress.advance(task, advance=amount)

    def set_total(self, total: float) -> None:
        """Set the task's total and then start the task."""
        progress, task = self._progress, self._taskid
        progress.update(task, total=total)
        progress.start_task(task)
|  | ||||
|  | ||||
class Log:
    """
    Console output for PFERD: log messages plus crawl and download bars.

    While an "exclusive_output" section is active, messages passed to the
    logging functions are buffered and only printed once the section ends
    (or once "unlock" is called).
    """

    # Actions in status lines and bar descriptions are padded to this width.
    STATUS_WIDTH = 11

    def __init__(self) -> None:
        self.console = Console(highlight=False)

        self._crawl_progress = Progress(
            TextColumn("{task.description}", table_column=Column(ratio=1)),
            BarColumn(),
            TimeRemainingColumn(),
            expand=True,
        )
        self._download_progress = Progress(
            TextColumn("{task.description}", table_column=Column(ratio=1)),
            TransferSpeedColumn(),
            DownloadColumn(),
            BarColumn(),
            TimeRemainingColumn(),
            expand=True,
        )

        self._live = Live(console=self.console, transient=True)
        self._update_live()

        self._showing_progress = False
        self._progress_suspended = False
        # Held for the duration of an exclusive_output section
        self._lock = asyncio.Lock()
        # Messages buffered while the progress display is suspended
        self._lines: List[str] = []

        # Whether different parts of the output are enabled or disabled
        self.output_explain = False
        self.output_status = True
        self.output_report = True

    def _update_live(self) -> None:
        """
        Rebuild the live display from the progress objects that have tasks.
        """

        elements = []
        if self._crawl_progress.task_ids:
            elements.append(self._crawl_progress)
        if self._download_progress.task_ids:
            elements.append(self._download_progress)

        group = Group(*elements)
        self._live.update(group)

    def _flush_lines(self) -> None:
        """
        Print all buffered messages and empty the buffer.

        The buffer is detached before printing so that no message can be
        replayed a second time by a later flush.
        """

        pending, self._lines = self._lines, []
        for line in pending:
            self.print(line)

    def _styled_action(self, style: str, action: str) -> str:
        """
        Pad and escape "action" and wrap it in the markup given by "style".
        """

        padded = escape(f"{action:<{self.STATUS_WIDTH}}")
        return f"{style}{padded}[/]"

    @contextmanager
    def show_progress(self) -> Iterator[None]:
        """
        Show the live progress display for the duration of the with block.

        Raises a RuntimeError if the progress display is already showing.
        """

        if self._showing_progress:
            raise RuntimeError("Calling 'show_progress' while already showing progress")

        self._showing_progress = True
        try:
            with self._live:
                yield
        finally:
            self._showing_progress = False

    @asynccontextmanager
    async def exclusive_output(self) -> AsyncIterator[None]:
        """
        Stop the live display for the duration of the async with block.

        Messages logged in the meantime are buffered and printed after the
        live display has been started again. Raises a RuntimeError if the
        progress display is not currently showing.
        """

        if not self._showing_progress:
            raise RuntimeError("Calling 'exclusive_output' while not showing progress")

        async with self._lock:
            self._progress_suspended = True
            self._live.stop()
            try:
                yield
            finally:
                self._live.start()
                self._progress_suspended = False
                self._flush_lines()

    def unlock(self) -> None:
        """
        Get rid of an exclusive output state.

        This function is meant to let PFERD print log messages after the event
        loop was forcibly stopped and if it will not be started up again. After
        this is called, it is not safe to use any functions except the logging
        functions (print, warn, ...).
        """

        self._progress_suspended = False
        # Also empties the buffer, so calling this again prints nothing twice
        self._flush_lines()

    def print(self, text: str) -> None:
        """
        Print a normal message. Allows markup.

        The message is buffered instead while the progress display is
        suspended.
        """

        if self._progress_suspended:
            self._lines.append(text)
        else:
            self.console.print(text)

    # TODO Print errors (and warnings?) to stderr

    def warn(self, text: str) -> None:
        """
        Print a warning message. Allows no markup.
        """

        self.print(f"[bold bright_red]Warning[/] {escape(text)}")

    def warn_contd(self, text: str) -> None:
        """
        Print further lines of a warning message. Allows no markup.
        """

        self.print(f"{escape(text)}")

    def error(self, text: str) -> None:
        """
        Print an error message. Allows no markup.
        """

        self.print(f"[bold bright_red]Error[/] [red]{escape(text)}")

    def error_contd(self, text: str) -> None:
        """
        Print further lines of an error message. Allows no markup.
        """

        self.print(f"[red]{escape(text)}")

    def unexpected_exception(self) -> None:
        """
        Call this in an "except" clause to log an unexpected exception.
        """

        t, v, tb = sys.exc_info()
        if t is None or v is None or tb is None:
            # We're not currently handling an exception, so somebody probably
            # called this function where they shouldn't.
            self.error("Something unexpected happened")
            self.error_contd("")
            for line in traceback.format_stack():
                self.error_contd(line[:-1])  # Without the newline
            self.error_contd("")
        else:
            self.error("An unexpected exception occurred")
            self.error_contd("")
            self.error_contd(traceback.format_exc())

        # Our print function doesn't take types other than strings, but the
        # underlying rich.print function does. This call is a special case
        # anyways, and we're calling it internally, so this should be fine.
        self.print(Panel.fit("""
Please copy your program output and send it to the PFERD maintainers, either
directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
        """.strip()))  # type: ignore

    def explain_topic(self, text: str) -> None:
        """
        Print a top-level explain text. Allows no markup.

        Does nothing unless "output_explain" is set.
        """

        if self.output_explain:
            self.print(f"[yellow]{escape(text)}")

    def explain(self, text: str) -> None:
        """
        Print an indented explain text. Allows no markup.

        Does nothing unless "output_explain" is set.
        """

        if self.output_explain:
            self.print(f"  {escape(text)}")

    def status(self, style: str, action: str, text: str, suffix: str = "") -> None:
        """
        Print a status update while crawling. Allows markup in the "style"
        argument which will be applied to the "action" string.

        The "text" argument is escaped, the "suffix" argument is not. Does
        nothing unless "output_status" is set.
        """

        if self.output_status:
            prefix = self._styled_action(style, action)
            self.print(f"{prefix} {escape(text)} {suffix}")

    def report(self, text: str) -> None:
        """
        Print a report after crawling. Allows markup.

        Does nothing unless "output_report" is set.
        """

        if self.output_report:
            self.print(text)

    @contextmanager
    def _bar(
            self,
            progress: Progress,
            description: str,
            total: Optional[float],
    ) -> Iterator[ProgressBar]:
        """
        Add a task to "progress" for the duration of the with block.

        Without a total, the task is added unstarted; ProgressBar.set_total
        starts it later. The task is removed again when the block is left.
        """

        if total is None:
            # Indeterminate progress bar
            taskid = progress.add_task(description, start=False)
        else:
            taskid = progress.add_task(description, total=total)
        self._update_live()

        try:
            yield ProgressBar(progress, taskid)
        finally:
            progress.remove_task(taskid)
            self._update_live()

    def crawl_bar(
            self,
            style: str,
            action: str,
            text: str,
            total: Optional[float] = None,
    ) -> ContextManager[ProgressBar]:
        """
        Create a bar in the crawl progress display.

        Allows markup in the "style" argument which will be applied to the
        "action" string. The "text" argument is inserted without escaping.
        """

        description = f"{self._styled_action(style, action)} {text}"
        return self._bar(self._crawl_progress, description, total)

    def download_bar(
            self,
            style: str,
            action: str,
            text: str,
            total: Optional[float] = None,
    ) -> ContextManager[ProgressBar]:
        """
        Create a bar in the download progress display.

        Allows markup in the "style" argument which will be applied to the
        "action" string. The "text" argument is inserted without escaping.
        """

        description = f"{self._styled_action(style, action)} {text}"
        return self._bar(self._download_progress, description, total)
|  | ||||
|  | ||||
| log = Log() | ||||
							
								
								
									
										108
									
								
								PFERD/norbert.py
									
									
									
									
									
								
							
							
						
						
									
										108
									
								
								PFERD/norbert.py
									
									
									
									
									
								
							| @@ -1,108 +0,0 @@ | ||||
| # Norberts Prog-Tuts | ||||
|  | ||||
| import logging | ||||
| import pathlib | ||||
| import re | ||||
| import zipfile | ||||
|  | ||||
| import bs4 | ||||
| import requests | ||||
|  | ||||
| from .organizer import Organizer | ||||
| from .utils import rename, stream_to_path, PrettyLogger | ||||
|  | ||||
| __all__ = ["Norbert"] | ||||
| logger = logging.getLogger(__name__) | ||||
| pretty = PrettyLogger(logger) | ||||
|  | ||||
class Norbert:
    """
    Synchronizer for the zip files linked on the page at BASE_URL.
    """

    BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/"
    # Matches the relative links to the zip files; group 1 is the file name
    LINK_RE = re.compile(r"^progtut/.*/(.*\.zip)$")

    def __init__(self, base_path):
        """
        base_path - directory that contains the sync directories
        """

        self.base_path = base_path

        self._session = requests.Session()

    def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True):
        """
        Crawl BASE_URL and sync the zip files found there to base_path/to_dir.

        transform - called with the path of every candidate file; a file is
                    skipped if it returns None, otherwise the returned path
                    is used as the file's path inside the sync directory
        unzip     - called with the path of every zip file; if it returns a
                    truthy value the zip's contents are synced, otherwise
                    the zip file itself
        """

        pretty.starting_synchronizer(to_dir, "Norbert")

        sync_path = pathlib.Path(self.base_path, to_dir)
        orga = Organizer(self.base_path, sync_path)

        orga.clean_temp_dir()

        files = self._crawl()
        self._download(orga, files, transform, unzip)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    def _crawl(self):
        """
        Return a list of (path, url) tuples, one per zip link on the page.
        """

        response = self._session.get(self.BASE_URL)
        soup = bs4.BeautifulSoup(response.text, "html.parser")

        files = []

        for found in soup.find_all("a", href=self.LINK_RE):
            url = found["href"]
            full_url = self.BASE_URL + url

            filename = re.search(self.LINK_RE, url).group(1)
            path = pathlib.PurePath(filename)

            logger.debug(f"Found zip file {filename} at {full_url}")
            files.append((path, full_url))

        return files

    def _download(self, orga, files, transform, unzip):
        """
        Download every (path, url) tuple in "files" and hand the results to
        the organizer "orga". See "synchronize" for "transform" and "unzip".
        """

        for path, url in sorted(files):
            # Yes, we want the zip file contents
            if unzip(path):
                logger.debug(f"Downloading and unzipping {path}")
                # NOTE(review): presumably the zip's name without ".zip",
                # used as the parent of its contents - confirm in utils.rename
                zip_path = rename(path, path.stem)

                # Download zip file
                temp_file = orga.temp_file()
                self._download_zip(url, temp_file)

                # Search the zip file for files to extract
                temp_dir = orga.temp_dir()
                with zipfile.ZipFile(temp_file, "r") as zf:
                    for info in zf.infolist():
                        # Only interested in the files themselves, the directory
                        # structure is created automatically by orga.add_file()
                        if info.is_dir():
                            continue

                        file_path = zip_path / pathlib.PurePath(info.filename)
                        logger.debug(f"Found {info.filename} at path {file_path}")

                        new_path = transform(file_path)
                        if new_path is not None:
                            # Extract into the temp dir and add the extracted
                            # file, no separate temp file is needed for this
                            extracted_path = pathlib.Path(zf.extract(info, temp_dir))
                            orga.add_file(extracted_path, new_path)

            # No, we only want the zip file itself
            else:
                logger.debug(f"Only downloading {path}")

                new_path = transform(path)
                if new_path is not None:
                    temp_file = orga.temp_file()
                    self._download_zip(url, temp_file)
                    orga.add_file(temp_file, new_path)

    def _download_zip(self, url, to_path):
        """
        Stream the response for "url" to the file at "to_path".
        """

        with self._session.get(url, stream=True) as r:
            stream_to_path(r, to_path)
| @@ -1,151 +0,0 @@ | ||||
| import filecmp | ||||
| import logging | ||||
| import pathlib | ||||
| import shutil | ||||
|  | ||||
| from . import utils | ||||
|  | ||||
| __all__ = ["Organizer"] | ||||
| logger = logging.getLogger(__name__) | ||||
| pretty = utils.PrettyLogger(logger) | ||||
|  | ||||
class Organizer:
    """
    Moves downloaded files into a sync directory and keeps track of them so
    that leftover files can be cleaned up afterwards.
    """

    def __init__(self, base_dir, sync_dir):
        """
        base_dir - the .tmp directory will be created here
        sync_dir - synced files will be moved here
        Both are expected to be concrete pathlib paths.
        """

        self._base_dir = base_dir
        self._sync_dir = sync_dir

        self._temp_dir = pathlib.Path(self._base_dir, ".tmp")
        # Shared counter for the names of temp files and temp dirs
        self._temp_nr = 0

        # check if base/sync dir exist?

        # Resolved paths of all files passed to add_file() and kept
        self._added_files = set()

    def clean_temp_dir(self):
        """
        Delete the temp dir including its contents and create it anew.
        """

        if self._temp_dir.exists():
            shutil.rmtree(self._temp_dir)
        self._temp_dir.mkdir(exist_ok=True)
        logger.debug(f"Cleaned temp dir: {self._temp_dir}")

    def temp_dir(self):
        """
        Return the path to a new directory in the temp dir. The directory
        itself is not created.
        """

        nr = self._temp_nr
        self._temp_nr += 1
        temp_dir = pathlib.Path(self._temp_dir, f"{nr:08}").resolve()
        logger.debug(f"Produced new temp dir: {temp_dir}")
        return temp_dir

    def temp_file(self):
        """
        Return the path to a new file in the temp dir. The file itself is
        not created.
        """

        # generate the path to a new temp file in base_path/.tmp/
        # make sure no two paths are the same
        nr = self._temp_nr
        self._temp_nr += 1
        temp_file = pathlib.Path(self._temp_dir, f"{nr:08}.tmp").resolve()
        logger.debug(f"Produced new temp file: {temp_file}")
        return temp_file

    def add_file(self, from_path, to_path):
        """
        Move the file at from_path to sync_dir/to_path and remember it.

        An existing file with identical contents is left untouched. If a
        directory is in the way, the user is asked whether to replace it.
        Raises utils.FileNotFoundException if from_path does not exist.
        """

        if not from_path.exists():
            raise utils.FileNotFoundException(f"Could not add file at {from_path}")

        # check if sync_dir/to_path is inside sync_dir?
        to_path = pathlib.Path(self._sync_dir, to_path)

        if to_path.exists() and to_path.is_dir():
            if self._prompt_yes_no(f"Overwrite folder {to_path} with file?", default=False):
                shutil.rmtree(to_path)
            else:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(f"Could not add file {to_path}")
                return

        if to_path.exists():
            if filecmp.cmp(from_path, to_path, shallow=False):
                pretty.ignored_file(to_path)

                # remember path for later reference
                self._added_files.add(to_path.resolve())
                logger.debug(f"Added file {to_path.resolve()}")

                # No further action needed, especially not overwriting symlinks...
                return
            else:
                pretty.modified_file(to_path)
        else:
            pretty.new_file(to_path)

        # copy the file from from_path to sync_dir/to_path
        # If the file being replaced was a symlink, the link itself is overwritten,
        # not the file the link points to.
        to_path.parent.mkdir(parents=True, exist_ok=True)
        from_path.replace(to_path)
        logger.debug(f"Moved {from_path} to {to_path}")

        # remember path for later reference, after the new file was written
        # This is necessary here because otherwise, resolve() would resolve the symlink too.
        self._added_files.add(to_path.resolve())
        logger.debug(f"Added file {to_path.resolve()}")

    def clean_sync_dir(self):
        """
        Offer to delete every file in the sync dir that was not added via
        add_file() and remove directories that end up empty.
        """

        self._clean_dir(self._sync_dir, remove_parent=False)
        logger.debug(f"Cleaned sync dir: {self._sync_dir}")

    def _clean_dir(self, path, remove_parent=True):
        """
        Recursively clean "path". If remove_parent is set, try to remove
        "path" itself afterwards, which only succeeds if it is empty.
        """

        for child in sorted(path.iterdir()):
            logger.debug(f"Looking at {child.resolve()}")
            if child.is_dir():
                self._clean_dir(child, remove_parent=True)
            elif child.resolve() not in self._added_files:
                if self._prompt_yes_no(f"Delete {child}?", default=False):
                    child.unlink()
                    logger.debug(f"Deleted {child}")

        if remove_parent:
            try:
                path.rmdir()
            except OSError:  # directory not empty
                pass

    def _prompt_yes_no(self, question, default=None):
        """
        Ask a yes/no question on stdin until a valid answer is given.

        An empty answer selects "default" unless it is None. Returns True
        for yes and False for no.
        """

        if default is True:
            prompt = "[Y/n]"
        elif default is False:
            prompt = "[y/N]"
        else:
            prompt = "[y/n]"

        text = f"{question} {prompt} "
        WRONG_REPLY = "Please reply with 'yes'/'y' or 'no'/'n'."

        while True:
            response = input(text).strip().lower()
            if response in {"yes", "ye", "y"}:
                return True
            elif response in {"no", "n"}:
                return False
            elif response == "":
                if default is None:
                    print(WRONG_REPLY)
                else:
                    return default
            else:
                print(WRONG_REPLY)
|  | ||||
| # How to use: | ||||
| # | ||||
| # 1. Before downloading any files | ||||
| # orga = Organizer("/home/user/sync/", "/home/user/sync/bookstore/") | ||||
| # orga.clean_temp_dir() | ||||
| # | ||||
| # 2. Downloading a file | ||||
| # tempfile = orga.temp_file() | ||||
| # download_something_to(tempfile) | ||||
| # orga.add_file(tempfile, "books/douglas_adams/hhgttg") | ||||
| # | ||||
| # 3. After downloading all files | ||||
| # orga.clean_sync_dir() | ||||
| # orga.clean_temp_dir() | ||||
							
								
								
									
										517
									
								
								PFERD/output_dir.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										517
									
								
								PFERD/output_dir.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,517 @@ | ||||
| import filecmp | ||||
| import json | ||||
| import os | ||||
| import random | ||||
| import shutil | ||||
| import string | ||||
| from contextlib import contextmanager | ||||
| from dataclasses import dataclass | ||||
| from datetime import datetime | ||||
| from enum import Enum | ||||
| from pathlib import Path, PurePath | ||||
| from typing import BinaryIO, Iterator, Optional, Tuple | ||||
|  | ||||
| from .logging import log | ||||
| from .report import Report, ReportLoadError | ||||
| from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no | ||||
|  | ||||
| SUFFIX_CHARS = string.ascii_lowercase + string.digits | ||||
| SUFFIX_LENGTH = 6 | ||||
| TRIES = 5 | ||||
|  | ||||
|  | ||||
class OutputDirError(Exception):
    """
    Raised when an operation on the output directory can't be carried out.
    """
|  | ||||
|  | ||||
class Redownload(Enum):
    """
    Policy for downloading files that are already present locally.
    """

    NEVER = "never"
    NEVER_SMART = "never-smart"
    ALWAYS = "always"
    ALWAYS_SMART = "always-smart"

    @staticmethod
    def from_string(string: str) -> "Redownload":
        """
        Return the policy whose value is "string".

        Raises a ValueError naming the accepted values if there is none.
        """

        try:
            policy = Redownload(string)
        except ValueError:
            choices = "'never', 'never-smart', 'always', 'always-smart'"
            raise ValueError(f"must be one of {choices}")
        return policy
|  | ||||
|  | ||||
class OnConflict(Enum):
    """
    Policy for resolving conflicts between local and remote files.
    """

    PROMPT = "prompt"
    LOCAL_FIRST = "local-first"
    REMOTE_FIRST = "remote-first"
    NO_DELETE = "no-delete"

    @staticmethod
    def from_string(string: str) -> "OnConflict":
        """
        Return the policy whose value is "string".

        Raises a ValueError naming the accepted values if there is none.
        """

        try:
            policy = OnConflict(string)
        except ValueError:
            choices = "'prompt', 'local-first', 'remote-first', 'no-delete'"
            raise ValueError(f"must be one of {choices}")
        return policy
|  | ||||
|  | ||||
@dataclass
class Heuristics:
    """
    Hints about a remote file used when deciding whether to download it.
    """

    # Modification time of the remote file, None if it is not known
    mtime: Optional[datetime]
|  | ||||
class FileSink:
    """
    A binary file to write a download to, plus a flag that records whether
    the download was marked as done.
    """

    def __init__(self, file: BinaryIO):
        self._done = False
        self._file = file

    @property
    def file(self) -> BinaryIO:
        """
        The file object this sink was created with.
        """

        return self._file

    def done(self) -> None:
        """
        Mark the sink as done.
        """

        self._done = True

    def is_done(self) -> bool:
        """
        Whether "done" has been called on this sink.
        """

        return self._done
|  | ||||
|  | ||||
@dataclass
class DownloadInfo:
    """
    Everything FileSinkToken hands back to the OutputDirectory once a
    download has ended.
    """

    remote_path: PurePath
    path: PurePath
    # Location of the file below the output directory's root
    local_path: Path
    # Temporary file the download was written to
    tmp_path: Path
    heuristics: Heuristics
    on_conflict: OnConflict
    # Whether the file sink was marked as done
    success: bool = False
|  | ||||
|  | ||||
class FileSinkToken(ReusableAsyncContextManager[FileSink]):
    """
    Async context manager that provides a FileSink for a single download.

    Entering it creates a new temporary file and returns the matching
    FileSink. Exiting it closes the file and hands the information about
    the download back to the OutputDirectory.
    """

    def __init__(
            self,
            output_dir: "OutputDirectory",
            remote_path: PurePath,
            path: PurePath,
            local_path: Path,
            heuristics: Heuristics,
            on_conflict: OnConflict,
    ):
        super().__init__()

        self._output_dir = output_dir
        self._remote_path = remote_path
        self._path = path
        self._local_path = local_path
        self._heuristics = heuristics
        self._on_conflict = on_conflict

    async def _on_aenter(self) -> FileSink:
        output_dir = self._output_dir
        tmp_path, tmp_file = await output_dir._create_tmp_file(self._local_path)
        sink = FileSink(tmp_file)

        async def report_download() -> None:
            # The sink's state is read here, not when the callback is
            # registered, so it reflects what happened inside the block.
            info = DownloadInfo(
                remote_path=self._remote_path,
                path=self._path,
                local_path=self._local_path,
                tmp_path=tmp_path,
                heuristics=self._heuristics,
                on_conflict=self._on_conflict,
                success=sink.is_done(),
            )
            await self._output_dir._after_download(info)

        # Keep this order: the file is registered last so that, with
        # callbacks unwinding in reverse, it is closed before the report.
        # NOTE(review): assumes self._stack is an AsyncExitStack - confirm
        self._stack.push_async_callback(report_download)
        self._stack.enter_context(tmp_file)

        return sink
|  | ||||
|  | ||||
| class OutputDirectory: | ||||
|     REPORT_FILE = PurePath(".report") | ||||
|  | ||||
|     def __init__( | ||||
|             self, | ||||
|             root: Path, | ||||
|             redownload: Redownload, | ||||
|             on_conflict: OnConflict, | ||||
|     ): | ||||
|         if os.name == "nt": | ||||
|             # Windows limits the path length to 260 for some historical reason. | ||||
|             # If you want longer paths, you will have to add the "\\?\" prefix | ||||
|             # in front of your path. See: | ||||
|             # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation | ||||
|             self._root = Path("\\\\?\\" + str(root.absolute())) | ||||
|         else: | ||||
|             self._root = root | ||||
|  | ||||
|         self._redownload = redownload | ||||
|         self._on_conflict = on_conflict | ||||
|  | ||||
|         self._report_path = self.resolve(self.REPORT_FILE) | ||||
|         self._report = Report() | ||||
|         self._prev_report: Optional[Report] = None | ||||
|  | ||||
|         self.register_reserved(self.REPORT_FILE) | ||||
|  | ||||
|     @property | ||||
|     def report(self) -> Report: | ||||
|         return self._report | ||||
|  | ||||
|     @property | ||||
|     def prev_report(self) -> Optional[Report]: | ||||
|         return self._prev_report | ||||
|  | ||||
|     def prepare(self) -> None: | ||||
|         log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}") | ||||
|  | ||||
|         try: | ||||
|             self._root.mkdir(parents=True, exist_ok=True) | ||||
|         except OSError: | ||||
|             raise OutputDirError("Failed to create base directory") | ||||
|  | ||||
|     def register_reserved(self, path: PurePath) -> None: | ||||
|         self._report.mark_reserved(path) | ||||
|  | ||||
|     def resolve(self, path: PurePath) -> Path: | ||||
|         """ | ||||
|         May throw an OutputDirError. | ||||
|         """ | ||||
|  | ||||
|         if ".." in path.parts: | ||||
|             raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}") | ||||
|         if "." in path.parts: | ||||
|             raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}") | ||||
|  | ||||
|         return self._root / path | ||||
|  | ||||
|     def _should_download( | ||||
|             self, | ||||
|             local_path: Path, | ||||
|             heuristics: Heuristics, | ||||
|             redownload: Redownload, | ||||
|             on_conflict: OnConflict, | ||||
|     ) -> bool: | ||||
|         if not local_path.exists(): | ||||
|             log.explain("No corresponding file present locally") | ||||
|             return True | ||||
|  | ||||
|         if on_conflict == OnConflict.LOCAL_FIRST: | ||||
|             # Whatever is here, it will never be overwritten, so we don't need | ||||
|             # to download the file. | ||||
|             log.explain("Conflict resolution is 'local-first' and path exists") | ||||
|             return False | ||||
|  | ||||
|         if not local_path.is_file(): | ||||
|             # We know that there is *something* here that's not a file. | ||||
|             log.explain("Non-file (probably a directory) present locally") | ||||
|  | ||||
|             # If on_conflict is LOCAL_FIRST or NO_DELETE, we know that it would | ||||
|             # never be overwritten. It also doesn't have any relevant stats to | ||||
|             # update. This means that we don't have to download the file | ||||
|             # because we'd just always throw it away again. | ||||
|             if on_conflict in {OnConflict.LOCAL_FIRST, OnConflict.NO_DELETE}: | ||||
|                 log.explain(f"Conflict resolution is {on_conflict.value!r}") | ||||
|                 return False | ||||
|  | ||||
|             return True | ||||
|  | ||||
|         log.explain(f"Redownload policy is {redownload.value}") | ||||
|  | ||||
|         if redownload == Redownload.NEVER: | ||||
|             return False | ||||
|         elif redownload == Redownload.ALWAYS: | ||||
|             return True | ||||
|  | ||||
|         stat = local_path.stat() | ||||
|  | ||||
|         remote_newer = None | ||||
|  | ||||
|         # Python on Windows crashes when faced with timestamps around the unix epoch | ||||
|         if heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): | ||||
|             mtime = heuristics.mtime | ||||
|             remote_newer = mtime.timestamp() > stat.st_mtime | ||||
|             if remote_newer: | ||||
|                 log.explain("Remote file seems to be newer") | ||||
|             else: | ||||
|                 log.explain("Remote file doesn't seem to be newer") | ||||
|  | ||||
|         if redownload == Redownload.NEVER_SMART: | ||||
|             if remote_newer is None: | ||||
|                 return False | ||||
|             else: | ||||
|                 return remote_newer | ||||
|         elif redownload == Redownload.ALWAYS_SMART: | ||||
|             if remote_newer is None: | ||||
|                 return True | ||||
|             else: | ||||
|                 return remote_newer | ||||
|  | ||||
|         # This should never be reached | ||||
|         raise ValueError(f"{redownload!r} is not a valid redownload policy") | ||||
|  | ||||
|     # The following conflict resolution functions all return False if the local | ||||
|     # file(s) should be kept and True if they should be replaced by the remote | ||||
|     # files. | ||||
|  | ||||
|     async def _conflict_lfrf( | ||||
|             self, | ||||
|             on_conflict: OnConflict, | ||||
|             path: PurePath, | ||||
|     ) -> bool: | ||||
|         if on_conflict == OnConflict.PROMPT: | ||||
|             async with log.exclusive_output(): | ||||
|                 prompt = f"Replace {fmt_path(path)} with remote file?" | ||||
|                 return await prompt_yes_no(prompt, default=False) | ||||
|         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||
|             return False | ||||
|         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||
|             return True | ||||
|         elif on_conflict == OnConflict.NO_DELETE: | ||||
|             return True | ||||
|  | ||||
|         # This should never be reached | ||||
|         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||
|  | ||||
|     async def _conflict_ldrf( | ||||
|             self, | ||||
|             on_conflict: OnConflict, | ||||
|             path: PurePath, | ||||
|     ) -> bool: | ||||
|         if on_conflict == OnConflict.PROMPT: | ||||
|             async with log.exclusive_output(): | ||||
|                 prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" | ||||
|                 return await prompt_yes_no(prompt, default=False) | ||||
|         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||
|             return False | ||||
|         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||
|             return True | ||||
|         elif on_conflict == OnConflict.NO_DELETE: | ||||
|             return False | ||||
|  | ||||
|         # This should never be reached | ||||
|         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||
|  | ||||
|     async def _conflict_lfrd( | ||||
|             self, | ||||
|             on_conflict: OnConflict, | ||||
|             path: PurePath, | ||||
|             parent: PurePath, | ||||
|     ) -> bool: | ||||
|         if on_conflict == OnConflict.PROMPT: | ||||
|             async with log.exclusive_output(): | ||||
|                 prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" | ||||
|                 return await prompt_yes_no(prompt, default=False) | ||||
|         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||
|             return False | ||||
|         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||
|             return True | ||||
|         elif on_conflict == OnConflict.NO_DELETE: | ||||
|             return False | ||||
|  | ||||
|         # This should never be reached | ||||
|         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||
|  | ||||
|     async def _conflict_delete_lf( | ||||
|             self, | ||||
|             on_conflict: OnConflict, | ||||
|             path: PurePath, | ||||
|     ) -> bool: | ||||
|         if on_conflict == OnConflict.PROMPT: | ||||
|             async with log.exclusive_output(): | ||||
|                 prompt = f"Delete {fmt_path(path)}?" | ||||
|                 return await prompt_yes_no(prompt, default=False) | ||||
|         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||
|             return False | ||||
|         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||
|             return True | ||||
|         elif on_conflict == OnConflict.NO_DELETE: | ||||
|             return False | ||||
|  | ||||
|         # This should never be reached | ||||
|         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||
|  | ||||
|     def _tmp_path(self, base: Path, suffix_length: int) -> Path: | ||||
|         prefix = "" if base.name.startswith(".") else "." | ||||
|         suffix = "".join(random.choices(SUFFIX_CHARS, k=suffix_length)) | ||||
|         name = f"{prefix}{base.name}.tmp.{suffix}" | ||||
|         return base.parent / name | ||||
|  | ||||
|     async def _create_tmp_file( | ||||
|             self, | ||||
|             local_path: Path, | ||||
|     ) -> Tuple[Path, BinaryIO]: | ||||
|         """ | ||||
|         May raise an OutputDirError. | ||||
|         """ | ||||
|  | ||||
|         # Create tmp file | ||||
|         for attempt in range(TRIES): | ||||
|             suffix_length = SUFFIX_LENGTH + 2 * attempt | ||||
|             tmp_path = self._tmp_path(local_path, suffix_length) | ||||
|             try: | ||||
|                 return tmp_path, open(tmp_path, "xb") | ||||
|             except FileExistsError: | ||||
|                 pass  # Try again | ||||
|  | ||||
|         raise OutputDirError("Failed to create temporary file") | ||||
|  | ||||
|     async def download( | ||||
|             self, | ||||
|             remote_path: PurePath, | ||||
|             path: PurePath, | ||||
|             mtime: Optional[datetime] = None, | ||||
|             redownload: Optional[Redownload] = None, | ||||
|             on_conflict: Optional[OnConflict] = None, | ||||
|     ) -> Optional[FileSinkToken]: | ||||
|         """ | ||||
|         May throw an OutputDirError, a MarkDuplicateError or a | ||||
|         MarkConflictError. | ||||
|         """ | ||||
|  | ||||
|         heuristics = Heuristics(mtime) | ||||
|         redownload = self._redownload if redownload is None else redownload | ||||
|         on_conflict = self._on_conflict if on_conflict is None else on_conflict | ||||
|         local_path = self.resolve(path) | ||||
|  | ||||
|         self._report.mark(path) | ||||
|  | ||||
|         if not self._should_download(local_path, heuristics, redownload, on_conflict): | ||||
|             return None | ||||
|  | ||||
|         # Detect and solve local-dir-remote-file conflict | ||||
|         if local_path.is_dir(): | ||||
|             log.explain("Conflict: There's a directory in place of the local file") | ||||
|             if await self._conflict_ldrf(on_conflict, path): | ||||
|                 log.explain("Result: Delete the obstructing directory") | ||||
|                 shutil.rmtree(local_path) | ||||
|             else: | ||||
|                 log.explain("Result: Keep the obstructing directory") | ||||
|                 return None | ||||
|  | ||||
|         # Detect and solve local-file-remote-dir conflict | ||||
|         for parent in path.parents: | ||||
|             local_parent = self.resolve(parent) | ||||
|             if local_parent.exists() and not local_parent.is_dir(): | ||||
|                 log.explain("Conflict: One of the local file's parents is a file") | ||||
|                 if await self._conflict_lfrd(on_conflict, path, parent): | ||||
|                     log.explain("Result: Delete the obstructing file") | ||||
|                     local_parent.unlink() | ||||
|                     break | ||||
|                 else: | ||||
|                     log.explain("Result: Keep the obstructing file") | ||||
|                     return None | ||||
|  | ||||
|         # Ensure parent directory exists | ||||
|         local_path.parent.mkdir(parents=True, exist_ok=True) | ||||
|  | ||||
|         return FileSinkToken(self, remote_path, path, local_path, heuristics, on_conflict) | ||||
|  | ||||
|     def _update_metadata(self, info: DownloadInfo) -> None: | ||||
|         if mtime := info.heuristics.mtime: | ||||
|             mtimestamp = mtime.timestamp() | ||||
|             os.utime(info.local_path, times=(mtimestamp, mtimestamp)) | ||||
|  | ||||
|     @contextmanager | ||||
|     def _ensure_deleted(self, path: Path) -> Iterator[None]: | ||||
|         try: | ||||
|             yield | ||||
|         finally: | ||||
|             path.unlink(missing_ok=True) | ||||
|  | ||||
|     async def _after_download(self, info: DownloadInfo) -> None: | ||||
|         with self._ensure_deleted(info.tmp_path): | ||||
|             log.status("[bold cyan]", "Downloaded", fmt_path(info.remote_path)) | ||||
|             log.explain_topic(f"Processing downloaded file for {fmt_path(info.path)}") | ||||
|  | ||||
|             changed = False | ||||
|  | ||||
|             if not info.success: | ||||
|                 log.explain("Download unsuccessful, aborting") | ||||
|                 return | ||||
|  | ||||
|             # Solve conflicts arising from existing local file | ||||
|             if info.local_path.exists(): | ||||
|                 changed = True | ||||
|  | ||||
|                 if filecmp.cmp(info.local_path, info.tmp_path): | ||||
|                     log.explain("Contents identical with existing file") | ||||
|                     log.explain("Updating metadata of existing file") | ||||
|                     self._update_metadata(info) | ||||
|                     return | ||||
|  | ||||
|                 log.explain("Conflict: The local and remote versions differ") | ||||
|                 if await self._conflict_lfrf(info.on_conflict, info.path): | ||||
|                     log.explain("Result: Replacing local with remote version") | ||||
|                 else: | ||||
|                     log.explain("Result: Keeping local version") | ||||
|                     return | ||||
|  | ||||
|             info.tmp_path.replace(info.local_path) | ||||
|             log.explain("Updating file metadata") | ||||
|             self._update_metadata(info) | ||||
|  | ||||
|             if changed: | ||||
|                 log.status("[bold bright_yellow]", "Changed", fmt_path(info.path)) | ||||
|                 self._report.change_file(info.path) | ||||
|             else: | ||||
|                 log.status("[bold bright_green]", "Added", fmt_path(info.path)) | ||||
|                 self._report.add_file(info.path) | ||||
|  | ||||
|     async def cleanup(self) -> None: | ||||
|         await self._cleanup_dir(self._root, PurePath(), delete_self=False) | ||||
|  | ||||
|     async def _cleanup(self, path: Path, pure: PurePath) -> None: | ||||
|         if path.is_dir(): | ||||
|             await self._cleanup_dir(path, pure) | ||||
|         elif path.is_file(): | ||||
|             await self._cleanup_file(path, pure) | ||||
|  | ||||
|     async def _cleanup_dir(self, path: Path, pure: PurePath, delete_self: bool = True) -> None: | ||||
|         for child in sorted(path.iterdir()): | ||||
|             pure_child = pure / child.name | ||||
|             await self._cleanup(child, pure_child) | ||||
|  | ||||
|         if delete_self: | ||||
|             try: | ||||
|                 path.rmdir() | ||||
|             except OSError: | ||||
|                 pass | ||||
|  | ||||
|     async def _cleanup_file(self, path: Path, pure: PurePath) -> None: | ||||
|         if self._report.is_marked(pure): | ||||
|             return | ||||
|  | ||||
|         if await self._conflict_delete_lf(self._on_conflict, pure): | ||||
|             try: | ||||
|                 path.unlink() | ||||
|                 log.status("[bold bright_magenta]", "Deleted", fmt_path(pure)) | ||||
|                 self._report.delete_file(pure) | ||||
|             except OSError: | ||||
|                 pass | ||||
|         else: | ||||
|             log.status("[bold bright_magenta]", "Not deleted", fmt_path(pure)) | ||||
|             self._report.not_delete_file(pure) | ||||
|  | ||||
|     def load_prev_report(self) -> None: | ||||
|         log.explain_topic(f"Loading previous report from {fmt_real_path(self._report_path)}") | ||||
|         try: | ||||
|             self._prev_report = Report.load(self._report_path) | ||||
|             log.explain("Loaded report successfully") | ||||
|         except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e: | ||||
|             log.explain("Failed to load report") | ||||
|             log.explain(str(e)) | ||||
|  | ||||
|     def store_report(self) -> None: | ||||
|         log.explain_topic(f"Storing report to {fmt_real_path(self._report_path)}") | ||||
|         try: | ||||
|             self._report.store(self._report_path) | ||||
|             log.explain("Stored report successfully") | ||||
|         except OSError as e: | ||||
|             log.warn(f"Failed to save report to {fmt_real_path(self._report_path)}") | ||||
|             log.warn_contd(str(e)) | ||||
							
								
								
									
										194
									
								
								PFERD/pferd.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										194
									
								
								PFERD/pferd.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,194 @@ | ||||
| from pathlib import Path | ||||
| from typing import Dict, List, Optional | ||||
|  | ||||
| from rich.markup import escape | ||||
|  | ||||
| from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection | ||||
| from .config import Config, ConfigOptionError | ||||
| from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler | ||||
| from .logging import log | ||||
| from .utils import fmt_path | ||||
|  | ||||
|  | ||||
class PferdLoadError(Exception):
    """
    Raised when the selection of crawlers to run is invalid.
    """
|  | ||||
|  | ||||
class Pferd:
    """
    Decides which crawlers to run, loads the authenticators and crawlers
    described by the config, runs the selected crawlers and prints their
    reports.
    """

    def __init__(self, config: Config, cli_crawlers: Optional[List[str]], cli_skips: Optional[List[str]]):
        """
        Select the crawlers to run. cli_crawlers and cli_skips are crawler
        names without the "crawl:" prefix; None means that nothing was
        specified on the command line.

        May throw PferdLoadError.
        """

        self._config = config
        self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers, cli_skips)

        # Both are filled by run(), keyed by section name
        self._authenticators: Dict[str, Authenticator] = {}
        self._crawlers: Dict[str, Crawler] = {}

    def _find_config_crawlers(self, config: Config) -> List[str]:
        """
        Return the names of all crawl sections that are not set to be skipped
        in the config.
        """

        crawl_sections = []

        for name, section in config.crawl_sections():
            if CrawlerSection(section).skip():
                log.explain(f"Skipping {name!r}")
            else:
                crawl_sections.append(name)

        return crawl_sections

    def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]:
        """
        Translate the crawler names given on the command line into section
        names (with "crawl:" prefix), preserving their order.

        Throws a PferdLoadError if a name was given multiple times or if there
        is no crawl section for a name.
        """

        if len(cli_crawlers) != len(set(cli_crawlers)):
            raise PferdLoadError("Some crawlers were selected multiple times")

        # Only membership is needed, so a set is sufficient
        crawl_sections = {name for name, _ in config.crawl_sections()}

        crawlers_to_run = []  # With crawl: prefix
        unknown_names = []  # Without crawl: prefix

        for name in cli_crawlers:
            section_name = f"crawl:{name}"
            if section_name in crawl_sections:
                log.explain(f"Crawler section named {section_name!r} exists")
                crawlers_to_run.append(section_name)
            else:
                log.explain(f"There's no crawler section named {section_name!r}")
                unknown_names.append(name)

        if len(unknown_names) == 1:
            [name] = unknown_names
            raise PferdLoadError(f"There is no crawler named {name!r}")
        if unknown_names:
            names_str = ", ".join(repr(name) for name in unknown_names)
            raise PferdLoadError(f"There are no crawlers named {names_str}")

        return crawlers_to_run

    def _find_crawlers_to_run(
            self,
            config: Config,
            cli_crawlers: Optional[List[str]],
            cli_skips: Optional[List[str]],
    ) -> List[str]:
        """
        Determine the section names of the crawlers to run. Crawlers given on
        the command line take precedence over the config; crawlers listed in
        cli_skips are removed from the result in either case.

        May throw PferdLoadError.
        """

        log.explain_topic("Deciding which crawlers to run")

        crawlers: List[str]
        if cli_crawlers is None:
            log.explain("No crawlers specified on CLI")
            log.explain("Running crawlers specified in config")
            crawlers = self._find_config_crawlers(config)
        else:
            log.explain("Crawlers specified on CLI")
            crawlers = self._find_cli_crawlers(config, cli_crawlers)

        skips = {f"crawl:{name}" for name in cli_skips} if cli_skips else set()

        remaining = []
        for crawler in crawlers:
            if crawler in skips:
                log.explain(f"Skipping crawler {crawler!r}")
            else:
                remaining.append(crawler)

        return remaining

    def _load_authenticators(self) -> None:
        """
        Instantiate an authenticator for every auth section of the config.

        Throws a ConfigOptionError if a section has an unknown type.
        """

        for name, section in self._config.auth_sections():
            log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")

            auth_type = AuthSection(section).type()
            authenticator_constructor = AUTHENTICATORS.get(auth_type)
            if authenticator_constructor is None:
                raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}")

            authenticator = authenticator_constructor(name, section, self._config)
            self._authenticators[name] = authenticator

    def _load_crawlers(self) -> None:
        """
        Instantiate a crawler for every crawl section of the config, including
        those that are not going to be run.

        Throws a ConfigOptionError if a section has an unknown type.
        """

        # Cookie sharing
        kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {}

        for name, section in self._config.crawl_sections():
            log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")

            crawl_type = CrawlerSection(section).type()
            crawler_constructor = CRAWLERS.get(crawl_type)
            if crawler_constructor is None:
                raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}")

            crawler = crawler_constructor(name, section, self._config, self._authenticators)
            self._crawlers[name] = crawler

            if self._config.default_section.share_cookies() and isinstance(crawler, KitIliasWebCrawler):
                crawler.share_cookies(kit_ilias_web_paths)

    def debug_transforms(self) -> None:
        """
        Let every selected crawler debug its transforms. The crawlers must
        already be loaded.
        """

        for name in self._crawlers_to_run:
            crawler = self._crawlers[name]
            log.print("")
            log.print(f"[bold bright_cyan]Debugging transforms[/] for {escape(name)}")
            crawler.debug_transforms()

    async def run(self, debug_transforms: bool) -> None:
        """
        Load all authenticators and crawlers, then run the selected crawlers
        one after another. If debug_transforms is set, the transforms are
        debugged instead and no crawler is run.

        May throw ConfigOptionError.
        """

        # These two functions must run inside the same event loop as the
        # crawlers, so that any new objects (like Conditions or Futures) can
        # obtain the correct event loop.
        self._load_authenticators()
        self._load_crawlers()

        if debug_transforms:
            log.output_explain = True
            log.output_report = False
            self.debug_transforms()
            return

        log.print("")

        for name in self._crawlers_to_run:
            crawler = self._crawlers[name]

            log.print(f"[bold bright_cyan]Running[/] {escape(name)}")

            # A failing crawler must not prevent the remaining crawlers from
            # running, so its errors are logged here instead of propagated.
            try:
                await crawler.run()
            except (CrawlError, AuthError) as e:
                log.error(str(e))
            except Exception:
                log.unexpected_exception()

    def print_report(self) -> None:
        """
        Print the report of every selected crawler that was loaded: added,
        changed, deleted and not deleted files followed by the encountered
        warnings and errors.
        """

        for name in self._crawlers_to_run:
            crawler = self._crawlers.get(name)
            if crawler is None:
                continue  # Crawler failed to load

            log.report("")
            log.report(f"[bold bright_cyan]Report[/] for {escape(name)}")

            report = crawler.report
            something_changed = False

            path_sections = [
                ("[bold bright_green]Added[/]", report.added_files),
                ("[bold bright_yellow]Changed[/]", report.changed_files),
                ("[bold bright_magenta]Deleted[/]", report.deleted_files),
                ("[bold bright_magenta]Not deleted[/]", report.not_deleted_files),
            ]
            for label, paths in path_sections:
                for path in sorted(paths):
                    something_changed = True
                    log.report(f"  {label} {fmt_path(path)}")

            # Warnings and errors are arbitrary text. They must be escaped so
            # that square brackets in them are printed literally instead of
            # being interpreted as markup.
            for warning in report.encountered_warnings:
                something_changed = True
                log.report(f"  [bold bright_red]Warning[/] {escape(warning)}")

            for error in report.encountered_errors:
                something_changed = True
                log.report(f"  [bold bright_red]Error[/] {escape(error)}")

            if not something_changed:
                log.report("  Nothing changed")
							
								
								
									
										238
									
								
								PFERD/report.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										238
									
								
								PFERD/report.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,238 @@ | ||||
| import json | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Any, Dict, List, Optional, Set | ||||
|  | ||||
|  | ||||
class ReportLoadError(Exception):
    """
    The contents of a report file don't have the expected format.
    """
|  | ||||
|  | ||||
class MarkDuplicateError(Exception):
    """
    Tried to mark a file that was already marked.

    The offending path is available as the path attribute.
    """

    def __init__(self, path: PurePath):
        message = f"A previous file already used path {path}"
        super().__init__(message)
        self.path = path
|  | ||||
|  | ||||
class MarkConflictError(Exception):
    """
    Marking the path would have caused a conflict.

    There are two ways for this to happen: The new file has the same path as
    a parent directory of a known file, or a parent directory of the new file
    has the same path as a known file. Either way a file and a directory
    would have to share the same path, which is usually not possible.

    The new path and the path it collides with are available as the path and
    collides_with attributes.
    """

    def __init__(self, path: PurePath, collides_with: PurePath):
        message = f"File at {path} collides with previous file at {collides_with}"
        super().__init__(message)
        self.path = path
        self.collides_with = collides_with
|  | ||||
|  | ||||
| # TODO Use PurePath.is_relative_to when updating to 3.9 | ||||
def is_relative_to(a: PurePath, b: PurePath) -> bool:
    """
    Return True if a is b or lies somewhere below b, and False otherwise.
    """

    # a is relative to b exactly if b is a itself or one of a's ancestors
    return b == a or b in a.parents
|  | ||||
|  | ||||
class Report:
    """
    A report of a synchronization. Includes all files found by the crawler, as
    well as the set of changes made to local files.
    """

    def __init__(self) -> None:
        # Paths found by the crawler, untransformed
        self.found_paths: Set[PurePath] = set()

        # Files reserved for metadata files (e. g. the report file or cookies)
        # that can't be overwritten by user transforms and won't be cleaned up
        # at the end.
        self.reserved_files: Set[PurePath] = set()

        # Files found by the crawler, transformed. Only includes files that
        # were downloaded (or a download was attempted)
        self.known_files: Set[PurePath] = set()

        self.added_files: Set[PurePath] = set()
        self.changed_files: Set[PurePath] = set()
        self.deleted_files: Set[PurePath] = set()
        # Files that should have been deleted by the cleanup but weren't
        self.not_deleted_files: Set[PurePath] = set()

        # Custom crawler-specific data
        self.custom: Dict[str, Any] = dict()

        # Encountered errors and warnings
        self.encountered_warnings: List[str] = []
        self.encountered_errors: List[str] = []

    @staticmethod
    def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]:
        """
        Return the list of strings stored under key, or an empty list if the
        key is missing.

        Raises a ReportLoadError if the value is not a list of strings.
        """

        result: Any = data.get(key, [])

        if not isinstance(result, list):
            raise ReportLoadError(f"Incorrect format: {key!r} is not a list")

        for elem in result:
            if not isinstance(elem, str):
                raise ReportLoadError(f"Incorrect format: {key!r} must contain only strings")

        return result

    @staticmethod
    def _get_str_dictionary(data: Dict[str, Any], key: str) -> Dict[str, Any]:
        """
        Return the dictionary stored under key, or an empty dictionary if the
        key is missing.

        Raises a ReportLoadError if the value is not a dictionary.
        """

        result: Dict[str, Any] = data.get(key, {})

        if not isinstance(result, dict):
            raise ReportLoadError(f"Incorrect format: {key!r} is not a dictionary")

        return result

    @classmethod
    def load(cls, path: Path) -> "Report":
        """
        Load a report from the JSON file at path.

        May raise OSError, UnicodeDecodeError, JsonDecodeError, ReportLoadError.
        """

        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, dict):
            raise ReportLoadError("Incorrect format: Root is not an object")

        report = cls()
        for elem in cls._get_list_of_strs(data, "found"):
            report.found(PurePath(elem))

        # A report file may list duplicate or colliding paths (e. g. after
        # being edited by hand). The resulting marking errors are reported as
        # format errors so that callers only need to handle the documented
        # exceptions.
        for elem in cls._get_list_of_strs(data, "reserved"):
            try:
                report.mark_reserved(PurePath(elem))
            except RuntimeError as e:
                raise ReportLoadError(f"Incorrect format: {e}") from e
        for elem in cls._get_list_of_strs(data, "known"):
            try:
                report.mark(PurePath(elem))
            except (MarkDuplicateError, MarkConflictError) as e:
                raise ReportLoadError(f"Incorrect format: {e}") from e

        for elem in cls._get_list_of_strs(data, "added"):
            report.add_file(PurePath(elem))
        for elem in cls._get_list_of_strs(data, "changed"):
            report.change_file(PurePath(elem))
        for elem in cls._get_list_of_strs(data, "deleted"):
            report.delete_file(PurePath(elem))
        for elem in cls._get_list_of_strs(data, "not_deleted"):
            report.not_delete_file(PurePath(elem))
        report.custom = cls._get_str_dictionary(data, "custom")
        report.encountered_errors = cls._get_list_of_strs(data, "encountered_errors")
        report.encountered_warnings = cls._get_list_of_strs(data, "encountered_warnings")

        return report

    def store(self, path: Path) -> None:
        """
        Store the report as JSON in the file at path.

        May raise OSError.
        """

        def as_strs(paths: Set[PurePath]) -> List[str]:
            # Sorted so that the stored file is stable between runs
            return [str(p) for p in sorted(paths)]

        data = {
            "found": as_strs(self.found_paths),
            "reserved": as_strs(self.reserved_files),
            "known": as_strs(self.known_files),
            "added": as_strs(self.added_files),
            "changed": as_strs(self.changed_files),
            "deleted": as_strs(self.deleted_files),
            "not_deleted": as_strs(self.not_deleted_files),
            "custom": self.custom,
            "encountered_warnings": self.encountered_warnings,
            "encountered_errors": self.encountered_errors,
        }

        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, sort_keys=True)
            f.write("\n")  # json.dump doesn't do this

    def found(self, path: PurePath) -> None:
        """
        Record an untransformed path found by the crawler.
        """

        self.found_paths.add(path)

    def mark_reserved(self, path: PurePath) -> None:
        """
        Reserve a path for a metadata file.

        Raises a RuntimeError if the path is already known or reserved.
        """

        if path in self.marked:
            raise RuntimeError("Trying to reserve an already reserved file")

        self.reserved_files.add(path)

    def mark(self, path: PurePath) -> None:
        """
        Mark a previously unknown file as known.

        May throw a MarkDuplicateError or a MarkConflictError. For more detail,
        see the respective exception's docstring.
        """

        for other in self.marked:
            if path == other:
                raise MarkDuplicateError(path)

            if is_relative_to(path, other) or is_relative_to(other, path):
                raise MarkConflictError(path, other)

        self.known_files.add(path)

    @property
    def marked(self) -> Set[PurePath]:
        """
        All known and reserved files, as a newly created set.
        """

        return self.known_files | self.reserved_files

    def is_marked(self, path: PurePath) -> bool:
        """
        Return whether the path is known or reserved.
        """

        return path in self.marked

    def add_file(self, path: PurePath) -> None:
        """
        Unlike mark(), this function accepts any paths.
        """

        self.added_files.add(path)

    def change_file(self, path: PurePath) -> None:
        """
        Unlike mark(), this function accepts any paths.
        """

        self.changed_files.add(path)

    def delete_file(self, path: PurePath) -> None:
        """
        Unlike mark(), this function accepts any paths.
        """

        self.deleted_files.add(path)

    def not_delete_file(self, path: PurePath) -> None:
        """
        Unlike mark(), this function accepts any paths.
        """

        self.not_deleted_files.add(path)

    def add_custom_value(self, key: str, value: Any) -> None:
        """
        Adds a custom value under the passed key, overwriting any existing
        value.
        """
        self.custom[key] = value

    def get_custom_value(self, key: str) -> Optional[Any]:
        """
        Retrieves a custom value for the given key, or None if there is none.
        """
        return self.custom.get(key)

    def add_error(self, error: str) -> None:
        """
        Adds an error to this report's error list.
        """
        self.encountered_errors.append(error)

    def add_warning(self, warning: str) -> None:
        """
        Adds a warning to this report's warning list.
        """
        self.encountered_warnings.append(warning)
							
								
								
									
										111
									
								
								PFERD/ti.py
									
									
									
									
									
								
							
							
						
						
									
										111
									
								
								PFERD/ti.py
									
									
									
									
									
								
							| @@ -1,111 +0,0 @@ | ||||
| # Fakultät für Mathematik (FfM) | ||||
|  | ||||
| import getpass | ||||
| import logging | ||||
| import pathlib | ||||
| import re | ||||
|  | ||||
| import bs4 | ||||
| import requests | ||||
|  | ||||
| from .organizer import Organizer | ||||
| from .utils import stream_to_path, PrettyLogger | ||||
|  | ||||
| __all__ = ["Ti"] | ||||
| logger = logging.getLogger(__name__) | ||||
| pretty = PrettyLogger(logger) | ||||
|  | ||||
class Ti:
    """
    Synchronizer that downloads the PDF files linked from the lecture,
    exercise and tutorial pages of a course below BASE_URL.
    """

    BASE_URL = "http://ti.ira.uka.de/"
    # Only links whose href ends in ".pdf" are downloaded
    FILE_RE = re.compile(r"^.+\.pdf$")

    def __init__(self, base_path):
        self.base_path = base_path

        self._session = requests.Session()
        self._credentials = None

    def synchronize(self, urlpart, to_dir, transform=lambda x: x,
            filter=lambda x: True):
        """
        Synchronizes the course found at BASE_URL + urlpart into to_dir
        (relative to the base path).

        filter is called with the PurePath of every available section
        ("Folien", "Blätter", "Tutorien"); sections for which it returns a
        falsy value are skipped. transform is called with the path of every
        file found and returns the path to store the file under, or None to
        skip the file.
        """
        pretty.starting_synchronizer(to_dir, "Ti", urlpart)

        sync_path = pathlib.Path(self.base_path, to_dir)

        orga = Organizer(self.base_path, sync_path)
        orga.clean_temp_dir()

        # Never reuse credentials entered during a previous run
        self._reset_credentials()

        available = self._find_available(urlpart)

        for name, address in sorted(available.items()):
            path = pathlib.PurePath(name)
            if filter(path):
                self._crawl(urlpart + address, path, orga, transform)
            else:
                # Was "loggwe.info", which raised a NameError as soon as a
                # section was filtered out
                logger.info(f"Skipping {name}/")

        orga.clean_sync_dir()
        orga.clean_temp_dir()

        self._reset_credentials()

    def _find_available(self, urlpart):
        """
        Returns a dict mapping the local folder name of every section linked
        on the course page to the url part of that section.
        """
        url = self.BASE_URL + urlpart
        r = self._session.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        available = {}

        if soup.find(href="./Vorlesung/Vorlesung.php"):
            logger.info("Found Folien/")
            available["Folien"] = "/Vorlesung/"
        if soup.find(href="./Uebungen/Uebungen.php"):
            logger.info("Found Blätter/")
            available["Blätter"] = "/Uebungen/"
        if soup.find(href="./Tutorien/Tutorien.php"):
            logger.info("Found Tutorien/")
            available["Tutorien"] = "/Tutorien/"

        return available

    def _crawl(self, urlpart, path, orga, transform):
        """
        Downloads every PDF linked on the page at BASE_URL + urlpart and hands
        it to the organizer under the transformed path.
        """
        url = self.BASE_URL + urlpart
        r = self._session.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        for filelink in soup.find_all("a", href=self.FILE_RE):
            filepath = path / filelink["href"]
            fileurl = url + "/" + filelink["href"]

            new_path = transform(filepath)
            if new_path is None:
                # The transform decided to skip this file
                continue
            logger.debug(f"Transformed from {filepath} to {new_path}")

            temp_path = orga.temp_file()
            self._download(fileurl, temp_path)
            orga.add_file(temp_path, new_path)

    def _get_credentials(self):
        """
        Returns the (username, password) tuple, prompting the user for it if
        no credentials are currently stored.
        """
        if self._credentials is None:
            print("Please enter Ti credentials.")
            username = getpass.getpass(prompt="Username: ")
            password = getpass.getpass(prompt="Password: ")
            self._credentials = (username, password)
        return self._credentials

    def _reset_credentials(self):
        """
        Forgets the stored credentials.
        """
        self._credentials = None

    def _download(self, url, to_path):
        """
        Streams url to to_path using HTTP authentication, asking for new
        credentials until a request succeeds.
        """
        # NOTE(review): every non-ok response is treated as a credential
        # failure, so e.g. a missing file also re-prompts and retries forever.
        while True:
            username, password = self._get_credentials()
            with self._session.get(url, stream=True, auth=(username, password)) as r:
                if r.ok:
                    stream_to_path(r, to_path)
                    return
                else:
                    print("Incorrect credentials.")
                    self._reset_credentials()
							
								
								
									
										439
									
								
								PFERD/transformer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										439
									
								
								PFERD/transformer.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,439 @@ | ||||
| import ast | ||||
| import re | ||||
| from abc import ABC, abstractmethod | ||||
| from dataclasses import dataclass | ||||
| from enum import Enum | ||||
| from pathlib import PurePath | ||||
| from typing import Callable, Dict, List, Optional, Sequence, TypeVar, Union | ||||
|  | ||||
| from .logging import log | ||||
| from .utils import fmt_path, str_path | ||||
|  | ||||
|  | ||||
class ArrowHead(Enum):
    """
    The head of a rule's arrow: NORMAL for ">" and SEQUENCE for ">>".
    """

    NORMAL = 0
    SEQUENCE = 1
|  | ||||
|  | ||||
class Ignore:
    """
    Right side of a rule that was written as "!".
    """
|  | ||||
|  | ||||
class Empty:
    """
    Right side of a rule that has nothing after its arrow.
    """
|  | ||||
|  | ||||
| RightSide = Union[str, Ignore, Empty] | ||||
|  | ||||
|  | ||||
@dataclass
class Transformed:
    """
    Transformation result carrying the resulting path.
    """

    path: PurePath
|  | ||||
|  | ||||
class Ignored:
    """
    Transformation result stating that the path is to be ignored.
    """
|  | ||||
|  | ||||
| TransformResult = Optional[Union[Transformed, Ignored]] | ||||
|  | ||||
|  | ||||
@dataclass
class Rule:
    """
    A single parsed rule of the form "LEFT -NAME-HEAD RIGHT".
    """

    # Text on the left of the arrow
    left: str
    # Index in the line at which the left side starts
    left_index: int
    # Name written inside the arrow, possibly empty
    name: str
    head: ArrowHead
    right: RightSide
    # Index in the line directly after the arrow head
    right_index: int

    def right_result(self, path: PurePath) -> Union[str, Transformed, Ignored]:
        """
        Resolves the right side for a path that matched the left side.

        A string right side is returned unchanged, an Ignore right side
        becomes Ignored and an Empty right side keeps the path as it is.
        """
        target = self.right
        if isinstance(target, str):
            return target
        if isinstance(target, Ignore):
            return Ignored()
        if isinstance(target, Empty):
            return Transformed(path)
        raise RuntimeError(f"Right side has invalid type {type(self.right)}")
|  | ||||
|  | ||||
class Transformation(ABC):
    """
    Base class of everything that can transform a path according to a rule.
    """

    def __init__(self, rule: Rule):
        self.rule = rule

    @abstractmethod
    def transform(self, path: PurePath) -> TransformResult:
        """
        Transforms the path. Implementations in this file return None when
        the rule does not match the path.
        """
|  | ||||
|  | ||||
class ExactTf(Transformation):
    """
    Matches a path only if it is equal to the rule's left side.
    """

    def transform(self, path: PurePath) -> TransformResult:
        if path != PurePath(self.rule.left):
            return None

        outcome = self.rule.right_result(path)
        if isinstance(outcome, str):
            # A string right side is the new path
            return Transformed(PurePath(outcome))
        return outcome
|  | ||||
|  | ||||
class ExactReTf(Transformation):
    """
    Matches a path only if the rule's left side, interpreted as a regex,
    matches the entire path string.
    """

    def transform(self, path: PurePath) -> TransformResult:
        match = re.fullmatch(self.rule.left, str_path(path))
        if match is None:
            return None

        template = self.rule.right_result(path)
        if not isinstance(template, str):
            return template

        # Group 0 is the whole match. The elements are annotated as optional
        # because groups that did not take part in the match are None, even
        # though mypy infers List[str] here.
        groups: Sequence[Optional[str]] = [match[0], *match.groups()]

        # Every group n is available as g<n> (str) and, if it can be
        # converted, also as i<n> (int) and f<n> (float).
        variables: Dict[str, Union[str, int, float]] = {}
        for nr, text in enumerate(groups):
            if text is None:
                continue

            variables[f"g{nr}"] = text
            for prefix, convert in (("i", int), ("f", float)):
                try:
                    variables[f"{prefix}{nr}"] = convert(text)
                except ValueError:
                    pass

        # The right side is evaluated like the contents of an f-string with
        # the group variables in scope.
        expanded = eval(f"f{template!r}", {}, variables)
        return Transformed(PurePath(expanded))
|  | ||||
|  | ||||
class RenamingParentsTf(Transformation):
    """
    Applies the wrapped transformation to the path and each of its parents,
    longest first, and re-attaches the remaining child segments to the result.
    """

    def __init__(self, sub_tf: Transformation):
        super().__init__(sub_tf.rule)
        self.sub_tf = sub_tf

    def transform(self, path: PurePath) -> TransformResult:
        parts = path.parts
        # Start with the full path, then strip one trailing segment at a time
        # until only the empty path is left.
        for split in reversed(range(len(parts) + 1)):
            outcome = self.sub_tf.transform(PurePath(*parts[:split]))
            if not outcome:
                continue
            if isinstance(outcome, Transformed):
                return Transformed(outcome.path / PurePath(*parts[split:]))
            if isinstance(outcome, Ignored):
                return outcome
            raise RuntimeError(f"Invalid transform result of type {type(outcome)}: {outcome}")

        return None
|  | ||||
|  | ||||
class RenamingPartsTf(Transformation):
    """
    Applies the wrapped transformation to every segment of the path on its
    own and joins the results.
    """

    def __init__(self, sub_tf: Transformation):
        super().__init__(sub_tf.rule)
        self.sub_tf = sub_tf

    def transform(self, path: PurePath) -> TransformResult:
        renamed = PurePath()
        matched = False

        for part in path.parts:
            outcome = self.sub_tf.transform(PurePath(part))
            if not outcome:
                # Segments the rule does not match are kept as they are
                renamed /= part
                continue
            if isinstance(outcome, Ignored):
                # A single ignored segment ignores the entire path
                return outcome
            if not isinstance(outcome, Transformed):
                raise RuntimeError(f"Invalid transform result of type {type(outcome)}: {outcome}")
            renamed /= outcome.path
            matched = True

        # The rule only counts as matched if at least one segment matched
        return Transformed(renamed) if matched else None
|  | ||||
|  | ||||
class RuleParseError(Exception):
    """
    Raised when a rule cannot be parsed. Keeps the line, whose current index
    is the position of the error, and the reason.
    """

    def __init__(self, line: "Line", reason: str):
        message = f"Error in rule on line {line.line_nr}, column {line.index}: {reason}"
        super().__init__(message)

        self.line = line
        self.reason = reason

    def pretty_print(self) -> None:
        """
        Logs the offending line with a marker below the line's index.
        """
        log.error(f"Error parsing rule on line {self.line.line_nr}:")
        log.error_contd(self.line.line)
        # Indent the marker so that it points at the line's index
        padding = self.line.index * " "
        log.error_contd(f"{padding}^--- {self.reason}")
|  | ||||
|  | ||||
| T = TypeVar("T") | ||||
|  | ||||
|  | ||||
class Line:
    """
    A line of text with its line number and a cursor (index) that the parser
    functions advance while consuming the line.
    """

    def __init__(self, line: str, line_nr: int):
        self._line = line
        self._line_nr = line_nr
        self._index = 0

    @property
    def line(self) -> str:
        return self._line

    @property
    def line_nr(self) -> int:
        return self._line_nr

    @property
    def index(self) -> int:
        return self._index

    @index.setter
    def index(self, index: int) -> None:
        self._index = index

    @property
    def rest(self) -> str:
        """
        Everything from the cursor to the end of the line.
        """
        return self._line[self._index:]

    def peek(self, amount: int = 1) -> str:
        """
        Returns up to amount characters at the cursor without consuming them.
        """
        return self.rest[:amount]

    def take(self, amount: int = 1) -> str:
        """
        Consumes and returns up to amount characters at the cursor.
        """
        taken = self.rest[:amount]
        self._index += len(taken)
        return taken

    def expect(self, string: str) -> str:
        """
        Consumes string if the rest of the line starts with it, otherwise
        raises a RuleParseError.
        """
        if not self.rest.startswith(string):
            raise RuleParseError(self, f"Expected {string!r}")
        self._index += len(string)
        return string

    def expect_with(self, string: str, value: T) -> T:
        """
        Like expect(), but returns value instead of the consumed string.
        """
        self.expect(string)
        return value

    def one_of(self, parsers: List[Callable[[], T]], description: str) -> T:
        """
        Returns the result of the first parser that succeeds. The cursor is
        restored after every parser that fails with a RuleParseError. If all
        parsers fail, a RuleParseError with the description is raised.
        """
        for parser in parsers:
            checkpoint = self._index
            try:
                return parser()
            except RuleParseError:
                self._index = checkpoint

        raise RuleParseError(self, description)
|  | ||||
|  | ||||
| # RULE = LEFT SPACE '-' NAME '-' HEAD (SPACE RIGHT)? | ||||
| # SPACE = ' '+ | ||||
| # NAME = '' | 'exact' | 'name' | 're' | 'exact-re' | 'name-re' | ||||
| # HEAD = '>' | '>>' | ||||
| # LEFT = STR | QUOTED_STR | ||||
| # RIGHT = STR | QUOTED_STR | '!' | ||||
|  | ||||
|  | ||||
def parse_zero_or_more_spaces(line: Line) -> None:
    """
    Consumes all space characters at the cursor, if there are any.
    """
    while True:
        if line.peek() != " ":
            return
        line.take()
|  | ||||
|  | ||||
def parse_one_or_more_spaces(line: Line) -> None:
    """
    Consumes all space characters at the cursor. At least one is required,
    otherwise line.expect() raises a RuleParseError.
    """
    # The first space is mandatory ...
    line.expect(" ")
    # ... any further ones are optional
    parse_zero_or_more_spaces(line)
|  | ||||
|  | ||||
def parse_str(line: Line) -> str:
    """
    Consumes and returns all characters up to the next space or the end of
    the line. Raises a RuleParseError if there are no such characters.
    """
    chars = []
    while (c := line.peek()) and c != " ":
        line.take()
        chars.append(c)

    if not chars:
        raise RuleParseError(line, "Expected non-space character")
    return "".join(chars)
|  | ||||
|  | ||||
| QUOTATION_MARKS = {'"', "'"} | ||||
|  | ||||
|  | ||||
def parse_quoted_str(line: Line) -> str:
    """
    Consumes a string literal enclosed in one of the QUOTATION_MARKS and
    returns the value ast.literal_eval() produces for it.

    Raises a RuleParseError if the cursor is not at a quotation mark, if the
    literal is not terminated or if the literal is not valid syntax.
    """
    # Index of the opening quotation mark
    start_index = line.index

    quotation_mark = line.peek()
    if quotation_mark not in QUOTATION_MARKS:
        raise RuleParseError(line, "Expected quotation mark")
    line.take()

    escaped = False
    while c := line.peek():
        # Every character is consumed, no matter what it is
        line.take()

        if escaped:
            # The character after a backslash never ends the literal
            escaped = False
        elif c == "\\":
            escaped = True
        elif c == quotation_mark:
            literal = line.line[start_index:line.index]
            try:
                return ast.literal_eval(literal)
            except SyntaxError as e:
                # Report the error at the start of the literal
                line.index = start_index
                raise RuleParseError(line, str(e)) from e

    raise RuleParseError(line, "Expected end of string literal")
|  | ||||
|  | ||||
def parse_left(line: Line) -> str:
    """
    Parses the left side of a rule: a quoted string if the cursor is at a
    quotation mark, a plain string otherwise.
    """
    is_quoted = line.peek() in QUOTATION_MARKS
    return parse_quoted_str(line) if is_quoted else parse_str(line)
|  | ||||
|  | ||||
def parse_right(line: Line) -> Union[str, Ignore]:
    """
    Parses the right side of a rule: a quoted string, a plain string or "!".
    An unquoted "!" is returned as an Ignore instance.
    """
    if line.peek() in QUOTATION_MARKS:
        return parse_quoted_str(line)

    plain = parse_str(line)
    return Ignore() if plain == "!" else plain
|  | ||||
|  | ||||
def parse_arrow_name(line: Line) -> str:
    """
    Consumes and returns the name inside the arrow, which may be empty.
    """
    # one_of() uses the first alternative that succeeds, so every name has to
    # come before the names that are a prefix of it.
    names = ("exact-re", "exact", "name-re", "name", "re", "")
    alternatives = [lambda name=name: line.expect(name) for name in names]
    return line.one_of(alternatives, "Expected arrow name")
|  | ||||
|  | ||||
def parse_arrow_head(line: Line) -> ArrowHead:
    """
    Consumes the arrow head and returns the matching ArrowHead.
    """
    # ">>" has to be tried first because ">" is a prefix of it
    def sequence() -> ArrowHead:
        return line.expect_with(">>", ArrowHead.SEQUENCE)

    def normal() -> ArrowHead:
        return line.expect_with(">", ArrowHead.NORMAL)

    return line.one_of([sequence, normal], "Expected arrow head")
|  | ||||
|  | ||||
def parse_eol(line: Line) -> None:
    """
    Raises a RuleParseError unless the cursor is at the end of the line.
    """
    if not line.peek():
        return
    raise RuleParseError(line, "Expected end of line")
|  | ||||
|  | ||||
def parse_rule(line: Line) -> Rule:
    """
    Parses an entire line into a Rule.

    Raises a RuleParseError if the line is not a valid rule.
    """
    parse_zero_or_more_spaces(line)
    left_index = line.index
    left = parse_left(line)

    parse_one_or_more_spaces(line)

    # The arrow: '-' NAME '-' HEAD
    line.expect("-")
    name = parse_arrow_name(line)
    line.expect("-")
    head = parse_arrow_head(line)

    right_index = line.index
    right: RightSide
    try:
        # Nothing but spaces after the arrow means there is no right side
        parse_zero_or_more_spaces(line)
        parse_eol(line)
    except RuleParseError:
        # There is something after the arrow, so rewind and parse it as a
        # right side that is separated from the arrow by at least one space
        line.index = right_index
        parse_one_or_more_spaces(line)
        right = parse_right(line)
        parse_eol(line)
    else:
        right = Empty()

    return Rule(left, left_index, name, head, right, right_index)
|  | ||||
|  | ||||
def parse_transformation(line: Line) -> Transformation:
    """
    Parses a line into a rule and builds the transformation that belongs to
    the rule's arrow name.

    Raises a RuleParseError if the line is not a valid rule or if a "name"
    rule has a left side consisting of more than one path segment.
    """
    rule = parse_rule(line)

    if rule.name == "name" and len(PurePath(rule.left).parts) > 1:
        # Point the error at the start of the left side
        line.index = rule.left_index
        raise RuleParseError(line, "Expected name, not multiple segments")

    # Builders are only called for the arrow name that was actually used
    builders: Dict[str, Callable[[], Transformation]] = {
        "": lambda: RenamingParentsTf(ExactTf(rule)),
        "exact": lambda: ExactTf(rule),
        "name": lambda: RenamingPartsTf(ExactTf(rule)),
        "re": lambda: RenamingParentsTf(ExactReTf(rule)),
        "exact-re": lambda: ExactReTf(rule),
        "name-re": lambda: RenamingPartsTf(ExactReTf(rule)),
    }

    build = builders.get(rule.name)
    if build is None:
        raise RuntimeError(f"Invalid arrow name {rule.name!r}")
    return build()
|  | ||||
|  | ||||
class Transformer:
    """
    Applies a list of rules, one rule per line, to paths.
    """

    def __init__(self, rules: str):
        """
        Parses every non-blank line of rules into a transformation.

        May raise a RuleParseError.
        """

        self._tfs = []
        for line_nr, raw_line in enumerate(rules.split("\n")):
            stripped = raw_line.strip()
            if not stripped:
                continue
            tf = parse_transformation(Line(stripped, line_nr))
            self._tfs.append((stripped, tf))

    def transform(self, path: PurePath) -> Optional[PurePath]:
        """
        Runs the path through the rules in order and returns the resulting
        path, or None if a rule ignores the path.

        A matching rule with a normal arrow head ends the search, while a
        matching rule with a sequence arrow head updates the path and lets
        the remaining rules run on the updated path. Rules that raise an
        exception while being tested are logged and skipped.
        """
        for rule_nr, (line, tf) in enumerate(self._tfs, start=1):
            log.explain(f"Testing rule {rule_nr}: {line}")

            try:
                result = tf.transform(path)
            except Exception as e:
                log.warn(f"Error while testing rule {rule_nr}: {line}")
                log.warn_contd(str(e))
                continue

            if not result:
                # The rule did not match
                continue

            if isinstance(result, Ignored):
                log.explain("Match found, path ignored")
                return None

            head = tf.rule.head
            if head == ArrowHead.NORMAL:
                log.explain(f"Match found, transformed path to {fmt_path(result.path)}")
                path = result.path
                break
            elif head == ArrowHead.SEQUENCE:
                log.explain(f"Match found, updated path to {fmt_path(result.path)}")
                path = result.path
            else:
                raise RuntimeError(f"Invalid transform result of type {type(result)}: {result}")

        log.explain(f"Final result: {fmt_path(path)}")
        return path
							
								
								
									
										53
									
								
								PFERD/update.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								PFERD/update.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,53 @@ | ||||
| from dataclasses import dataclass | ||||
| import ssl | ||||
| from typing import Optional | ||||
| import aiohttp | ||||
| import certifi | ||||
|  | ||||
| from .version import NAME, VERSION | ||||
| from .logging import log | ||||
|  | ||||
|  | ||||
@dataclass
class PferdUpdate:
    """
    A release that differs from the running version.
    """

    # Value of the release's "html_url" field
    release_url: str
    # Tag name of the release without a leading "v"
    version: str
|  | ||||
|  | ||||
def _build_session() -> aiohttp.ClientSession:
    """
    Creates the client session used for the update check. It sends a
    "NAME/VERSION" user agent and verifies TLS against the certifi CA bundle.
    """
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    timeout = aiohttp.ClientTimeout(
        total=15 * 60,
        connect=10,
        sock_connect=10,
        sock_read=10,
    )

    return aiohttp.ClientSession(
        headers={"User-Agent": f"{NAME}/{VERSION}"},
        connector=aiohttp.TCPConnector(ssl=ssl_context),
        timeout=timeout,
    )
|  | ||||
|  | ||||
async def check_for_updates() -> None:
    """
    Logs a warning containing the release url if get_newer_version() finds
    an update, and an explanation otherwise.
    """
    update = await get_newer_version()
    if not update:
        log.explain("No update found")
        return

    log.warn(
        f"{NAME} version out of date. "
        f"You are running version {VERSION!r} but {update.version!r} was found on GitHub."
    )
    log.warn_contd(f"You can download it on GitHub: {update.release_url}")
|  | ||||
|  | ||||
async def get_newer_version() -> Optional[PferdUpdate]:
    """
    Asks the GitHub API for the latest release.

    Returns None if the release's tag name (without a leading "v") is equal
    to VERSION, otherwise a PferdUpdate describing the release.
    """
    latest_release_url = "https://api.github.com/repos/Garmelon/Pferd/releases/latest"
    request_headers = {"Accept": "application/vnd.github+json"}

    async with _build_session() as session, \
            session.get(latest_release_url, headers=request_headers) as response:
        release_information = await response.json()
        raw_tag: str = release_information["tag_name"]
        latest_version = raw_tag.removeprefix("v")

        # Any version that differs from the running one counts as an update
        if latest_version != VERSION:
            return PferdUpdate(release_url=release_information["html_url"], version=latest_version)

        return None
							
								
								
									
										176
									
								
								PFERD/utils.py
									
									
									
									
									
								
							
							
						
						
									
										176
									
								
								PFERD/utils.py
									
									
									
									
									
								
							| @@ -1,64 +1,144 @@ | ||||
| import os | ||||
| import asyncio | ||||
| import getpass | ||||
| import sys | ||||
| import pathlib | ||||
| from colorama import Style | ||||
| from colorama import Fore | ||||
| import threading | ||||
| from abc import ABC, abstractmethod | ||||
| from contextlib import AsyncExitStack | ||||
| from pathlib import Path, PurePath | ||||
| from types import TracebackType | ||||
| from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar | ||||
| from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit | ||||
|  | ||||
| __all__ = [ | ||||
|     "get_base_dir", | ||||
|     "move", | ||||
|     "rename", | ||||
|     "stream_to_path", | ||||
|     "ContentTypeException", | ||||
|     "FileNotFoundException", | ||||
|     "PrettyLogger", | ||||
| ] | ||||
| import bs4 | ||||
|  | ||||
def get_base_dir(script_file):
    """
    Returns the absolute path of the directory containing script_file.
    """
    script_dir = os.path.dirname(os.path.abspath(script_file))
    return pathlib.Path(script_dir)
| T = TypeVar("T") | ||||
|  | ||||
def move(path, from_folders, to_folders):
    """
    If the leading parts of path are equal to from_folders, returns the path
    with those parts replaced by to_folders. Otherwise returns None.
    """
    prefix_length = len(from_folders)
    if path.parts[:prefix_length] != from_folders:
        return None
    remaining = path.parts[prefix_length:]
    return pathlib.PurePath(*to_folders, *remaining)
|  | ||||
def rename(path, to_name):
    """
    Returns the path with its last part replaced by to_name.
    """
    parent_parts = path.parts[:-1]
    return pathlib.PurePath(*parent_parts, to_name)
| async def in_daemon_thread(func: Callable[..., T], *args: Any, **kwargs: Any) -> T: | ||||
|     loop = asyncio.get_running_loop() | ||||
|     future: asyncio.Future[T] = asyncio.Future() | ||||
|  | ||||
| def stream_to_path(response, to_path, chunk_size=1024**2): | ||||
|     with open(to_path, 'wb') as fd: | ||||
|         for chunk in response.iter_content(chunk_size=chunk_size): | ||||
|             fd.write(chunk) | ||||
|     def thread_func() -> None: | ||||
|         result = func() | ||||
|         loop.call_soon_threadsafe(future.set_result, result) | ||||
|  | ||||
| def isOutputPipe(): | ||||
|     """Returns whether this program's output is attached to a pipe. | ||||
|     threading.Thread(target=thread_func, daemon=True).start() | ||||
|  | ||||
|     return await future | ||||
|  | ||||
|  | ||||
async def ainput(prompt: str) -> str:
    """
    Like input(), but runs in a daemon thread and is awaited.
    """
    def read_line() -> str:
        return input(prompt)

    return await in_daemon_thread(read_line)
|  | ||||
|  | ||||
async def agetpass(prompt: str) -> str:
    """
    Like getpass.getpass(), but runs in a daemon thread and is awaited.
    """
    def read_password() -> str:
        return getpass.getpass(prompt)

    return await in_daemon_thread(read_password)
|  | ||||
|  | ||||
| async def prompt_yes_no(query: str, default: Optional[bool]) -> bool: | ||||
|     """ | ||||
|     Asks the user a yes/no question and returns their choice. | ||||
|     """ | ||||
|     return sys.stdout.isatty | ||||
|  | ||||
| class ContentTypeException(Exception): | ||||
|     pass | ||||
|     if default is True: | ||||
|         query += " [Y/n] " | ||||
|     elif default is False: | ||||
|         query += " [y/N] " | ||||
|     else: | ||||
|         query += " [y/n] " | ||||
|  | ||||
| class FileNotFoundException(Exception): | ||||
|     pass | ||||
|     while True: | ||||
|         response = (await ainput(query)).strip().lower() | ||||
|         if response == "y": | ||||
|             return True | ||||
|         elif response == "n": | ||||
|             return False | ||||
|         elif response == "" and default is not None: | ||||
|             return default | ||||
|  | ||||
| class PrettyLogger: | ||||
|         print("Please answer with 'y' or 'n'.") | ||||
|  | ||||
|     def __init__(self, logger): | ||||
|         self.logger = logger | ||||
|  | ||||
|     def modified_file(self, file_name): | ||||
|         self.logger.info(f"{Fore.MAGENTA}{Style.BRIGHT}Modified {file_name}.{Style.RESET_ALL}") | ||||
| def soupify(data: bytes) -> bs4.BeautifulSoup: | ||||
|     """ | ||||
|     Parses HTML to a beautifulsoup object. | ||||
|     """ | ||||
|  | ||||
|     def new_file(self, file_name): | ||||
|         self.logger.info(f"{Fore.GREEN}{Style.BRIGHT}Created {file_name}.{Style.RESET_ALL}") | ||||
|     return bs4.BeautifulSoup(data, "html.parser") | ||||
|  | ||||
|     def ignored_file(self, file_name): | ||||
|         self.logger.info(f"{Style.DIM}Ignored {file_name}.{Style.RESET_ALL}") | ||||
|  | ||||
|     def starting_synchronizer(self, target_directory, synchronizer_name, subject=None): | ||||
|         subject_str = f"{subject} " if subject else "" | ||||
|         self.logger.info("") | ||||
|         self.logger.info(( | ||||
|             f"{Fore.CYAN}{Style.BRIGHT}Synchronizing {subject_str}to {target_directory}" | ||||
|             f" using the {synchronizer_name} synchronizer.{Style.RESET_ALL}" | ||||
|         )) | ||||
def url_set_query_param(url: str, param: str, value: str) -> str:
    """
    Returns the url with the query parameter set to value. All existing
    values of a parameter with the same name are replaced.
    """
    parts = urlsplit(url)

    query_parameters = parse_qs(parts.query)
    query_parameters[param] = [value]

    # doseq is required because parse_qs maps every name to a list of values
    updated = parts._replace(query=urlencode(query_parameters, doseq=True))
    return urlunsplit(updated)
|  | ||||
|  | ||||
def url_set_query_params(url: str, params: Dict[str, str]) -> str:
    """
    Returns the url with all passed query parameters set. Existing parameters
    with the same names are overwritten.
    """
    # Apply the parameters one after the other, in the order of the dict
    for name, value in params.items():
        url = url_set_query_param(url, name, value)
    return url
|  | ||||
|  | ||||
def str_path(path: PurePath) -> str:
    """
    Returns the path as a string with forward slashes as separators,
    regardless of the path flavour. The empty path is returned as ".".
    """
    # as_posix() gives the same result as joining the parts with "/" for
    # relative paths, but unlike the join it does not duplicate the separator
    # after the root of an absolute path ("//a/b") or after a drive.
    return path.as_posix()
|  | ||||
|  | ||||
def fmt_path(path: PurePath) -> str:
    """
    Returns the repr() of the path's str_path() representation.
    """
    return f"{str_path(path)!r}"
|  | ||||
|  | ||||
def fmt_real_path(path: Path) -> str:
    """
    Returns the repr() of the path after making it absolute.
    """
    absolute = str(path.absolute())
    return f"{absolute!r}"
|  | ||||
|  | ||||
class ReusableAsyncContextManager(ABC, Generic[T]):
    """
    Base class for async context managers that can be entered repeatedly,
    but not while they are already entered.

    Subclasses implement _on_aenter(). The AsyncExitStack in self._stack is
    entered before _on_aenter() is called and is exited in __aexit__().
    """

    def __init__(self) -> None:
        self._active = False
        self._stack = AsyncExitStack()

    @abstractmethod
    async def _on_aenter(self) -> T:
        """
        Produces the value that "async with" binds.
        """

    async def __aenter__(self) -> T:
        if self._active:
            raise RuntimeError("Nested or otherwise concurrent usage is not allowed")

        self._active = True
        await self._stack.__aenter__()

        # If entering fails, __aexit__ has to be called by hand because
        # "async with" only calls it once __aenter__ has succeeded.
        # See https://stackoverflow.com/a/13075071
        try:
            result: T = await self._on_aenter()
        except BaseException:
            suppressed = await self.__aexit__(*sys.exc_info())
            if not suppressed:
                raise

        return result

    async def __aexit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc_value: Optional[BaseException],
            traceback: Optional[TracebackType],
    ) -> Optional[bool]:
        if not self._active:
            raise RuntimeError("__aexit__ called too many times")

        # Only marked as inactive if exiting the stack did not raise
        suppressed = await self._stack.__aexit__(exc_type, exc_value, traceback)
        self._active = False
        return suppressed
|   | ||||
							
								
								
									
										2
									
								
								PFERD/version.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								PFERD/version.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | ||||
| NAME = "PFERD" | ||||
| VERSION = "3.4.1" | ||||
							
								
								
									
										146
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										146
									
								
								README.md
									
									
									
									
									
								
							| @@ -2,39 +2,143 @@ | ||||
|  | ||||
| **P**rogramm zum **F**lotten, **E**infachen **R**unterladen von **D**ateien | ||||
|  | ||||
| Other resources: | ||||
|  | ||||
| - [Config file format](CONFIG.md) | ||||
| - [Changelog](CHANGELOG.md) | ||||
| - [Development Guide](DEV.md) | ||||
|  | ||||
| ## Installation | ||||
|  | ||||
| Ensure that you have at least Python 3.7 installed (3.6 might also work, didn't | ||||
| test it though). | ||||
| ### Direct download | ||||
|  | ||||
| Binaries for Linux, Windows and Mac can be downloaded directly from the | ||||
| [latest release](https://github.com/Garmelon/PFERD/releases/latest). | ||||
|  | ||||
| ### With pip | ||||
|  | ||||
| Ensure you have at least Python 3.9 installed. Run the following command to | ||||
| install PFERD or upgrade it to the latest version: | ||||
|  | ||||
| To install PFERD or update your installation to the latest version, run this | ||||
| wherever you want to install/have installed PFERD: | ||||
| ``` | ||||
| $ pip install git+https://github.com/Garmelon/PFERD@v1.1.6 | ||||
| $ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest | ||||
| ``` | ||||
|  | ||||
| The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. | ||||
|  | ||||
| ## Example setup | ||||
| ### With package managers | ||||
|  | ||||
| In this example, `python3` refers to at least Python 3.7. | ||||
| Unofficial packages are available for: | ||||
| - [AUR](https://aur.archlinux.org/packages/pferd) | ||||
| - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) | ||||
|  | ||||
| See also PFERD's [repology page](https://repology.org/project/pferd/versions). | ||||
|  | ||||
| ## Basic usage | ||||
|  | ||||
| PFERD can be run directly from the command line with no config file. Run `pferd | ||||
| -h` to get an overview of available commands and options. Run `pferd <command> | ||||
| -h` to see which options a command has. | ||||
|  | ||||
| For example, you can download your personal desktop from the KIT ILIAS like | ||||
| this: | ||||
|  | ||||
| A full example setup and initial use could look like: | ||||
| ``` | ||||
| $ mkdir Vorlesungen | ||||
| $ cd Vorlesungen | ||||
| $ python3 -m venv . | ||||
| $ . bin/activate | ||||
| $ pip install git+https://github.com/Garmelon/PFERD@v1.1.6 | ||||
| $ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/master/example_config.py | ||||
| $ python3 example_config.py | ||||
| $ deactivate | ||||
| $ pferd kit-ilias-web desktop <output_directory> | ||||
| ``` | ||||
|  | ||||
| Subsequent runs of the program might look like: | ||||
| Also, you can download most ILIAS pages directly like this: | ||||
|  | ||||
| ``` | ||||
| $ cd Vorlesungen | ||||
| $ . bin/activate | ||||
| $ python3 example_config.py | ||||
| $ deactivate | ||||
| $ pferd kit-ilias-web <url> <output_directory> | ||||
| ``` | ||||
|  | ||||
| However, the CLI only lets you download a single thing at a time, and the | ||||
| resulting command can grow long quite quickly. Because of this, PFERD can also | ||||
| be used with a config file. | ||||
|  | ||||
| To get started, just take a command you've been using and add `--dump-config` | ||||
| directly after `pferd`, like this: | ||||
|  | ||||
| ``` | ||||
| $ pferd --dump-config kit-ilias-web <url> <output_directory> | ||||
| ``` | ||||
|  | ||||
| This will make PFERD write its current configuration to its default config file | ||||
| path. You can then run `pferd` without a command and it will execute the config | ||||
| file. Alternatively, you can use `--dump-config-to` and specify a path yourself. | ||||
| Using `--dump-config-to -` will print the configuration to stdout instead of a | ||||
| file, which is a good way to see what is actually going on when using a CLI | ||||
| command. | ||||
|  | ||||
| Another good way to see what PFERD is doing is the `--explain` option. When | ||||
| enabled, PFERD explains in detail what it is doing and why. This can help with | ||||
| debugging your own config. | ||||
|  | ||||
| If you don't want to run all crawlers from your config file, you can specify the | ||||
| crawlers you want to run with `--crawler` or `-C`, like this: | ||||
|  | ||||
| ``` | ||||
| $ pferd -C crawler1 -C crawler2 | ||||
| ``` | ||||
|  | ||||
| ## Advanced usage | ||||
|  | ||||
| PFERD supports lots of different options. For example, you can configure PFERD | ||||
| to [use your system's keyring](CONFIG.md#the-keyring-authenticator) instead of | ||||
| prompting you for your username and password. PFERD also supports | ||||
| [transformation rules](CONFIG.md#transformation-rules) that let you rename or | ||||
| exclude certain files. | ||||
|  | ||||
| For more details, see the comprehensive [config format documentation](CONFIG.md). | ||||
|  | ||||
| ## Example | ||||
|  | ||||
| This example downloads a few courses from the KIT ILIAS with a common keyring | ||||
| authenticator. It reorganizes and ignores some files. | ||||
|  | ||||
| ```ini | ||||
| [DEFAULT] | ||||
| # All paths will be relative to this. | ||||
| # The crawler output directories will be <working_dir>/Foo and <working_dir>/Bar. | ||||
| working_dir = ~/stud | ||||
| # If files vanish from ILIAS the local files are not deleted, allowing us to | ||||
| # take a look at them before deleting them ourselves. | ||||
| on_conflict = no-delete | ||||
|  | ||||
| [auth:ilias] | ||||
| type = keyring | ||||
| username = foo | ||||
|  | ||||
| [crawl:Foo] | ||||
| type = kit-ilias-web | ||||
| auth = auth:ilias | ||||
| # Crawl a course by its ID (found as `ref_id=ID` in the URL) | ||||
| target = 1234567 | ||||
|  | ||||
| # Plaintext files are easier to read by other tools | ||||
| links = plaintext | ||||
|  | ||||
| transform = | ||||
|   # Ignore unneeded folders | ||||
|   Online-Tests --> ! | ||||
|   Vorlesungswerbung --> ! | ||||
|  | ||||
|   # Rename folders | ||||
|   Lehrbücher --> Vorlesung | ||||
|   # Note the ">>" arrow head which lets us apply further rules to files moved to "Übung" | ||||
|   Übungsunterlagen -->> Übung | ||||
|  | ||||
|   # Move exercises to own folder. Rename them to "Blatt-XX.pdf" to make them sort properly | ||||
|   "Übung/(\d+). Übungsblatt.pdf" -re-> Blätter/Blatt-{i1:02}.pdf | ||||
|   # Move solutions to own folder. Rename them to "Blatt-XX-Lösung.pdf" to make them sort properly | ||||
|   "Übung/(\d+). Übungsblatt.*Musterlösung.pdf" -re-> Blätter/Blatt-{i1:02}-Lösung.pdf | ||||
|  | ||||
|   # The course has nested folders with the same name - flatten them | ||||
|   "Übung/(.+?)/\\1" -re-> Übung/{g1} | ||||
|  | ||||
| [crawl:Bar] | ||||
| type = kit-ilias-web | ||||
| auth = auth:ilias | ||||
| target = 1337420 | ||||
| ``` | ||||
|   | ||||
| @@ -1,342 +0,0 @@ | ||||
| #!/bin/env python3 | ||||
|  | ||||
| import re | ||||
| import sys | ||||
|  | ||||
| import PFERD | ||||
| from PFERD.utils import get_base_dir, move, rename | ||||
|  | ||||
| #PFERD.enable_logging(logging.DEBUG) | ||||
| PFERD.enable_logging() | ||||
|  | ||||
| base_dir = get_base_dir(__file__) | ||||
|  | ||||
| # Semester 1 | ||||
|  | ||||
def gbi_filter(path):
    """Return whether *path* should be kept for the GBI course.

    Everything outside "Tutoriumsfolien" is kept. Inside it, only the folder
    itself and the contents of "Tutorium 15" are kept.
    """
    parts = path.parts
    if parts[:1] != ("Tutoriumsfolien",):
        return True

    # Filter out all tutorials except our own
    return len(parts) == 1 or parts[1] == "Tutorium 15"
|  | ||||
def gbi_transform(path):
    """Map a remote GBI path to its local location.

    The rules are tried in order and the first matching one decides the
    result; paths that match no rule are returned unchanged.
    """
    # Exercise sheets go to Blätter/blatt_xx.pdf, solutions to Blätter/loesung_xx.pdf
    sheet = move(path, ("Übungsblätter",), ("Blätter",))
    if sheet is not None:
        assignment = re.match(r"(\d+).aufgaben.pdf", sheet.name)
        if assignment:
            return rename(sheet, f"blatt_{int(assignment.group(1)):02}.pdf")

        solution = re.match(r"(\d+).loesungen.pdf", sheet.name)
        if solution:
            return rename(sheet, f"loesung_{int(solution.group(1)):02}.pdf")

        return sheet

    # Lecture slides go to Folien/*
    slides = move(path, ("Vorlesung: Folien",), ("Folien",))
    if slides is not None:
        return slides

    # Lecture notes go to Skripte/*
    script = move(path, ("Vorlesung: Skript",), ("Skripte",))
    if script is not None:
        if script.name == "k-21-relationen-skript.pdf":
            return rename(script, "21-relationen-skript.pdf")
        return script

    # Exercise slides go to Übung/*
    exercise = move(path, ("große Übung: Folien",), ("Übung",))
    if exercise is not None:
        return exercise

    # Tutorial slides go to Tutorium/*
    tutorial = move(path, ("Tutoriumsfolien", "Tutorium 15"), ("Tutorium",))
    if tutorial is not None:
        fixed_names = {
            "GBI_Tut_2 (1).pdf": "GBI_Tut_2.pdf",
            "GBI_Tut_7 (1).pdf": "GBI_Tut_7.pdf",
        }
        if tutorial.name in fixed_names:
            return rename(tutorial, fixed_names[tutorial.name])
        return tutorial

    return path
|  | ||||
def hm1_transform(path):
    """Sort HM1 sheets and solutions into Blätter/ with zero-padded numbers.

    Paths whose file name matches neither pattern are returned unchanged.
    """
    sheet = re.match(r"blatt(\d+).pdf", path.name)
    if sheet:
        target = move(path, (), ("Blätter",))
        return rename(target, f"blatt_{int(sheet.group(1)):02}.pdf")

    solution = re.match(r"blatt(\d+).loesungen.pdf", path.name)
    if solution:
        target = move(path, (), ("Blätter",))
        return rename(target, f"loesung_{int(solution.group(1)):02}.pdf")

    return path
|  | ||||
def la1_filter(path):
    """Return whether *path* should be kept for the LA1 course.

    Everything outside "Tutorien" is kept. Inside it, only the folder itself
    and the two listed tutorials are kept.
    """
    parts = path.parts
    if parts[:1] != ("Tutorien",):
        return True

    # Filter out all tutorials except the two we attend
    if len(parts) == 1:
        return True
    return parts[1] in (
        "Tutorium 03 - Philipp Faller",
        "Tutorium 23 - Sebastian Faller",
    )
|  | ||||
def la1_transform(path):
    """Map a remote LA1 path to its local location.

    The rules are tried in order and the first matching one decides the
    result. One specific file is dropped by returning ``None``; paths that
    match no rule are returned unchanged.
    """
    # All exercise sheets go to Blätter/blatt_xx.pdf,
    # all other exercise material to Blätter/*
    sheet = move(path, ("Übungen",), ("Blätter",))
    if sheet is not None:
        numbered = re.match(r"Blatt(\d+).pdf", sheet.name)
        if numbered:
            return rename(sheet, f"blatt_{int(numbered.group(1)):02}.pdf")

        if sheet.name == "Lösungen zu Blatt 1, Aufgabe 1 und Blatt 2, Aufgabe 4..pdf":
            return rename(sheet, "Lösungen zu Blatt 1, Aufgabe 1 und Blatt 2, Aufgabe 4.pdf")

        return sheet

    # All of Philipp's tutorial material goes to Tutorium/Philipp/*
    philipp = move(path, ("Tutorien", "Tutorium 03 - Philipp Faller"), ("Tutorium", "Philipp"))
    if philipp is not None:
        if philipp.name == "tut2.pdf":
            return rename(philipp, "Tut2.pdf")
        return philipp

    # All of Sebastian's tutorial material goes to Tutorium/Sebastian/*
    sebastian_src = ("Tutorien", "Tutorium 23 - Sebastian Faller")
    sebastian_dst = ("Tutorium", "Sebastian")
    relocations = (
        (("Tutorium 1",), ("tut01",)),
        (("Tutorium 2", "aufgaben.pdf"), ("tut02.pdf",)),
        (("Tutorium 3", "aufgaben.pdf"), ("tut03.pdf",)),
        (("Tutorium 4", "aufgaben.pdf"), ("tut04.pdf",)),
        (("Tutorium 5", "aufgaben.pdf"), ("tut05.pdf",)),
        (("Tutorium 6", "aufgaben.pdf"), ("tut06.pdf",)),
        (("Tutorium 7", "tut7.pdf"), ("tut07.pdf",)),
        (("Tutorium 8", "tut8.pdf"), ("tut08.pdf",)),
        (("Tutorium 9", "tut9.pdf"), ("tut09.pdf",)),
    )
    for src_tail, dst_tail in relocations:
        relocated = move(path, sebastian_src + src_tail, sebastian_dst + dst_tail)
        if relocated is not None:
            return relocated

    # This particular file is mapped to None instead of a new location
    if path.parts == sebastian_src + ("Tutorium 10", "tut10.pdf"):
        return None

    sebastian = move(path, sebastian_src, sebastian_dst)
    if sebastian is not None:
        return sebastian

    # Exercise session material goes to Übung/*
    exercise = move(path, ("Informatikervorlesung", "Übungsfolien"), ("Übung",))
    if exercise is not None:
        if exercise.name == "Übung_06_ausgewählte Folien.pdf":
            return rename(exercise, "Übung_06_ausgewählte_Folien.pdf")
        return exercise

    # Lecture slide material goes to Folien/*
    slides = move(path, ("Informatikervorlesung", "Folien.Notizen"), ("Folien",))
    if slides is not None:
        return slides

    # Everything else from the lecture goes to the top-level directory
    rest = move(path, ("Informatikervorlesung",), ())
    if rest is not None:
        # Rename filenames that are invalid on FAT systems
        fat_safe_names = {
            "Evaluationsergebnisse: Übung.pdf": "Evaluationsergebnisse_Übung.pdf",
            "Skript \"Lineare Algebra\" von Stefan Kühnlein.pdf": "Skript Lineare Algebra von Stefan kühnlein.pdf",
        }
        if rest.name in fat_safe_names:
            return rename(rest, fat_safe_names[rest.name])
        return rest

    return path
|  | ||||
def prog_filter(path):
    """Return whether *path* should be kept: everything except "Tutorien"."""
    # Filter out all tutorials
    return path.parts[:1] != ("Tutorien",)
|  | ||||
def prog_transform(path):
    """Map a remote programming-course path to its local location.

    Exercise sheets go to Blätter/, lecture material to Folien/; a few
    badly named files are renamed. Other paths are returned unchanged.
    """
    # Exercise sheets go to Blätter/*
    sheet = move(path, ("Übungen",), ("Blätter",))
    if sheet is not None:
        if sheet.name == "assignmen04.pdf":
            return rename(sheet, "assignment04.pdf")
        return sheet

    # Slides go to Folien/*
    slides = move(path, ("Vorlesungsmaterial",), ("Folien",))
    if slides is not None:
        fixed_names = {
            "00.1_Begruessung.pdf": "00-01_Begruessung.pdf",
            "00.2_Organisatorisches.pdf": "00-02_Organisatorisches.pdf",
            "01-01_ Einfache-Programme.pdf": "01-01_Einfache_Programme.pdf",
            "13_Finden_und_ Beheben_von_Fehlern.pdf": "13_Finden_und_Beheben_von_Fehlern.pdf",
        }
        if slides.name in fixed_names:
            return rename(slides, fixed_names[slides.name])
        return slides

    return path
|  | ||||
| # Semester 2 | ||||
|  | ||||
def algo1_filter(path):
    """Return whether *path* should be kept for the Algo1 course.

    Everything outside "Tutorien" is kept; of "Tutorien" only the folder
    itself is kept (no individual tutorial is whitelisted).
    """
    parts = path.parts
    # Filter out all tutorials
    return parts[:1] != ("Tutorien",) or len(parts) == 1
|  | ||||
def algo1_transform(path):
    """Move lecture slides to Folien/*; return other paths unchanged."""
    slides = move(path, ("Vorlesungsfolien",), ("Folien",))
    return path if slides is None else slides
|  | ||||
def hm2_transform(path):
    """Sort HM2 sheets and solutions into Blätter/ with zero-padded numbers.

    Paths whose file name matches neither pattern are returned unchanged.
    """
    sheet = re.match(r"blatt(\d+).pdf", path.name)
    if sheet:
        target = move(path, (), ("Blätter",))
        return rename(target, f"blatt_{int(sheet.group(1)):02}.pdf")

    solution = re.match(r"blatt(\d+).loesungen.pdf", path.name)
    if solution:
        target = move(path, (), ("Blätter",))
        return rename(target, f"loesung_{int(solution.group(1)):02}.pdf")

    return path
|  | ||||
def la2_filter(path):
    """Return whether *path* should be kept for the LA2 course.

    Everything outside "Tutorien" is kept; of "Tutorien" only the folder
    itself is kept (no individual tutorial is whitelisted).
    """
    parts = path.parts
    # Filter out all tutorials
    return parts[:1] != ("Tutorien",) or len(parts) == 1
|  | ||||
def la2_transform(path):
    """Map a remote LA2 path to its local location.

    Lecture material goes to Folien/, exercise material to Blätter/ (numbered
    sheets as blatt_xx.pdf). Other paths are returned unchanged.
    """
    # Slides go to Folien/*
    slides = move(path, ("Vorlesungsmaterial",), ("Folien",))
    if slides is not None:
        return slides

    # All exercise sheets go to Blätter/blatt_xx.pdf,
    # all other exercise material to Blätter/*
    sheet = move(path, ("Übungen",), ("Blätter",))
    if sheet is None:
        return path

    numbered = re.match(r"Blatt(\d+).pdf", sheet.name)
    if numbered:
        return rename(sheet, f"blatt_{int(numbered.group(1)):02}.pdf")

    return sheet
|  | ||||
def swt1_filter(path):
    """Return whether *path* should be kept for the SWT1 course.

    Everything outside "Tutorien" is kept; of "Tutorien" only the folder
    itself is kept (no individual tutorial is whitelisted).
    """
    parts = path.parts
    # Filter out all tutorials
    return parts[:1] != ("Tutorien",) or len(parts) == 1
|  | ||||
def swt1_transform(path):
    """Move SWT1 lecture material to Folien/* and exercises to Blätter/*.

    Paths matching neither rule are returned unchanged.
    """
    relocations = (
        (("Vorlesungsmaterial",), ("Folien",)),  # Slides go to Folien/*
        (("Übungen",), ("Blätter",)),  # Exercise sheets go to Blätter/*
    )
    for source, target in relocations:
        relocated = move(path, source, target)
        if relocated is not None:
            return relocated

    return path
|  | ||||
| # Main part of the config | ||||
|  | ||||
def main(args):
    """Synchronize the selected courses, or all of them if *args* is empty.

    *args* holds course keys (compared case-insensitively), e.g. ``gbi`` or
    ``hm2``.
    """
    selected = [arg.lower() for arg in args]

    def wanted(course):
        # No selection at all means "synchronize everything"
        return not selected or course in selected

    ffm = PFERD.FfM(base_dir)
    ilias = PFERD.Ilias(base_dir, "cookie_jar")
    norbert = PFERD.Norbert(base_dir)

    # Semester 1

    if wanted("gbi"):
        ilias.synchronize("855240", "GBI", transform=gbi_transform, filter=gbi_filter)

    if wanted("hm1"):
        ffm.synchronize("iana2/lehre/hm1info2018w", "HM1", transform=hm1_transform)

    if wanted("la1"):
        ilias.synchronize("874938", "LA1", transform=la1_transform, filter=la1_filter)

    if wanted("prog"):
        ilias.synchronize("851237", "Prog", transform=prog_transform, filter=prog_filter)

    if wanted("norbert"):
        norbert.synchronize("Prog-Tut")

    # Semester 2

    if wanted("algo1"):
        ilias.synchronize("959260", "Algo1", transform=algo1_transform, filter=algo1_filter)

    if wanted("hm2"):
        ffm.synchronize("iana2/lehre/hm2info2019s", "HM2", transform=hm2_transform)

    if wanted("la2"):
        ilias.synchronize("950588", "LA2", transform=la2_transform, filter=la2_filter)

    if wanted("swt1"):
        ilias.synchronize("945596", "SWT1", transform=swt1_transform, filter=swt1_filter)
|  | ||||
| if __name__ == "__main__": | ||||
|     args = sys.argv[1:] | ||||
|     main(args) | ||||
							
								
								
									
										11
									
								
								mypy.ini
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								mypy.ini
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,11 @@ | ||||
| [mypy] | ||||
| disallow_any_generics = True | ||||
| disallow_untyped_defs = True | ||||
| disallow_incomplete_defs = True | ||||
| no_implicit_optional = True | ||||
| warn_unused_ignores = True | ||||
| warn_unreachable = True | ||||
| show_error_context = True | ||||
|  | ||||
| [mypy-rich.*,bs4,keyring] | ||||
| ignore_missing_imports = True | ||||
							
								
								
									
										6
									
								
								pferd.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								pferd.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,6 @@ | ||||
| # File used by pyinstaller to create the executable | ||||
|  | ||||
| from PFERD.__main__ import main | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
										3
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | ||||
| [build-system] | ||||
| requires = ["setuptools", "wheel"] | ||||
| build-backend = "setuptools.build_meta" | ||||
							
								
								
									
										5
									
								
								scripts/build
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										5
									
								
								scripts/build
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,5 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -e | ||||
|  | ||||
| pyinstaller --onefile pferd.py | ||||
							
								
								
									
										111
									
								
								scripts/bump-version
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										111
									
								
								scripts/bump-version
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,111 @@ | ||||
| #!/usr/bin/env python3 | ||||
|  | ||||
| import argparse | ||||
| import time | ||||
| import re | ||||
| from subprocess import run | ||||
|  | ||||
|  | ||||
def load_changelog():
    """Return the lines of ``CHANGELOG.md`` (line endings kept) as a list."""
    with open("CHANGELOG.md") as changelog:
        return changelog.readlines()
|  | ||||
|  | ||||
def extract_changes(lines):
    """Return the lines of the changelog's "## Unreleased" section.

    Args:
        lines: Iterable of changelog lines, each including its line ending.

    Returns:
        The section's lines as a list. The line directly below the heading is
        skipped, "### " sub-heading markers are stripped and trailing blank
        lines are removed.

    Raises:
        ValueError: if there is no "## Unreleased" heading.
    """
    lines = iter(lines)
    changes = []

    # Find "Unreleased" section
    for line in lines:
        if line.strip() == "## Unreleased":
            break
    else:
        # Without this, the next() below would fail with a bare StopIteration
        raise ValueError('No "## Unreleased" section found in the changelog')

    # Skip the line directly below the heading; the default keeps this from
    # raising when the heading is the last line of the file
    next(lines, None)

    # Read all lines from that section
    for line in lines:
        if line.startswith("## "):
            # Found the beginning of the next section
            break
        elif line.startswith("### "):
            # Found a heading in the current section
            # Remove "#" symbols so git doesn't interpret the line as a comment later
            changes.append(line[4:])
        else:
            changes.append(line)

    # Remove trailing empty lines
    while changes and not changes[-1].strip():
        changes.pop()

    return changes
|  | ||||
|  | ||||
def update_version(version):
    """Rewrite the ``VERSION = "..."`` assignment in ``PFERD/version.py``."""
    version_file = "PFERD/version.py"

    with open(version_file) as f:
        old_source = f.read()

    new_source = re.sub(r'VERSION = ".*"', f'VERSION = "{version}"', old_source)

    with open(version_file, "w") as f:
        f.write(new_source)
|  | ||||
|  | ||||
def update_changelog(lines, version, date):
    """Write ``CHANGELOG.md`` with a new release heading below "## Unreleased".

    Everything up to and including the "## Unreleased" line is copied, then a
    blank line and a "## <version> - <date>" heading are inserted, followed by
    the remaining lines.
    """
    remaining = iter(lines)
    output = []

    # Copy everything up to and including the "Unreleased" heading
    for line in remaining:
        output.append(line)
        if line.strip() == "## Unreleased":
            break

    # Add new heading below that
    output += ["\n", f"## {version} - {date}\n"]

    # Add remaining lines
    output.extend(remaining)

    with open("CHANGELOG.md", "w") as f:
        f.writelines(output)
|  | ||||
|  | ||||
def commit_changes(version):
    """Stage the changelog and version file and commit them.

    Raises:
        subprocess.CalledProcessError: if a git command exits non-zero.
    """
    # check=True aborts the release instead of carrying on after a failed
    # git command (e.g. tagging the wrong commit because the commit failed)
    run(["git", "add", "CHANGELOG.md", "PFERD/version.py"], check=True)
    run(["git", "commit", "-m", f"Bump version to {version}"], check=True)
|  | ||||
|  | ||||
def create_tag(version, annotation):
    """Create the annotated tag ``v<version>`` with *annotation* as its message.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero (for example
            because the tag already exists).
    """
    # check=True so that a failed tag is not silently ignored
    run(["git", "tag", "-am", annotation, f"v{version}"], check=True)
|  | ||||
|  | ||||
def fastforward_latest():
    """Point the ``latest`` branch at the current ``HEAD``.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    # check=True so that a failed branch update is not silently ignored
    run(["git", "branch", "-f", "latest", "HEAD"], check=True)
|  | ||||
|  | ||||
def main():
    """Bump the version: update files, commit, tag and move ``latest``.

    The new version is taken from the single positional command-line
    argument. Nothing is pushed; the required push command is printed at
    the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("version")
    version = parser.parse_args().version

    today = time.strftime("%Y-%m-%d")
    changelog_lines = load_changelog()
    # The unreleased changes become the body of the tag annotation
    release_notes = "".join(extract_changes(changelog_lines))
    annotation = f"Version {version} - {today}\n\n{release_notes}"

    update_version(version)
    update_changelog(changelog_lines, version, today)
    commit_changes(version)
    create_tag(version, annotation)
    fastforward_latest()

    print()
    print("Now the only thing left is to publish the changes:")
    print(f"  $ git push origin master latest v{version}")
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
										6
									
								
								scripts/check
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										6
									
								
								scripts/check
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,6 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -e | ||||
|  | ||||
| mypy PFERD | ||||
| flake8 PFERD | ||||
							
								
								
									
										6
									
								
								scripts/format
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										6
									
								
								scripts/format
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,6 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -e | ||||
|  | ||||
| autopep8 --recursive --in-place PFERD | ||||
| isort PFERD | ||||
							
								
								
									
										17
									
								
								scripts/setup
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										17
									
								
								scripts/setup
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,17 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -e | ||||
|  | ||||
| # Updating pip and setuptools because some older versions don't recognize the | ||||
| # project setup correctly | ||||
| if [[ $1 != '--no-pip' ]]; then | ||||
|     pip install --upgrade pip | ||||
| fi | ||||
| pip install --upgrade setuptools | ||||
|  | ||||
| # Installing PFERD itself | ||||
| pip install --editable . | ||||
|  | ||||
| # Installing tools and type hints | ||||
| pip install --upgrade mypy flake8 autopep8 isort pyinstaller | ||||
| pip install --upgrade types-chardet types-certifi | ||||
							
								
								
									
										23
									
								
								setup.cfg
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								setup.cfg
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | ||||
| [metadata] | ||||
| name = PFERD | ||||
| version = attr: PFERD.version.VERSION | ||||
|  | ||||
| [options] | ||||
| packages = find: | ||||
| python_requires = >=3.9 | ||||
| install_requires = | ||||
|   aiohttp>=3.8.1 | ||||
|   beautifulsoup4>=4.10.0 | ||||
|   rich>=11.0.0 | ||||
|   keyring>=23.5.0 | ||||
|   certifi>=2021.10.8 | ||||
|  | ||||
| [options.entry_points] | ||||
| console_scripts = | ||||
|   pferd = PFERD.__main__:main | ||||
|  | ||||
| [flake8] | ||||
| max_line_length = 110 | ||||
|  | ||||
| [isort] | ||||
| line_length = 110 | ||||
							
								
								
									
										16
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								setup.py
									
									
									
									
									
								
							| @@ -1,16 +0,0 @@ | ||||
| from setuptools import setup | ||||
|  | ||||
| setup( | ||||
|         name="PFERD", | ||||
|         version="1.1.6", | ||||
|         packages=["PFERD"], | ||||
|         install_requires=[ | ||||
|             "requests>=2.21.0", | ||||
|             "beautifulsoup4>=4.7.1", | ||||
|             "colorama>=0.4.1" | ||||
|         ], | ||||
| ) | ||||
|  | ||||
| # When updating the version, also: | ||||
| # - update the README.md installation instructions | ||||
| # - set a tag on the update commit | ||||
		Reference in New Issue
	
	Block a user