mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-10-24 18:42:32 +02:00 
			
		
		
		
	Compare commits
	
		
			694 Commits
		
	
	
		
			v1.1.1
			...
			fix/exerci
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | dd2fedf1a2 | ||
|   | 77a23265a9 | ||
|   | 4c230ef6dd | ||
|   | b305e1ce23 | ||
|   | bdf17f5c87 | ||
|   | 77fce7daf8 | ||
|   | 653bf139f0 | ||
|   | 3f60638d33 | ||
|   | b97b6fae6b | ||
|   | 477234ad0d | ||
|   | 63f25277b0 | ||
|   | c8eff04ae0 | ||
|   | edc482cdf4 | ||
|   | 72cd0f77e2 | ||
|   | be175f9347 | ||
|   | ba2833dba5 | ||
|   | 2f0e792670 | ||
|   | 5f88539f7e | ||
|   | bd9d7efe64 | ||
|   | 16a2dd5b15 | ||
|   | 678283d341 | ||
|   | 287173b0b1 | ||
|   | 712217e959 | ||
|   | 6dda4c55a8 | ||
|   | 596b6a7688 | ||
|   | 5983200247 | ||
|   | 26e802d88b | ||
|   | f5c4e82816 | ||
|   | f5273f7ca0 | ||
|   | fa71a9f44f | ||
|   | 81d6ff53c4 | ||
|   | d7a2b6e019 | ||
|   | 71c65e89d1 | ||
|   | c1046498e7 | ||
|   | 8fbd1978af | ||
|   | 739dd95850 | ||
|   | c54c3bcfa1 | ||
|   | d7f2229978 | ||
|   | 52fdeae752 | ||
|   | f9bb2e41cf | ||
|   | 4f9e2ab48d | ||
|   | 19beb8f07b | ||
|   | c897d9e2f5 | ||
|   | 21a266e302 | ||
|   | b29b6f93f8 | ||
|   | 318226d7cb | ||
|   | 422cf05f15 | ||
|   | 819c6673c7 | ||
|   | 89b44c69a7 | ||
|   | 4b4f72b2ca | ||
|   | 778517d8c6 | ||
|   | 428b0179fc | ||
|   | ade6309dd9 | ||
|   | fd6cb7b966 | ||
|   | 5c87517ceb | ||
|   | b01f093474 | ||
|   | 3a05b90525 | ||
|   | 7a00f73e0e | ||
|   | 5d0621420e | ||
|   | df98153169 | ||
|   | fc1f68ccd9 | ||
|   | 3e831c7e23 | ||
|   | bbcfe9c8dd | ||
|   | eb01aa86cb | ||
|   | 3db186a978 | ||
|   | 4a5959fd58 | ||
|   | 1cbc2b717a | ||
|   | da627ff929 | ||
|   | c1b592ac29 | ||
|   | eb0c956d32 | ||
|   | ab0cb2d956 | ||
|   | a117126389 | ||
|   | e9f8901520 | ||
|   | 266812f90e | ||
|   | 533bc27439 | ||
|   | 0113a0ca10 | ||
|   | 40f8a05ad6 | ||
|   | 50b50513c6 | ||
|   | df3514cd03 | ||
|   | ad53185247 | ||
|   | 87b67e9271 | ||
|   | b54b3b979c | ||
|   | 2184ac8040 | ||
|   | b3d412360b | ||
|   | dbc2553b11 | ||
|   | 68c398f1fe | ||
|   | 123a57beec | ||
|   | d204dac8ce | ||
|   | 443f7fe839 | ||
|   | 0294ceb7d5 | ||
|   | 6f30c6583d | ||
|   | 467fc526e8 | ||
|   | 722d2eb393 | ||
|   | 6d44aac278 | ||
|   | 55a2de6b88 | ||
|   | c0d6d8b229 | ||
|   | 635caa765d | ||
|   | e69b55b349 | ||
|   | 07200bbde5 | ||
|   | c020cccc64 | ||
|   | 259cfc20cc | ||
|   | 37b51a66d8 | ||
|   | f47d2f11d8 | ||
|   | 1b6be6bd79 | ||
|   | e1430e6298 | ||
|   | 5fdd40204b | ||
|   | fb4631ba18 | ||
|   | d72fc2760b | ||
|   | 4a51aaa4f5 | ||
|   | 66a5b1ba02 | ||
|   | aa5a3a10bc | ||
|   | d9b111cec2 | ||
|   | 345f52a1f6 | ||
|   | ed24366aba | ||
|   | 46fb782798 | ||
|   | 846c29aee1 | ||
|   | a5015fe9b1 | ||
|   | 616b0480f7 | ||
|   | 2f0e04ce13 | ||
|   | bcc537468c | ||
|   | 694ffb4d77 | ||
|   | af2cc1169a | ||
|   | bc3fa36637 | ||
|   | afbd03f777 | ||
|   | b8fe25c580 | ||
|   | a241672726 | ||
|   | a8f76e9be7 | ||
|   | b56475450d | ||
|   | aa74604d29 | ||
|   | d2e6d91880 | ||
|   | 602044ff1b | ||
|   | 31631fb409 | ||
|   | 00db348218 | ||
|   | a709280cbf | ||
|   | a99ddaa0cc | ||
|   | ba3d299c05 | ||
|   | 07a21f80a6 | ||
|   | f17b9b68f4 | ||
|   | a2831fbea2 | ||
|   | da72863b47 | ||
|   | 86e2e226dc | ||
|   | 7872fe5221 | ||
|   | 86947e4874 | ||
|   | 4f022e2d19 | ||
|   | f47e7374d2 | ||
|   | 57ec51e95a | ||
|   | 0045124a4e | ||
|   | 9618aae83b | ||
|   | 33453ede2d | ||
|   | e467b38d73 | ||
|   | e9d2d05030 | ||
|   | 4bf0c972e6 | ||
|   | 4ee919625d | ||
|   | d30f25ee97 | ||
|   | 10d9d74528 | ||
|   | 43c5453e10 | ||
|   | eb4de8ae0c | ||
|   | e32c1f000f | ||
|   | 5f527bc697 | ||
|   | ced8b9a2d0 | ||
|   | 6f3cfd4396 | ||
|   | 462d993fbc | ||
|   | a99356f2a2 | ||
|   | eac2e34161 | ||
|   | a82a0b19c2 | ||
|   | 90cb6e989b | ||
|   | 6289938d7c | ||
|   | 13b8c3d9c6 | ||
|   | 88afe64a92 | ||
|   | 6b2a657573 | ||
|   | d6f38a61e1 | ||
|   | ad3f4955f7 | ||
|   | e42ab83d32 | ||
|   | f9a3f9b9f2 | ||
|   | ef7d5ea2d3 | ||
|   | 55ea304ff3 | ||
|   | fee12b3d9e | ||
|   | 6673077397 | ||
|   | 742632ed8d | ||
|   | 544d45cbc5 | ||
|   | 86f79ff1f1 | ||
|   | ee67f9f472 | ||
|   | 8ec3f41251 | ||
|   | 89be07d4d3 | ||
|   | 91200f3684 | ||
|   | 9ffd603357 | ||
|   | 80eeb8fe97 | ||
|   | 75fde870c2 | ||
|   | 6e4d423c81 | ||
|   | 57aef26217 | ||
|   | 70ec64a48b | ||
|   | 70b33ecfd9 | ||
|   | 601e4b936b | ||
|   | a292c4c437 | ||
|   | bc65ea7ab6 | ||
|   | f28bbe6b0c | ||
|   | 61d902d715 | ||
|   | 8ab462fb87 | ||
|   | df3ad3d890 | ||
|   | fc31100a0f | ||
|   | 31b6311e99 | ||
|   | 1fc8e9eb7a | ||
|   | 85b9f45085 | ||
|   | f656e3ff34 | ||
|   | e1bda94329 | ||
|   | f6b26f4ead | ||
|   | 722970a255 | ||
|   | f40820c41f | ||
|   | 49ad1b6e46 | ||
|   | 1ce32d2f18 | ||
|   | 9d5ec84b91 | ||
|   | 1fba96abcb | ||
|   | 921cec7ddc | ||
|   | 7b062883f6 | ||
|   | 64a2960751 | ||
|   | 17879a7f69 | ||
|   | 1dd24551a5 | ||
|   | 84f775013f | ||
|   | b78eb64f3d | ||
|   | d65efed561 | ||
|   | 1ca6740e05 | ||
|   | 474aa7e1cc | ||
|   | 5beb4d9a2d | ||
|   | 19eed5bdff | ||
|   | 6fa9cfd4c3 | ||
|   | 80acc4b50d | ||
|   | 2c72a9112c | ||
|   | 17207546e9 | ||
|   | 533f75ea71 | ||
|   | adb5d4ade3 | ||
|   | a879c6ab6e | ||
|   | 915e42fd07 | ||
|   | 2d8dcc87ff | ||
|   | 66f0e398a1 | ||
|   | 30be4e29fa | ||
|   | 263780e6a3 | ||
|   | 07a75a37c3 | ||
|   | f85b75df8c | ||
|   | 6644126b5d | ||
|   | c665c36d88 | ||
|   | 519a7ef435 | ||
|   | a848194601 | ||
|   | aabce764ac | ||
|   | 5a331663e4 | ||
|   | 40144f8bd8 | ||
|   | f68849c65f | ||
|   | edb52a989e | ||
|   | 980578d05a | ||
|   | 486699cef3 | ||
|   | 0096a0c077 | ||
|   | d905e95dbb | ||
|   | 61430c8739 | ||
|   | eb8b915813 | ||
|   | 22c2259adb | ||
|   | c15a1aecdf | ||
|   | 16d50b6626 | ||
|   | 651b087932 | ||
|   | bce3dc384d | ||
|   | c21ddf225b | ||
|   | 4fefb98d71 | ||
|   | ffda4e43df | ||
|   | 69cb2a7734 | ||
|   | c33de233dc | ||
|   | 85f89a7ff3 | ||
|   | 9ce20216b5 | ||
|   | 1739c54091 | ||
|   | d8bd1f518a | ||
|   | 86ba47541b | ||
|   | 492ec6a932 | ||
|   | 342076ee0e | ||
|   | d44f6966c2 | ||
|   | 5c76193045 | ||
|   | 1c1f781be4 | ||
|   | c687d4a51a | ||
|   | fca62541ca | ||
|   | 3ab3581f84 | ||
|   | 8dd0689420 | ||
|   | be4b1040f8 | ||
|   | 79be6e1dc5 | ||
|   | edbd92dbbf | ||
|   | 27b5a8e490 | ||
|   | 1f400d5964 | ||
|   | 0ca0680165 | ||
|   | ce1dbda5b4 | ||
|   | 9cce78669f | ||
|   | 6ca0ecdf05 | ||
|   | 6e9f8fd391 | ||
|   | 2fdf24495b | ||
|   | bbf9f8f130 | ||
|   | 37f8d84a9c | ||
|   | 5edd868d5b | ||
|   | e4e5e83be6 | ||
|   | 74c7b39dc8 | ||
|   | 445dffc987 | ||
|   | d97d6bf147 | ||
|   | 79efdb56f7 | ||
|   | a9af56a5e9 | ||
|   | 59f13bb8d6 | ||
|   | 463f8830d7 | ||
|   | 05ad06fbc1 | ||
|   | 29d5a40c57 | ||
|   | c0cecf8363 | ||
|   | b998339002 | ||
|   | 245c9c3dcc | ||
|   | d8f26a789e | ||
|   | e1d18708b3 | ||
|   | b44b49476d | ||
|   | 7e0bb06259 | ||
|   | ecdedfa1cf | ||
|   | 3d4b997d4a | ||
|   | e81005ae4b | ||
|   | 33a81a5f5c | ||
|   | 25e2abdb03 | ||
|   | 803e5628a2 | ||
|   | c88f20859a | ||
|   | ec3767c545 | ||
|   | 729ff0a4c7 | ||
|   | 6fe51e258f | ||
|   | 44ecb2fbe7 | ||
|   | 53e031d9f6 | ||
|   | 8ac85ea0bd | ||
|   | adfdc302d7 | ||
|   | 3053278721 | ||
|   | 4d07de0d71 | ||
|   | 953a1bba93 | ||
|   | e724ff7c93 | ||
|   | 62f0f7bfc5 | ||
|   | 9cb2b68f09 | ||
|   | 1bbc0b705f | ||
|   | 662191eca9 | ||
|   | 8fad8edc1e | ||
|   | ae3d80664c | ||
|   | e21795ee35 | ||
|   | ec95dda18f | ||
|   | 098ac45758 | ||
|   | 9889ce6b57 | ||
|   | b4d97cd545 | ||
|   | afac22c562 | ||
|   | 552cd82802 | ||
|   | dfde0e2310 | ||
|   | 54dd2f8337 | ||
|   | b5785f260e | ||
|   | 98b8ca31fa | ||
|   | 4b104b6252 | ||
|   | 83d12fcf2d | ||
|   | e4f9560655 | ||
|   | 8cfa818f04 | ||
|   | 81301f3a76 | ||
|   | 2976b4d352 | ||
|   | 9f03702e69 | ||
|   | 3300886120 | ||
|   | 0d10752b5a | ||
|   | 92886fb8d8 | ||
|   | 5916626399 | ||
|   | a7c025fd86 | ||
|   | b7a999bc2e | ||
|   | 3851065500 | ||
|   | 4b68fa771f | ||
|   | 1525aa15a6 | ||
|   | db1219d4a9 | ||
|   | b8efcc2ca5 | ||
|   | 0bae009189 | ||
|   | 3efec53f51 | ||
|   | 8b76ebb3ef | ||
|   | 467ea3a37e | ||
|   | 2b6235dc78 | ||
|   | cd5aa61834 | ||
|   | 5ccb17622e | ||
|   | 1c226c31aa | ||
|   | 9ec0d3e16a | ||
|   | cf6903d109 | ||
|   | 9fd356d290 | ||
|   | 989032fe0c | ||
|   | 05573ccc53 | ||
|   | c454fabc9d | ||
|   | 7d323ec62b | ||
|   | c7494e32ce | ||
|   | 1123c8884d | ||
|   | e1104f888d | ||
|   | 8c32da7f19 | ||
|   | d63494908d | ||
|   | b70b62cef5 | ||
|   | 868f486922 | ||
|   | b2a2b5999b | ||
|   | 595de88d96 | ||
|   | a6fdf05ee9 | ||
|   | f897d7c2e1 | ||
|   | b0f731bf84 | ||
|   | 302b8c0c34 | ||
|   | acd674f0a0 | ||
|   | b0f9e1e8b4 | ||
|   | ed2e19a150 | ||
|   | 296a169dd3 | ||
|   | 1591cb9197 | ||
|   | 0c9167512c | ||
|   | a673ab0fae | ||
|   | 6e5fdf4e9e | ||
|   | 93a5a94dab | ||
|   | d565df27b3 | ||
|   | 961f40f9a1 | ||
|   | e3ee4e515d | ||
|   | 94d6a01cca | ||
|   | 38bb66a776 | ||
|   | 68781a88ab | ||
|   | 910462bb72 | ||
|   | 6bd6adb977 | ||
|   | 0acdee15a0 | ||
|   | c3ce6bb31c | ||
|   | 0459ed093e | ||
|   | d5f29f01c5 | ||
|   | 595ba8b7ab | ||
|   | cec0a8e1fc | ||
|   | f9b2fd60e2 | ||
|   | 60cd9873bc | ||
|   | 273d56c39a | ||
|   | 5497dd2827 | ||
|   | bbfdadc463 | ||
|   | fde811ae5a | ||
|   | 07e831218e | ||
|   | 91c33596da | ||
|   | a8dcf941b9 | ||
|   | e7a51decb0 | ||
|   | 9ec19be113 | ||
|   | f776186480 | ||
|   | 0096d83387 | ||
|   | 20a24dbcbf | ||
|   | 502654d853 | ||
|   | d2103d7c44 | ||
|   | d96a361325 | ||
|   | 2e85d26b6b | ||
|   | 6431a3fb3d | ||
|   | ac3bfd7388 | ||
|   | 3ea86d18a0 | ||
|   | bbc792f9fb | ||
|   | 7e127cd5cc | ||
|   | c4fb92c658 | ||
|   | 8da1ac6cee | ||
|   | a18db57e6f | ||
|   | b915e393dd | ||
|   | 3a74c23d09 | ||
|   | fbebc46c58 | ||
|   | 5595a908d8 | ||
|   | 27e4abcfa3 | ||
|   | c1ab7485e2 | ||
|   | 29cd5d1a3c | ||
|   | 6d5d9333ad | ||
|   | 7cc40595dc | ||
|   | 80ae5ddfaa | ||
|   | 4f480d117e | ||
|   | 1f2af3a290 | ||
|   | 14cdfb6a69 | ||
|   | e2bf84392b | ||
|   | 946b7a7931 | ||
|   | 9a9018751e | ||
|   | 83b75e8254 | ||
|   | 35c3fa205d | ||
|   | 0b606f02fa | ||
|   | fb78a6e98e | ||
|   | 5de68a0400 | ||
|   | f0562049b6 | ||
|   | 0e1077bb50 | ||
|   | c978e9edf4 | ||
|   | 2714ac6be6 | ||
|   | 9b048a9cfc | ||
|   | 1c2b6bf994 | ||
|   | ee39aaf08b | ||
|   | 93e6329901 | ||
|   | f47b137b59 | ||
|   | 83ea15ee83 | ||
|   | 75471c46d1 | ||
|   | 1e0343bba6 | ||
|   | 0f5e55648b | ||
|   | 57259e21f4 | ||
|   | 4ce385b262 | ||
|   | 2d64409542 | ||
|   | fcb3884a8f | ||
|   | 9f6dc56a7b | ||
|   | 56ab473611 | ||
|   | 6426060804 | ||
|   | 49a0ca7a7c | ||
|   | f3a4663491 | ||
|   | ecdbca8fb6 | ||
|   | 9cbea5fe06 | ||
|   | ba3c7f85fa | ||
|   | ba9215ebe8 | ||
|   | 8ebf0eab16 | ||
|   | cd90a60dee | ||
|   | 98834c9c95 | ||
|   | 55e9e719ad | ||
|   | a0ae9aee27 | ||
|   | 1486a63854 | ||
|   | 733e1ae136 | ||
|   | 4ac51048c1 | ||
|   | f2aba970fd | ||
|   | 9c4759103a | ||
|   | 316b9d7bf4 | ||
|   | 6f30adcd22 | ||
|   | 6f78fef604 | ||
|   | f830b42a36 | ||
|   | ef343dec7c | ||
|   | 0da2fafcd8 | ||
|   | f4abe3197c | ||
|   | 38d4f5b4c9 | ||
|   | 9ea03bda3e | ||
|   | 07de5bea8b | ||
|   | f0d572c110 | ||
|   | 076067e22d | ||
|   | ebb6e63c5c | ||
|   | 0c3f35a2d2 | ||
|   | 521890ae78 | ||
|   | 3f7c73df80 | ||
|   | 43100f69d5 | ||
|   | d73c778b0a | ||
|   | 73c3eb0984 | ||
|   | a519cbe05d | ||
|   | b3ad9783c4 | ||
|   | c1ccb6c53e | ||
|   | 51a713fa04 | ||
|   | 74ea039458 | ||
|   | aaa6a2b6a4 | ||
|   | e32a49480b | ||
|   | be65051f9d | ||
|   | 3387bc5f20 | ||
|   | 3f0ae729d6 | ||
|   | 8e8c1c031a | ||
|   | 55678d7fee | ||
|   | a57ee8b96b | ||
|   | e367da925e | ||
|   | 77a109bb7e | ||
|   | a3e1864a26 | ||
|   | 41cbcc509c | ||
|   | 77874b432b | ||
|   | 5c4c785e60 | ||
|   | 2aed4f6d1f | ||
|   | 34152fbe54 | ||
|   | 4047fe78f3 | ||
|   | c28347122e | ||
|   | 5b38ab8cf1 | ||
|   | bb25d32f03 | ||
|   | ecaedea709 | ||
|   | f05d1b1261 | ||
|   | 6aaa3071f9 | ||
|   | c26c9352f1 | ||
|   | d9ea688145 | ||
|   | e8be6e498e | ||
|   | e4b1fac045 | ||
|   | 402ae81335 | ||
|   | 52f31e2783 | ||
|   | 739522a151 | ||
|   | 6c034209b6 | ||
|   | f6fbd5e4bb | ||
|   | 7024db1f13 | ||
|   | 23bfa42a0d | ||
|   | fdb57884ed | ||
|   | f614b95a00 | ||
|   | 8198c9ecaa | ||
|   | 086b15d10f | ||
|   | 9d6ce331a5 | ||
|   | 821c7ade26 | ||
|   | b969a1854a | ||
|   | 62535b4452 | ||
|   | c0056e5669 | ||
|   | cfe4a8fc0a | ||
|   | 95b9248a25 | ||
|   | 1004fa40f8 | ||
|   | e8ddb0ca04 | ||
|   | 36c8785f15 | ||
|   | 03a801eecc | ||
|   | 072c6630bf | ||
|   | 4f56c8f192 | ||
|   | 4fdb67128d | ||
|   | a0f9d31d94 | ||
|   | e7b08420ba | ||
|   | c1b21f7772 | ||
|   | 9850ab1d73 | ||
|   | 9950144e97 | ||
|   | f6faacabb0 | ||
|   | 19c1e3ac6f | ||
|   | afa48c2d2d | ||
|   | a4c518bf4c | ||
|   | 057135022f | ||
|   | 755e9aa0d3 | ||
|   | c9deca19ca | ||
|   | bb048c3a6d | ||
|   | 33fcd307b2 | ||
|   | a0c5572b59 | ||
|   | 2d20d2934c | ||
|   | 2c48ab66d4 | ||
|   | 104b838aed | ||
|   | 7f10931be8 | ||
|   | 07c225bc20 | ||
|   | 56f2394001 | ||
|   | fdff8bc40e | ||
|   | bee3d70998 | ||
|   | 42345ecc61 | ||
|   | 920d521d68 | ||
|   | e0b46a306a | ||
|   | 8a42a2a396 | ||
|   | 80247400a4 | ||
|   | 13c5a29ff0 | ||
|   | 1aaa6e7ab5 | ||
|   | 7f53543324 | ||
|   | 292e516297 | ||
|   | 8258fa8919 | ||
|   | 5b929f09a2 | ||
|   | 4d32f863bc | ||
|   | 4e7333b396 | ||
|   | 4c0e3b493a | ||
|   | 2de079a5d3 | ||
|   | 509e624d47 | ||
|   | ca8fcf7a1d | ||
|   | 980f69b5af | ||
|   | 0b00a9c26b | ||
|   | 1ef85c45e5 | ||
|   | 5ef5a56e69 | ||
|   | f3f4be2690 | ||
|   | 076b8c5a1f | ||
|   | 13bc78c889 | ||
|   | dc964a9d98 | ||
|   | c2b14f3db9 | ||
|   | 4b59a7c375 | ||
|   | 3a57430893 | ||
|   | bef210ae77 | ||
|   | ea005517cf | ||
|   | 3841f27aab | ||
|   | df0eb84a44 | ||
|   | 2de4255a78 | ||
|   | 3c808879c9 | ||
|   | a051e3bcca | ||
|   | eb7df036df | ||
|   | 23db59e733 | ||
|   | ac65b06a8e | ||
|   | 8891041069 | ||
|   | 70d63e3e90 | ||
|   | b2a7af2e3e | ||
|   | 23bed48c8c | ||
|   | 0926d33798 | ||
|   | 55ba2f4070 | ||
|   | d18b48aaf4 | ||
|   | 4ef0ffe3bf | ||
|   | ce77995c8f | ||
|   | ed9245c14d | ||
|   | 01e6972c96 | ||
|   | 8181ae5b17 | ||
|   | 6407190ae0 | ||
|   | 87395faac2 | ||
|   | a9e6e7883d | ||
|   | 154d6b29dd | ||
|   | 62ac569ec4 | ||
|   | 9f1a0a58ab | ||
|   | 879a2c7c80 | ||
|   | ff06c5215e | ||
|   | 135a8dce4b | ||
|   | 63bbcad918 | ||
|   | 6584d6a905 | ||
|   | 5990098ef8 | ||
|   | f3d3d6bb65 | ||
|   | b2fe7cc064 | ||
|   | 930d821dd7 | ||
|   | 5c2ff14839 | ||
|   | a3d6dc7873 | ||
|   | 53ad1c924b | ||
|   | 8c431c7d81 | ||
|   | d5dd5aac06 | ||
|   | 7d48972967 | ||
|   | 25043a4aaa | ||
|   | 7ebeef5873 | ||
|   | 9b658776ca | ||
|   | cf3553175f | ||
|   | bf8b3cf9f7 | ||
|   | 4a5600d5ce | ||
|   | f5bc49160f | ||
|   | fa5f82d312 | ||
|   | 4433696509 | ||
|   | 1f5475abc5 | ||
|   | 1407c6d264 | ||
|   | e152bfc4a3 | ||
|   | 1973c931bd | ||
|   | 458cc1c6d6 | ||
|   | 52852d11a6 | ||
|   | f94629a7fa | ||
|   | c8ee456d33 | ||
|   | 2752e98621 | ||
|   | 1572e11da8 | ||
|   | ea01dc7cb2 | ||
|   | aba8d46d26 | ||
|   | 77056e6f8d | ||
|   | 064f12c14c | ||
|   | 2eb834afc3 | ||
|   | d468a45662 | ||
|   | 50e25346e5 | ||
|   | 67da4e69fa | ||
|   | da602366f8 | ||
|   | 2016f61bf8 | 
							
								
								
									
										10
									
								
								.github/dependabot.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								.github/dependabot.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | |||||||
|  | version: 2 | ||||||
|  | updates: | ||||||
|  |   - package-ecosystem: github-actions | ||||||
|  |     directory: / | ||||||
|  |     schedule: | ||||||
|  |       interval: monthly | ||||||
|  |     groups: | ||||||
|  |       gh-actions: | ||||||
|  |         patterns: | ||||||
|  |           - "*" | ||||||
							
								
								
									
										85
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										85
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,85 @@ | |||||||
|  | name: build-and-release | ||||||
|  |  | ||||||
|  | on: [push, pull_request] | ||||||
|  |  | ||||||
|  | defaults: | ||||||
|  |   run: | ||||||
|  |     shell: bash | ||||||
|  |  | ||||||
|  | jobs: | ||||||
|  |  | ||||||
|  |   build: | ||||||
|  |     runs-on: ${{ matrix.os }} | ||||||
|  |     strategy: | ||||||
|  |       fail-fast: false | ||||||
|  |       matrix: | ||||||
|  |         os: [ubuntu-latest, windows-latest, macos-13, macos-latest] | ||||||
|  |         python: ["3.11"] | ||||||
|  |     steps: | ||||||
|  |       - uses: actions/checkout@v4 | ||||||
|  |  | ||||||
|  |       - uses: actions/setup-python@v5 | ||||||
|  |         with: | ||||||
|  |           python-version: ${{ matrix.python }} | ||||||
|  |  | ||||||
|  |       - name: Set up project | ||||||
|  |         if: matrix.os != 'windows-latest' | ||||||
|  |         run: ./scripts/setup | ||||||
|  |  | ||||||
|  |       - name: Set up project on windows | ||||||
|  |         if: matrix.os == 'windows-latest' | ||||||
|  |         # For some reason, `pip install --upgrade pip` doesn't work on | ||||||
|  |         # 'windows-latest'. The installed pip version works fine however. | ||||||
|  |         run: ./scripts/setup --no-pip | ||||||
|  |  | ||||||
|  |       - name: Run checks | ||||||
|  |         run: | | ||||||
|  |           ./scripts/check | ||||||
|  |           ./scripts/format | ||||||
|  |  | ||||||
|  |       - name: Assert no changes | ||||||
|  |         run: git diff --exit-code | ||||||
|  |  | ||||||
|  |       - name: Build | ||||||
|  |         run: ./scripts/build | ||||||
|  |  | ||||||
|  |       - name: Rename binary | ||||||
|  |         # Glob in source location because on windows pyinstaller creates a file | ||||||
|  |         # named "pferd.exe" | ||||||
|  |         run: mv dist/pferd* dist/pferd-${{ matrix.os }} | ||||||
|  |  | ||||||
|  |       - name: Upload binary | ||||||
|  |         uses: actions/upload-artifact@v4 | ||||||
|  |         with: | ||||||
|  |           name: pferd-${{ matrix.os }} | ||||||
|  |           path: dist/pferd-${{ matrix.os }} | ||||||
|  |  | ||||||
|  |   release: | ||||||
|  |     runs-on: ubuntu-latest | ||||||
|  |     if: startsWith(github.ref, 'refs/tags/v') | ||||||
|  |     needs: build | ||||||
|  |     steps: | ||||||
|  |  | ||||||
|  |       - name: Download binaries | ||||||
|  |         uses: actions/download-artifact@v4 | ||||||
|  |         with: | ||||||
|  |           pattern: pferd-* | ||||||
|  |           merge-multiple: true | ||||||
|  |  | ||||||
|  |       - name: Rename binaries | ||||||
|  |         run: | | ||||||
|  |           mv pferd-ubuntu-latest pferd-linux | ||||||
|  |           mv pferd-windows-latest pferd-windows.exe | ||||||
|  |           mv pferd-macos-13 pferd-mac-x86_64 | ||||||
|  |           mv pferd-macos-latest pferd-mac | ||||||
|  |  | ||||||
|  |       - name: Create release | ||||||
|  |         uses: softprops/action-gh-release@v2 | ||||||
|  |         env: | ||||||
|  |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |         with: | ||||||
|  |           files: | | ||||||
|  |             pferd-linux | ||||||
|  |             pferd-windows.exe | ||||||
|  |             pferd-mac | ||||||
|  |             pferd-mac-x86_64 | ||||||
							
								
								
									
										19
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										19
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,12 +1,11 @@ | |||||||
| # python stuff | .mypy_cache/ | ||||||
|  | /.venv/ | ||||||
|  | /PFERD.egg-info/ | ||||||
| __pycache__/ | __pycache__/ | ||||||
|  | /.vscode/ | ||||||
|  | /.idea/ | ||||||
|  |  | ||||||
| # venv stuff | # pyinstaller | ||||||
| bin/ | /pferd.spec | ||||||
| include/ | /build/ | ||||||
| lib/ | /dist/ | ||||||
| lib64 |  | ||||||
| pyvenv.cfg |  | ||||||
|  |  | ||||||
| .tmp/ |  | ||||||
| pip-selfcheck.json |  | ||||||
|   | |||||||
							
								
								
									
										290
									
								
								CHANGELOG.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										290
									
								
								CHANGELOG.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,290 @@ | |||||||
|  | # Changelog | ||||||
|  |  | ||||||
|  | All notable changes to this project will be documented in this file. The format | ||||||
|  | is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). | ||||||
|  |  | ||||||
|  | This project has its own custom versioning scheme. Version numbers consist of | ||||||
|  | three parts (e. g. `3.1.5`). | ||||||
|  | - The first number is increased on major rewrites or changes. What classifies as | ||||||
|  |   a major change is up to the maintainers. This is pretty rare and a PFERD | ||||||
|  |   version 4 should hopefully not be necessary. | ||||||
|  | - The second number is increased on backwards-incompatible changes in behaviour. | ||||||
|  |   This refers to any change that would make an existing setup behave differently | ||||||
|  |   (e. g. renaming options or changing crawler behaviour). If this number is | ||||||
|  |   increased, it may be necessary for you to adapt your own setup. | ||||||
|  | - The third number is increased on backwards-compatible changes (e. g. adding | ||||||
|  |   new options or commands, changing documentation, fixing bugs). Updates that | ||||||
|  |   only increase this number should be safe and not require manual intervention. | ||||||
|  |  | ||||||
|  | We will try to correctly classify changes as backwards-compatible or | ||||||
|  | backwards-incompatible, but may occasionally make mistakes or stumble across | ||||||
|  | ambiguous situations. | ||||||
|  |  | ||||||
|  | ## Unreleased | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Crawling of exercises with instructions | ||||||
|  |  | ||||||
|  | ## 3.8.2 - 2025-04-29 | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Explicitly mention that wikis are not supported at the moment and ignore them | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Ilias-native login | ||||||
|  | - Exercise crawling | ||||||
|  |  | ||||||
|  | ## 3.8.1 - 2025-04-17 | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Description html files now specify a UTF-8 encoding | ||||||
|  | - Images in descriptions now always have a white background | ||||||
|  |  | ||||||
|  | ## 3.8.0 - 2025-04-16 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Support for ILIAS 9 | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Added prettier CSS to forum threads | ||||||
|  | - Downloaded forum threads now link to the forum instead of the ILIAS thread | ||||||
|  | - Increase minimum supported Python version to 3.11 | ||||||
|  | - Do not crawl nested courses (courses linked in other courses) | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - File links in report on Windows | ||||||
|  | - TOTP authentication in KIT Shibboleth | ||||||
|  | - Forum crawling only considering the first 20 entries | ||||||
|  |  | ||||||
|  | ## 3.7.0 - 2024-11-13 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Support for MOB videos in page descriptions | ||||||
|  | - Clickable links in the report to directly open new/modified/not-deleted files | ||||||
|  | - Support for non KIT shibboleth login | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Remove videos from description pages | ||||||
|  | - Perform ILIAS cycle detection after processing the transform to allow | ||||||
|  |   ignoring duplicated elements | ||||||
|  | - Parse headings (h1-h3) as folders in kit-ipd crawler | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Personal desktop/dashboard/favorites crawling | ||||||
|  | - Crawling of nested courses | ||||||
|  | - Downloading of links with no target URL | ||||||
|  | - Handle row flex on description pages | ||||||
|  | - Add `<!DOCTYPE html>` heading to forum threads to fix mime type detection | ||||||
|  | - Handle groups in cards | ||||||
|  |  | ||||||
|  | ## 3.6.0 - 2024-10-23 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Generic `ilias-web` crawler and `ilias-web` CLI command | ||||||
|  | - Support for the course overview page. Using this URL as a target might cause | ||||||
|  |   duplication warnings, as subgroups are listed separately. | ||||||
|  | - Support for named capture groups in regex transforms | ||||||
|  | - Crawl custom item groups as folders | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Normalization of meeting names in cards | ||||||
|  | - Sanitization of slashes in exercise container names | ||||||
|  |  | ||||||
|  | ## 3.5.2 - 2024-04-14 | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Crawling of personal desktop with ILIAS 8 | ||||||
|  | - Crawling of empty personal desktops | ||||||
|  |  | ||||||
|  | ## 3.5.1 - 2024-04-09 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Support for ILIAS 8 | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Video name deduplication | ||||||
|  |  | ||||||
|  | ## 3.5.0 - 2023-09-13 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - `no-delete-prompt-override` conflict resolution strategy | ||||||
|  | - Support for ILIAS learning modules | ||||||
|  | - `show_not_deleted` option to stop printing the "Not Deleted" status or report | ||||||
|  |   message. This combines nicely with the `no-delete-prompt-override` strategy, | ||||||
|  |   causing PFERD to mostly ignore local-only files. | ||||||
|  | - Support for mediacast video listings | ||||||
|  | - Crawling of files in info tab | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Remove size suffix for files in content pages | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Crawling of courses with the timeline view as the default tab | ||||||
|  | - Crawling of file and custom opencast cards | ||||||
|  | - Crawling of button cards without descriptions | ||||||
|  | - Abort crawling when encountering an unexpected ilias root page redirect | ||||||
|  | - Sanitize ascii control characters on Windows | ||||||
|  | - Crawling of paginated past meetings | ||||||
|  | - Ignore SCORM learning modules | ||||||
|  |  | ||||||
|  | ## 3.4.3 - 2022-11-29 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Missing documentation for `forums` option | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Clear up error message shown when multiple paths are found to an element | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - IPD crawler unnecessarily appending trailing slashes | ||||||
|  | - Crawling opencast when ILIAS is set to English | ||||||
|  |  | ||||||
|  | ## 3.4.2 - 2022-10-26 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Recognize and crawl content pages in cards | ||||||
|  | - Recognize and ignore surveys | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Forum crawling crashing when a thread has no messages at all | ||||||
|  | - Forum crawling crashing when a forum has no threads at all | ||||||
|  | - Ilias login failing in some cases | ||||||
|  | - Crawling of paginated future meetings | ||||||
|  | - IPD crawler handling of URLs without trailing slash | ||||||
|  |  | ||||||
|  | ## 3.4.1 - 2022-08-17 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Download of page descriptions | ||||||
|  | - Forum download support | ||||||
|  | - `pass` authenticator | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Add `cpp` extension to default `link_regex` of IPD crawler | ||||||
|  | - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option | ||||||
|  | - Simplify default IPD crawler `link_regex` | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - IPD crawler crashes on some sites | ||||||
|  | - Meeting name normalization for yesterday, today and tomorrow | ||||||
|  | - Crawling of meeting file previews | ||||||
|  | - Login with new login button html layout | ||||||
|  | - Descriptions for courses are now placed in the correct subfolder when | ||||||
|  |   downloading the whole desktop | ||||||
|  |  | ||||||
|  | ## 3.4.0 - 2022-05-01 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Message when Shibboleth entitlements need to be manually reviewed | ||||||
|  | - Links to unofficial packages and repology in the readme | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Increase minimum supported Python version to 3.9 | ||||||
|  | - Support video listings with more columns | ||||||
|  | - Use UTF-8 when reading/writing the config file | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Crash during authentication when the Shibboleth session is still valid | ||||||
|  |  | ||||||
|  | ## 3.3.1 - 2022-01-15 | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - ILIAS login | ||||||
|  | - Local video cache if `windows_paths` is enabled | ||||||
|  |  | ||||||
|  | ## 3.3.0 - 2022-01-09 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - A KIT IPD crawler | ||||||
|  | - Support for ILIAS cards | ||||||
|  | - (Rudimentary) support for content pages | ||||||
|  | - Support for multi-stream videos | ||||||
|  | - Support for ILIAS 7 | ||||||
|  |  | ||||||
|  | ### Removed | ||||||
|  | - [Interpolation](https://docs.python.org/3/library/configparser.html#interpolation-of-values) in config file | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Crawling of recursive courses | ||||||
|  | - Crawling files directly placed on the personal desktop | ||||||
|  | - Ignore timestamps at the unix epoch as they crash on windows | ||||||
|  |  | ||||||
|  | ## 3.2.0 - 2021-08-04 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - `--skip` command line option | ||||||
|  | - Support for ILIAS booking objects | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Using multiple path segments on left side of `-name->` now results in an | ||||||
|  |   error. This was already forbidden by the documentation but silently accepted | ||||||
|  |   by PFERD. | ||||||
|  | - More consistent path printing in some `--explain` messages | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Nondeterministic name deduplication due to ILIAS reordering elements | ||||||
|  | - More exceptions are handled properly | ||||||
|  |  | ||||||
|  | ## 3.1.0 - 2021-06-13 | ||||||
|  |  | ||||||
|  | If your config file doesn't do weird things with transforms, it should continue | ||||||
|  | to work. If your `-re->` arrows behave weirdly, try replacing them with | ||||||
|  | `-exact-re->` arrows. If you're on Windows, you might need to switch from `\` | ||||||
|  | path separators to `/` in your regex rules. | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - `skip` option for crawlers | ||||||
|  | - Rules with `>>` instead of `>` as arrow head | ||||||
|  | - `-exact-re->` arrow (behaves like `-re->` did previously) | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - The `-re->` arrow can now rename directories (like `-->`) | ||||||
|  | - Use `/` instead of `\` as path separator for (regex) rules on Windows | ||||||
|  | - Use the label to the left for exercises instead of the button name to | ||||||
|  |   determine the folder name | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Video pagination handling in ILIAS crawler | ||||||
|  |  | ||||||
|  | ## 3.0.1 - 2021-06-01 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - `credential-file` authenticator | ||||||
|  | - `--credential-file` option for `kit-ilias-web` command | ||||||
|  | - Warning if using concurrent tasks with `kit-ilias-web` | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Cookies are now stored in a text-based format | ||||||
|  |  | ||||||
|  | ### Fixed | ||||||
|  | - Date parsing now also works correctly in non-group exercises | ||||||
|  |  | ||||||
|  | ## 3.0.0 - 2021-05-31 | ||||||
|  |  | ||||||
|  | ### Added | ||||||
|  | - Proper config files | ||||||
|  | - Concurrent crawling | ||||||
|  | - Crawl external ILIAS links | ||||||
|  | - Crawl uploaded exercise solutions | ||||||
|  | - Explain what PFERD is doing and why (`--explain`) | ||||||
|  | - More control over output (`--status`, `--report`) | ||||||
|  | - Debug transform rules with `--debug-transforms` | ||||||
|  | - Print report after exiting via Ctrl+C | ||||||
|  | - Store crawler reports in `.report` JSON file | ||||||
|  | - Extensive config file documentation (`CONFIG.md`) | ||||||
|  | - Documentation for developers (`DEV.md`) | ||||||
|  | - This changelog | ||||||
|  |  | ||||||
|  | ### Changed | ||||||
|  | - Rewrote almost everything | ||||||
|  | - Better error messages | ||||||
|  | - Redesigned CLI | ||||||
|  | - Redesigned transform rules | ||||||
|  | - ILIAS crawling logic (paths may be different) | ||||||
|  | - Better support for weird paths on Windows | ||||||
|  | - Set user agent (`PFERD/<version>`) | ||||||
|  |  | ||||||
|  | ### Removed | ||||||
|  | - Backwards compatibility with 2.x | ||||||
|  | - Python files as config files | ||||||
|  | - Some types of crawlers | ||||||
							
								
								
									
										537
									
								
								CONFIG.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										537
									
								
								CONFIG.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,537 @@ | |||||||
|  | # Config file format | ||||||
|  |  | ||||||
|  | A config file consists of sections. A section begins with a `[section]` header, | ||||||
|  | which is followed by a list of `key = value` pairs. Comments must be on their | ||||||
|  | own line and start with `#`. Multiline values must be indented beyond their key. | ||||||
|  | Boolean values can be `yes` or `no`. For more details and some examples on the | ||||||
|  | format, see the [configparser documentation][cp-file] | ||||||
|  | ([interpolation][cp-interp] is disabled). | ||||||
|  |  | ||||||
|  | [cp-file]: <https://docs.python.org/3/library/configparser.html#supported-ini-file-structure> "Supported INI File Structure" | ||||||
|  | [cp-interp]: <https://docs.python.org/3/library/configparser.html#interpolation-of-values> "Interpolation of values" | ||||||
|  |  | ||||||
|  | ## The `DEFAULT` section | ||||||
|  |  | ||||||
|  | This section contains global configuration values. It can also be used to set | ||||||
|  | default values for the other sections. | ||||||
|  |  | ||||||
|  | - `working_dir`: The directory PFERD operates in. Set to an absolute path to | ||||||
|  |   make PFERD operate the same regardless of where it is executed from. All other | ||||||
|  |   paths in the config file are interpreted relative to this path. If this path | ||||||
|  |   is relative, it is interpreted relative to the script's working dir. `~` is | ||||||
|  |   expanded to the current user's home directory. (Default: `.`) | ||||||
|  | - `explain`: Whether PFERD should log and explain its actions and decisions in | ||||||
|  |   detail. (Default: `no`) | ||||||
|  | - `status`: Whether PFERD should print status updates (like `Crawled ...`, | ||||||
|  |   `Added ...`) while running a crawler. (Default: `yes`) | ||||||
|  | - `report`: Whether PFERD should print a report of added, changed and deleted | ||||||
|  |    local files for all crawlers before exiting. (Default: `yes`) | ||||||
|  | - `show_not_deleted`: Whether PFERD should print messages in status and report | ||||||
|  |    when a local-only file wasn't deleted. Combines nicely with the | ||||||
|  |    `no-delete-prompt-overwrite` conflict resolution strategy. | ||||||
|  | - `share_cookies`: Whether crawlers should share cookies where applicable. For | ||||||
|  |   example, some crawlers share cookies if they crawl the same website using the | ||||||
|  |   same account. (Default: `yes`) | ||||||
|  |  | ||||||
|  | ## The `crawl:*` sections | ||||||
|  |  | ||||||
|  | Sections whose names start with `crawl:` are used to configure crawlers. The | ||||||
|  | rest of the section name specifies the name of the crawler. | ||||||
|  |  | ||||||
|  | A crawler synchronizes a remote resource to a local directory. There are | ||||||
|  | different types of crawlers for different kinds of resources, e.g. ILIAS | ||||||
|  | courses or lecture websites. | ||||||
|  |  | ||||||
|  | Each crawl section represents an instance of a specific type of crawler. The | ||||||
|  | `type` option is used to specify the crawler type. The crawler's name is usually | ||||||
|  | used as the output directory. New crawlers can be created simply by adding a new | ||||||
|  | crawl section to the config file. | ||||||
|  |  | ||||||
|  | Depending on a crawler's type, it may have different options. For more details, | ||||||
|  | see the type's [documentation](#crawler-types) below. The following options are | ||||||
|  | common to all crawlers: | ||||||
|  |  | ||||||
|  | - `type`: The available types are specified in [this section](#crawler-types). | ||||||
|  | - `skip`: Whether the crawler should be skipped during normal execution. The | ||||||
|  |   crawler can still be executed manually using the `--crawler` or `-C` flags. | ||||||
|  |   (Default: `no`) | ||||||
|  | - `output_dir`: The directory the crawler synchronizes files to. A crawler will | ||||||
|  |   never place any files outside this directory. (Default: the crawler's name) | ||||||
|  | - `redownload`: When to download a file that is already present locally. | ||||||
|  |   (Default: `never-smart`) | ||||||
|  |     - `never`: If a file is present locally, it is not downloaded again. | ||||||
|  |     - `never-smart`: Like `never`, but PFERD tries to detect if an already | ||||||
|  |       downloaded file has changed via some (unreliable) heuristics. | ||||||
|  |     - `always`: All files are always downloaded, regardless of whether they are | ||||||
|  |       already present locally. | ||||||
|  |     - `always-smart`: Like `always`, but PFERD tries to avoid unnecessary | ||||||
|  |       downloads via some (unreliable) heuristics. | ||||||
|  | - `on_conflict`: What to do when the local and remote versions of a file or | ||||||
|  |   directory differ, including when a file is replaced by a directory or a | ||||||
|  |   directory by a file. (Default: `prompt`) | ||||||
|  |     - `prompt`: Always ask the user before overwriting or deleting local files | ||||||
|  |       and directories. | ||||||
|  |     - `local-first`: Always keep the local file or directory. Equivalent to | ||||||
|  |       using `prompt` and always choosing "no". Implies that `redownload` is set | ||||||
|  |       to `never`. | ||||||
|  |     - `remote-first`: Always keep the remote file or directory. Equivalent to | ||||||
|  |       using `prompt` and always choosing "yes". | ||||||
|  |     - `no-delete`: Never delete local files, but overwrite local files if the | ||||||
|  |       remote file is different. | ||||||
|  |     - `no-delete-prompt-overwrite`: Never delete local files, but prompt to | ||||||
|  |       overwrite local files if the remote file is different. Combines nicely | ||||||
|  |       with the `show_not_deleted` option. | ||||||
|  | - `transform`: Rules for renaming and excluding certain files and directories. | ||||||
|  |   For more details, see [this section](#transformation-rules). (Default: empty) | ||||||
|  | - `tasks`: The maximum number of concurrent tasks (such as crawling or | ||||||
|  |   downloading). (Default: `1`) | ||||||
|  | - `downloads`: How many of those tasks can be download tasks at the same time. | ||||||
|  |   Must not be greater than `tasks`. (Default: Same as `tasks`) | ||||||
|  | - `task_delay`: Time (in seconds) that the crawler should wait between | ||||||
|  |   subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary | ||||||
|  |   load for the crawl target. (Default: `0.0`) | ||||||
|  | - `windows_paths`: Whether PFERD should find alternative names for paths that | ||||||
|  |   are invalid on Windows. (Default: `yes` on Windows, `no` otherwise) | ||||||
|  |  | ||||||
|  | Some crawlers may also require credentials for authentication. To configure how | ||||||
|  | the crawler obtains its credentials, the `auth` option is used. It is set to the | ||||||
|  | full name of an auth section (including the `auth:` prefix). | ||||||
|  |  | ||||||
|  | Here is a simple example: | ||||||
|  |  | ||||||
|  | ```ini | ||||||
|  | [auth:example] | ||||||
|  | type = simple | ||||||
|  | username = foo | ||||||
|  | password = bar | ||||||
|  |  | ||||||
|  | [crawl:something] | ||||||
|  | type = some-complex-crawler | ||||||
|  | auth = auth:example | ||||||
|  | on_conflict = no-delete | ||||||
|  | tasks = 3 | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | ## The `auth:*` sections | ||||||
|  |  | ||||||
|  | Sections whose names start with `auth:` are used to configure authenticators. An | ||||||
|  | authenticator provides a username and a password to one or more crawlers. | ||||||
|  |  | ||||||
|  | Authenticators work similarly to crawlers: A section represents an authenticator | ||||||
|  | instance whose name is the rest of the section name. The type is specified by | ||||||
|  | the `type` option. | ||||||
|  |  | ||||||
|  | Depending on an authenticator's type, it may have different options. For more | ||||||
|  | details, see the type's [documentation](#authenticator-types) below. The only | ||||||
|  | option common to all authenticators is `type`: | ||||||
|  |  | ||||||
|  | - `type`: The types are specified in [this section](#authenticator-types). | ||||||
|  |  | ||||||
|  | ## Crawler types | ||||||
|  |  | ||||||
|  | ### The `local` crawler | ||||||
|  |  | ||||||
|  | This crawler crawls a local directory. It is really simple and mostly useful for | ||||||
|  | testing different setups. The various delay options are meant to make the | ||||||
|  | crawler simulate a slower, network-based crawler. | ||||||
|  |  | ||||||
|  | - `target`: Path to the local directory to crawl. (Required) | ||||||
|  | - `crawl_delay`: Artificial delay (in seconds) to simulate for crawl requests. | ||||||
|  |   (Default: `0.0`) | ||||||
|  | - `download_delay`: Artificial delay (in seconds) to simulate for download | ||||||
|  |   requests. (Default: `0.0`) | ||||||
|  | - `download_speed`: Download speed (in bytes per second) to simulate. (Optional) | ||||||
|  |  | ||||||
|  | ### The `kit-ipd` crawler | ||||||
|  |  | ||||||
|  | This crawler crawls a KIT-IPD page by url. The root page can be crawled from | ||||||
|  | outside the KIT network so you will be informed about any new/deleted files, | ||||||
|  | but downloading files requires you to be within. Adding a short delay between | ||||||
|  | requests is likely a good idea. | ||||||
|  |  | ||||||
|  | - `target`: URL to a KIT-IPD page | ||||||
|  | - `link_regex`: A regex that is matched against the `href` part of links. If it | ||||||
|  |   matches, the given link is downloaded as a file. This is used to extract | ||||||
|  |   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`) | ||||||
|  |  | ||||||
|  | ### The `ilias-web` crawler | ||||||
|  |  | ||||||
|  | This crawler crawls a generic ILIAS instance. | ||||||
|  |  | ||||||
|  | Inspired by [this ILIAS downloader][ilias-dl], the following configurations should work | ||||||
|  | out of the box for the corresponding universities: | ||||||
|  |  | ||||||
|  | [ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs" | ||||||
|  |  | ||||||
|  | | University    | `base_url`                              | `login_type` | `client_id`   | | ||||||
|  | |---------------|-----------------------------------------|--------------|---------------| | ||||||
|  | | FH Aachen     | https://www.ili.fh-aachen.de            | local        | elearning     | | ||||||
|  | | Uni Köln      | https://www.ilias.uni-koeln.de/ilias    | local        | uk            | | ||||||
|  | | Uni Konstanz  | https://ilias.uni-konstanz.de           | local        | ILIASKONSTANZ | | ||||||
|  | | Uni Stuttgart | https://ilias3.uni-stuttgart.de         | local        | Uni_Stuttgart | | ||||||
|  | | Uni Tübingen  | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth   |               | | ||||||
|  |  | ||||||
|  | If your university isn't listed, try navigating to your instance's login page. | ||||||
|  | Assuming no custom login service is used, the URL will look something like this: | ||||||
|  |  | ||||||
|  | ```jinja | ||||||
|  | {{ base_url }}/login.php?client_id={{ client_id }}&cmd=force_login&lang= | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | If the values work, feel free to submit a PR and add them to the table above. | ||||||
|  |  | ||||||
|  | - `base_url`: The URL where the ILIAS instance is located. (Required) | ||||||
|  | - `login_type`: How you authenticate. (Required) | ||||||
|  |     - `local`: Use `client_id` for authentication. | ||||||
|  |     - `shibboleth`: Use shibboleth for authentication. | ||||||
|  | - `client_id`: An ID used for authentication if `login_type` is `local`. Is | ||||||
|  |   ignored if `login_type` is `shibboleth`. | ||||||
|  | - `target`: The ILIAS element to crawl. (Required) | ||||||
|  |     - `desktop`: Crawl your personal desktop / dashboard | ||||||
|  |     - `<course id>`: Crawl the course with the given id | ||||||
|  |     - `<url>`: Crawl a given element by URL (preferably the permanent URL linked | ||||||
|  |       at the bottom of its ILIAS page).   | ||||||
|  |       This also supports the "My Courses" overview page to download *all* | ||||||
|  |       courses. Note that this might produce confusing local directory layouts | ||||||
|  |       and duplication warnings if you are a member of an ILIAS group. The | ||||||
|  |       `desktop` target is generally preferable. | ||||||
|  | - `auth`: Name of auth section to use for login. (Required) | ||||||
|  | - `tfa_auth`: Name of auth section to use for two-factor authentication. Only | ||||||
|  |   uses the auth section's password. (Default: Anonymous `tfa` authenticator) | ||||||
|  | - `links`: How to represent external links. (Default: `fancy`) | ||||||
|  |     - `ignore`: Don't download links. | ||||||
|  |     - `plaintext`: A text file containing only the URL. | ||||||
|  |     - `fancy`: A HTML file looking like the ILIAS link element. | ||||||
|  |     - `internet-shortcut`: An internet shortcut file (`.url` file). | ||||||
|  | - `link_redirect_delay`: Time (in seconds) until `fancy` link files will | ||||||
|  |   redirect to the actual URL. Set to a negative value to disable the automatic | ||||||
|  |   redirect. (Default: `-1`) | ||||||
|  | - `videos`: Whether to download videos. (Default: `no`) | ||||||
|  | - `forums`: Whether to download forum threads. (Default: `no`) | ||||||
|  | - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: | ||||||
|  |   `20.0`) | ||||||
|  |  | ||||||
|  | ### The `kit-ilias-web` crawler | ||||||
|  |  | ||||||
|  | This crawler crawls the KIT ILIAS instance. | ||||||
|  |  | ||||||
|  | ILIAS is not great at handling too many concurrent requests. To avoid | ||||||
|  | unnecessary load, please limit `tasks` to `1`. | ||||||
|  |  | ||||||
|  | There is a spike in ILIAS usage at the beginning of lectures, so please don't | ||||||
|  | run PFERD during those times. | ||||||
|  |  | ||||||
|  | If you're automatically running PFERD periodically (e. g. via cron or a systemd | ||||||
|  | timer), please randomize the start time or at least don't use the full hour. For | ||||||
|  | systemd timers, this can be accomplished using the `RandomizedDelaySec` option. | ||||||
|  | Also, please schedule the script to run in periods of low activity. Running the | ||||||
|  | script once per day should be fine. | ||||||
|  |  | ||||||
|  | - `target`: The ILIAS element to crawl. (Required) | ||||||
|  |     - `desktop`: Crawl your personal desktop | ||||||
|  |     - `<course id>`: Crawl the course with the given id | ||||||
|  |     - `<url>`: Crawl a given element by URL (preferably the permanent URL linked | ||||||
|  |       at the bottom of its ILIAS page) | ||||||
|  | - `auth`: Name of auth section to use for login. (Required) | ||||||
|  | - `tfa_auth`: Name of auth section to use for two-factor authentication. Only | ||||||
|  |   uses the auth section's password. (Default: Anonymous `tfa` authenticator) | ||||||
|  | - `links`: How to represent external links. (Default: `fancy`) | ||||||
|  |     - `ignore`: Don't download links. | ||||||
|  |     - `plaintext`: A text file containing only the URL. | ||||||
|  |     - `fancy`: A HTML file looking like the ILIAS link element. | ||||||
|  |     - `internet-shortcut`: An internet shortcut file (`.url` file). | ||||||
|  | - `link_redirect_delay`: Time (in seconds) until `fancy` link files will | ||||||
|  |   redirect to the actual URL. Set to a negative value to disable the automatic | ||||||
|  |   redirect. (Default: `-1`) | ||||||
|  | - `videos`: Whether to download videos. (Default: `no`) | ||||||
|  | - `forums`: Whether to download forum threads. (Default: `no`) | ||||||
|  | - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: | ||||||
|  |   `20.0`) | ||||||
|  |  | ||||||
|  | ## Authenticator types | ||||||
|  |  | ||||||
|  | ### The `simple` authenticator | ||||||
|  |  | ||||||
|  | With this authenticator, the username and password can be set directly in the | ||||||
|  | config file. If the username or password are not specified, the user is prompted | ||||||
|  | via the terminal. | ||||||
|  |  | ||||||
|  | - `username`: The username. (Optional) | ||||||
|  | - `password`: The password. (Optional) | ||||||
|  |  | ||||||
|  | ### The `credential-file` authenticator | ||||||
|  |  | ||||||
|  | This authenticator reads a username and a password from a credential file. | ||||||
|  |  | ||||||
|  | - `path`: Path to the credential file. (Required) | ||||||
|  |  | ||||||
|  | The credential file has exactly two lines (trailing newline optional). The first | ||||||
|  | line starts with `username=` and contains the username, the second line starts | ||||||
|  | with `password=` and contains the password. The username and password may | ||||||
|  | contain any characters except a line break. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | username=AzureDiamond | ||||||
|  | password=hunter2 | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | ### The `keyring` authenticator | ||||||
|  |  | ||||||
|  | This authenticator uses the system keyring to store passwords. The username can | ||||||
|  | be set directly in the config file. If the username is not specified, the user | ||||||
|  | is prompted via the terminal. If the keyring contains no entry or the entry is | ||||||
|  | incorrect, the user is prompted for a password via the terminal and the password | ||||||
|  | is stored in the keyring. | ||||||
|  |  | ||||||
|  | - `username`: The username. (Optional) | ||||||
|  | - `keyring_name`: The service name PFERD uses for storing credentials. (Default: | ||||||
|  |   `PFERD`) | ||||||
|  |  | ||||||
|  | ### The `pass` authenticator | ||||||
|  |  | ||||||
|  | This authenticator queries the [`pass` password manager][pass] for a username | ||||||
|  | and password. It tries to be mostly compatible with [browserpass][browserpass] | ||||||
|  | and [passff][passff], so see those links for an overview of the format. If PFERD | ||||||
|  | fails to load your password, you can use the `--explain` flag to see why. | ||||||
|  |  | ||||||
|  | - `passname`: The name of the password to use (Required) | ||||||
|  | - `username_prefixes`: A comma-separated list of username line prefixes | ||||||
|  |   (Default: `login,username,user`) | ||||||
|  | - `password_prefixes`: A comma-separated list of password line prefixes | ||||||
|  |   (Default: `password,pass,secret`) | ||||||
|  |  | ||||||
|  | [pass]: <https://www.passwordstore.org/> "Pass: The Standard Unix Password Manager" | ||||||
|  | [browserpass]: <https://github.com/browserpass/browserpass-extension#organizing-password-store> "Organizing password store" | ||||||
|  | [passff]: <https://github.com/passff/passff#multi-line-format> "Multi-line format" | ||||||
|  |  | ||||||
|  | ### The `tfa` authenticator | ||||||
|  |  | ||||||
|  | This authenticator prompts the user on the console for a two-factor | ||||||
|  | authentication token. The token is provided as password and it is not cached. | ||||||
|  | This authenticator does not support usernames. | ||||||
|  |  | ||||||
|  | ## Transformation rules | ||||||
|  |  | ||||||
|  | Transformation rules are rules for renaming and excluding files and directories. | ||||||
|  | They are specified line-by-line in a crawler's `transform` option. When a | ||||||
|  | crawler needs to apply a rule to a path, it goes through this list top-to-bottom | ||||||
|  | and applies the first matching rule. | ||||||
|  |  | ||||||
|  | To see this process in action, you can use the `--debug-transforms` flag or | ||||||
|  | the `--explain` flag. | ||||||
|  |  | ||||||
|  | Each rule has the format `SOURCE ARROW TARGET` (e. g. `foo/bar --> foo/baz`). | ||||||
|  | The arrow specifies how the source and target are interpreted. The different | ||||||
|  | kinds of arrows are documented below. | ||||||
|  |  | ||||||
|  | `SOURCE` and `TARGET` are either a bunch of characters without spaces (e. g. | ||||||
|  | `foo/bar`) or string literals (e. g. `"foo/b a r"`). The former syntax has no | ||||||
|  | concept of escaping characters, so the backslash is just another character. The | ||||||
|  | string literals however support Python's escape syntax (e. g. | ||||||
|  | `"foo\\bar\tbaz"`). This also means that in string literals, backslashes must be | ||||||
|  | escaped. | ||||||
|  |  | ||||||
|  | `TARGET` can additionally be a single exclamation mark `!` (*not* `"!"`). When a | ||||||
|  | rule with a `!` as target matches a path, the corresponding file or directory is | ||||||
|  | ignored by the crawler instead of renamed. | ||||||
|  |  | ||||||
|  | `TARGET` can also be omitted entirely. When a rule without target matches a | ||||||
|  | path, the path is returned unmodified. This is useful to prevent rules further | ||||||
|  | down from matching instead. | ||||||
|  |  | ||||||
|  | Each arrow's behaviour can be modified slightly by changing the arrow's head | ||||||
|  | from `>` to `>>`. When a rule with a `>>` arrow head matches a path, it doesn't | ||||||
|  | return immediately like a normal arrow. Instead, it replaces the current path | ||||||
|  | with its output and continues on to the next rule. In effect, this means that | ||||||
|  | multiple rules can be applied sequentially. | ||||||
|  |  | ||||||
|  | ### The `-->` arrow | ||||||
|  |  | ||||||
|  | The `-->` arrow is a basic renaming operation for files and directories. If a | ||||||
|  | path matches `SOURCE`, it is renamed to `TARGET`. | ||||||
|  |  | ||||||
|  | Example: `foo/bar --> baz` | ||||||
|  | - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||||
|  | - Converts `foo/bar` into `baz` | ||||||
|  | - Converts `foo/bar/wargl` into `baz/wargl` | ||||||
|  |  | ||||||
|  | Example: `foo/bar --> !` | ||||||
|  | - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||||
|  | - Ignores `foo/bar` and any of its children | ||||||
|  |  | ||||||
|  | ### The `-name->` arrow | ||||||
|  |  | ||||||
|  | The `-name->` arrow lets you rename files and directories by their name, | ||||||
|  | regardless of where they appear in the file tree. Because of this, its `SOURCE` | ||||||
|  | must not contain multiple path segments, only a single name. This restriction | ||||||
|  | does not apply to its `TARGET`. | ||||||
|  |  | ||||||
|  | Example: `foo -name-> bar/baz` | ||||||
|  | - Doesn't match `a/foobar/b` or `x/Foo/y/z` | ||||||
|  | - Converts `hello/foo` into `hello/bar/baz` | ||||||
|  | - Converts `foo/world` into `bar/baz/world` | ||||||
|  | - Converts `a/foo/b/c/foo` into `a/bar/baz/b/c/bar/baz` | ||||||
|  |  | ||||||
|  | Example: `foo -name-> !` | ||||||
|  | - Doesn't match `a/foobar/b` or `x/Foo/y/z` | ||||||
|  | - Ignores any path containing a segment `foo` | ||||||
|  |  | ||||||
|  | ### The `-exact->` arrow | ||||||
|  |  | ||||||
|  | The `-exact->` arrow requires the path to match `SOURCE` exactly. The examples | ||||||
|  | below show why this is useful. | ||||||
|  |  | ||||||
|  | Example: `foo/bar -exact-> baz` | ||||||
|  | - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||||
|  | - Converts `foo/bar` into `baz` | ||||||
|  | - Doesn't match `foo/bar/wargl` | ||||||
|  |  | ||||||
|  | Example: `foo/bar -exact-> !` | ||||||
|  | - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||||
|  | - Ignores only `foo/bar`, not its children | ||||||
|  |  | ||||||
|  | ### The `-re->` arrow | ||||||
|  |  | ||||||
|  | The `-re->` arrow is like the `-->` arrow but with regular expressions. `SOURCE` | ||||||
|  | is a regular expression and `TARGET` an f-string based template. If a path | ||||||
|  | matches `SOURCE`, the output path is created using `TARGET` as template. | ||||||
|  | `SOURCE` is automatically anchored. | ||||||
|  |  | ||||||
|  | `TARGET` uses Python's [format string syntax][6]. The *n*-th capturing group can | ||||||
|  | be referred to as `{g<n>}` (e.g. `{g3}`). `{g0}` refers to the original path. | ||||||
|  | If capturing group *n*'s contents are a valid integer, the integer value is | ||||||
|  | available as `{i<n>}` (e.g. `{i3}`). If capturing group *n*'s contents are a | ||||||
|  | valid float, the float value is available as `{f<n>}` (e.g. `{f3}`). Named capture | ||||||
|  | groups (e.g. `(?P<name>)`) are available by their name (e.g. `{name}`). If a | ||||||
|  | capturing group is not present (e.g. when matching the string `cd` with the | ||||||
|  | regex `(ab)?cd`), the corresponding variables are not defined. | ||||||
|  |  | ||||||
|  | Python's format string syntax has rich options for formatting its arguments. For | ||||||
|  | example, to left-pad the capturing group 3 with the digit `0` to width 5, you | ||||||
|  | can use `{i3:05}`. | ||||||
|  |  | ||||||
|  | PFERD even allows you to write entire expressions inside the curly braces, for | ||||||
|  | example `{g2.lower()}` or `{g3.replace(' ', '_')}`. | ||||||
|  |  | ||||||
|  | Example: `f(oo+)/be?ar -re-> B{g1.upper()}H/fear` | ||||||
|  | - Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` | ||||||
|  | - Converts `foo/bar` into `BOOH/fear` | ||||||
|  | - Converts `fooooo/bear` into `BOOOOOH/fear` | ||||||
|  | - Converts `foo/bar/baz` into `BOOH/fear/baz` | ||||||
|  |  | ||||||
|  | [6]: <https://docs.python.org/3/library/string.html#format-string-syntax> "Format String Syntax" | ||||||
|  |  | ||||||
|  | ### The `-name-re->` arrow | ||||||
|  |  | ||||||
|  | The `-name-re->` arrow is like a combination of the `-name->` and `-re->` arrows. | ||||||
|  |  | ||||||
|  | Example: `(.*)\.jpeg -name-re-> {g1}.jpg` | ||||||
|  | - Doesn't match `foo/bar.png`, `baz.JPEG` or `hello,jpeg` | ||||||
|  | - Converts `foo/bar.jpeg` into `foo/bar.jpg` | ||||||
|  | - Converts `foo.jpeg/bar/baz.jpeg` into `foo.jpg/bar/baz.jpg` | ||||||
|  |  | ||||||
|  | Example: `\..+ -name-re-> !` | ||||||
|  | - Doesn't match `.`, `test`, `a.b` | ||||||
|  | - Ignores all files and directories starting with `.`. | ||||||
|  |  | ||||||
|  | ### The `-exact-re->` arrow | ||||||
|  |  | ||||||
|  | The `-exact-re->` arrow is like a combination of the `-exact->` and `-re->` | ||||||
|  | arrows. | ||||||
|  |  | ||||||
|  | Example: `f(oo+)/be?ar -exact-re-> B{g1.upper()}H/fear` | ||||||
|  | - Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` | ||||||
|  | - Converts `foo/bar` into `BOOH/fear` | ||||||
|  | - Converts `fooooo/bear` into `BOOOOOH/fear` | ||||||
|  | - Doesn't match `foo/bar/baz` | ||||||
|  |  | ||||||
|  | ### Example: Tutorials | ||||||
|  |  | ||||||
|  | You have an ILIAS course with lots of tutorials, but are only interested in a | ||||||
|  | single one. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | tutorials/ | ||||||
|  |   |- tut_01/ | ||||||
|  |   |- tut_02/ | ||||||
|  |   |- tut_03/ | ||||||
|  |   ... | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | You can use a mix of normal and exact arrows to get rid of the other ones and | ||||||
|  | move the `tutorials/tut_02/` folder to `my_tut/`: | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | tutorials/tut_02 --> my_tut | ||||||
|  | tutorials -exact-> | ||||||
|  | tutorials --> ! | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | The second rule is required for many crawlers since they use the rules to decide | ||||||
|  | which directories to crawl. If it was missing when the crawler looks at | ||||||
|  | `tutorials/`, the third rule would match. This means the crawler would not crawl | ||||||
|  | the `tutorials/` directory and thus not discover that `tutorials/tut_02/` exists. | ||||||
|  |  | ||||||
|  | Since the second rule is only relevant for crawling, the `TARGET` is left out. | ||||||
|  |  | ||||||
|  | ### Example: Lecture slides | ||||||
|  |  | ||||||
|  | You have a course with slides like `Lecture 3: Linear functions.PDF` and you | ||||||
|  | would like to rename them to `03_linear_functions.pdf`. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | Lectures/ | ||||||
|  |   |- Lecture 1: Introduction.PDF | ||||||
|  |   |- Lecture 2: Vectors and matrices.PDF | ||||||
|  |   |- Lecture 3: Linear functions.PDF | ||||||
|  |   ... | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | To do this, you can use the most powerful of arrows: The regex arrow. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | "Lectures/Lecture (\\d+): (.*)\\.PDF" -re-> "Lectures/{i1:02}_{g2.lower().replace(' ', '_')}.pdf" | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | Note the escaped backslashes on the `SOURCE` side. | ||||||
|  |  | ||||||
|  | ### Example: Crawl a Python project | ||||||
|  |  | ||||||
|  | You are crawling a Python project and want to ignore all hidden files (files | ||||||
|  | whose name starts with a `.`), all `__pycache__` directories and all markdown | ||||||
|  | files (for some weird reason). | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | .gitignore | ||||||
|  | .mypy_cache/ | ||||||
|  | .venv/ | ||||||
|  | CONFIG.md | ||||||
|  | PFERD/ | ||||||
|  |   |- __init__.py | ||||||
|  |   |- __main__.py | ||||||
|  |   |- __pycache__/ | ||||||
|  |   |- authenticator.py | ||||||
|  |   |- config.py | ||||||
|  |   ... | ||||||
|  | README.md | ||||||
|  | ... | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | For this task, the name arrows can be used. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | \..*        -name-re-> ! | ||||||
|  | __pycache__ -name->    ! | ||||||
|  | .*\.md      -name-re-> ! | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | ### Example: Clean up names | ||||||
|  |  | ||||||
|  | You want to convert all paths into lowercase and replace spaces with underscores | ||||||
|  | before applying any rules. This can be achieved using the `>>` arrow heads. | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | (.*) -re->> "{g1.lower().replace(' ', '_')}" | ||||||
|  |  | ||||||
|  | <other rules go here> | ||||||
|  | ``` | ||||||
							
								
								
									
										89
									
								
								DEV.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										89
									
								
								DEV.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,89 @@ | |||||||
|  | # PFERD Development Guide | ||||||
|  |  | ||||||
|  | PFERD is packaged following the [Python Packaging User Guide][ppug] (in | ||||||
|  | particular [this][ppug-1] and [this][ppug-2] guide). | ||||||
|  |  | ||||||
|  | [ppug]: <https://packaging.python.org/> "Python Packaging User Guide" | ||||||
|  | [ppug-1]: <https://packaging.python.org/tutorials/packaging-projects/> "Packaging Python Projects" | ||||||
|  | [ppug-2]: <https://packaging.python.org/guides/distributing-packages-using-setuptools/> "Packaging and distributing projects" | ||||||
|  |  | ||||||
|  | ## Setting up a dev environment | ||||||
|  |  | ||||||
|  | The use of [venv][venv] is recommended. To initially set up a development | ||||||
|  | environment, run these commands in the same directory as this file: | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | $ python -m venv .venv | ||||||
|  | $ . .venv/bin/activate | ||||||
|  | $ ./scripts/setup | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | The setup script installs a few required dependencies and tools. It also | ||||||
|  | installs PFERD via `pip install --editable .`, which means that you can just run | ||||||
|  | `pferd` as if it was installed normally. Since PFERD was installed with | ||||||
|  | `--editable`, there is no need to re-run `pip install` when the source code is | ||||||
|  | changed. | ||||||
|  |  | ||||||
|  | If you get any errors because pip can't update itself, try running | ||||||
|  | `./scripts/setup --no-pip` instead of `./scripts/setup`. | ||||||
|  |  | ||||||
|  | For more details, see [this part of the Python Tutorial][venv-tut] and | ||||||
|  | [this section on "development mode"][ppug-dev]. | ||||||
|  |  | ||||||
|  | [venv]: <https://docs.python.org/3/library/venv.html> "venv - Creation of virtual environments" | ||||||
|  | [venv-tut]: <https://docs.python.org/3/tutorial/venv.html> "12. Virtual Environments and Packages" | ||||||
|  | [ppug-dev]: <https://packaging.python.org/guides/distributing-packages-using-setuptools/#working-in-development-mode> "Working in “development mode”" | ||||||
|  |  | ||||||
|  | ## Checking and formatting the code | ||||||
|  |  | ||||||
|  | To run a set of checks against the code, run `./scripts/check` in the repo's | ||||||
|  | root directory. This script will run a few tools installed by `./scripts/setup` | ||||||
|  | against the entire project. | ||||||
|  |  | ||||||
|  | To format the code, run `./scripts/format` in the repo's root directory. | ||||||
|  |  | ||||||
|  | Before committing changes, please make sure the checks return no warnings and | ||||||
|  | the code is formatted. | ||||||
|  |  | ||||||
|  | ## Contributing | ||||||
|  |  | ||||||
|  | When submitting a PR that adds, changes or removes a feature, please ensure | ||||||
|  | that the corresponding documentation is updated as well. Also, please ensure | ||||||
|  | that `./scripts/check` returns no warnings and the code has been run through | ||||||
|  | `./scripts/format`. | ||||||
|  |  | ||||||
|  | In your first PR, please add your name to the `LICENSE` file. | ||||||
|  |  | ||||||
|  | ## Releasing a new version | ||||||
|  |  | ||||||
|  | This section describes the steps required to release a new version of PFERD. | ||||||
|  | Usually, they don't need to be performed manually and `scripts/bump-version` can be | ||||||
|  | used instead. | ||||||
|  |  | ||||||
|  | 1. Update the version number in `PFERD/version.py` | ||||||
|  | 2. Update `CHANGELOG.md` | ||||||
|  | 3. Commit changes to `master` with message `Bump version to <version>` (e. g. `Bump version to 3.2.5`) | ||||||
|  | 4. Create annotated tag named `v<version>` (e. g. `v3.2.5`) | ||||||
|  |     - Copy changes from changelog | ||||||
|  |     - Remove `#` symbols (which git would interpret as comments) | ||||||
|  |     - As the first line, add `Version <version> - <date>` (e. g. `Version 3.2.5 - 2021-05-24`) | ||||||
|  |     - Leave the second line empty | ||||||
|  | 5. Fast-forward `latest` to `master` | ||||||
|  | 6. Push `master`, `latest` and the new tag | ||||||
|  |  | ||||||
|  | Example tag annotation: | ||||||
|  | ``` | ||||||
|  | Version 3.2.5 - 2021-05-24 | ||||||
|  |  | ||||||
|  | Added | ||||||
|  | - Support for concurrent downloads | ||||||
|  | - Support for proper config files | ||||||
|  | - This changelog | ||||||
|  |  | ||||||
|  | Changed | ||||||
|  | - Rewrote almost everything | ||||||
|  | - Redesigned CLI | ||||||
|  |  | ||||||
|  | Removed | ||||||
|  | - Backwards compatibility with 2.x | ||||||
|  | ``` | ||||||
							
								
								
									
										20
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								LICENSE
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,20 @@ | |||||||
|  | Copyright 2019-2024 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, | ||||||
|  |                     TheChristophe, Scriptim, thelukasprobst, Toorero, | ||||||
|  |                     Mr-Pine, p-fruck, PinieP | ||||||
|  |  | ||||||
|  | Permission is hereby granted, free of charge, to any person obtaining a copy of | ||||||
|  | this software and associated documentation files (the "Software"), to deal in | ||||||
|  | the Software without restriction, including without limitation the rights to | ||||||
|  | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of | ||||||
|  | the Software, and to permit persons to whom the Software is furnished to do so, | ||||||
|  | subject to the following conditions: | ||||||
|  |  | ||||||
|  | The above copyright notice and this permission notice shall be included in all | ||||||
|  | copies or substantial portions of the Software. | ||||||
|  |  | ||||||
|  | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||||
|  | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | ||||||
|  | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR | ||||||
|  | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER | ||||||
|  | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||||||
|  | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||||||
| @@ -1,37 +0,0 @@ | |||||||
| import logging |  | ||||||
|  |  | ||||||
| from .ffm import * |  | ||||||
| from .ilias import * |  | ||||||
| from .norbert import * |  | ||||||
| from .ti import * |  | ||||||
| from .utils import * |  | ||||||
|  |  | ||||||
| __all__ = ["STYLE", "FORMAT", "DATE_FORMAT", "FORMATTER", "enable_logging"] |  | ||||||
|  |  | ||||||
| __all__ += ffm.__all__ |  | ||||||
| __all__ += ilias.__all__ |  | ||||||
| __all__ += norbert.__all__ |  | ||||||
| __all__ += ti.__all__ |  | ||||||
| __all__ += utils.__all__ |  | ||||||
|  |  | ||||||
# "{" selects str.format-style placeholders for FORMAT.
STYLE = "{"
FORMAT = "[{levelname:<7}] {message}"
DATE_FORMAT = "%F %T"

FORMATTER = logging.Formatter(
    fmt=FORMAT,
    datefmt=DATE_FORMAT,
    style=STYLE,
)


def enable_logging(name="PFERD", level=logging.INFO):
    """
    Set up console logging for the logger called ``name``.

    The logger's level is set to ``level``, a stream handler using FORMATTER
    is attached and propagation to ancestor loggers is switched off.

    Calling this function more than once for the same logger is safe: the
    handler is only attached if no handler using FORMATTER is present yet, so
    messages are not emitted multiple times.
    """

    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Avoid stacking a second handler on repeated calls, which would print
    # every message once per call.
    already_installed = any(
        existing.formatter is FORMATTER for existing in logger.handlers
    )
    if not already_installed:
        handler = logging.StreamHandler()
        handler.setFormatter(FORMATTER)
        logger.addHandler(handler)

    # This should be logged by our own handler, and not the root logger's
    # default handler, so we don't pass it on to the root logger.
    logger.propagate = False
|   | |||||||
							
								
								
									
										169
									
								
								PFERD/__main__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										169
									
								
								PFERD/__main__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,169 @@ | |||||||
|  | import argparse | ||||||
|  | import asyncio | ||||||
|  | import configparser | ||||||
|  | import os | ||||||
|  | import sys | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | from .auth import AuthLoadError | ||||||
|  | from .cli import PARSER, ParserLoadError, load_default_section | ||||||
|  | from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError | ||||||
|  | from .logging import log | ||||||
|  | from .pferd import Pferd, PferdLoadError | ||||||
|  | from .transformer import RuleParseError | ||||||
|  |  | ||||||
|  |  | ||||||
def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser:
    """
    Create the ConfigParser that the rest of the program is configured from.

    Without a CLI command the parser is filled from the config file at
    ``args.config``; with a CLI command the command itself fills the parser
    from its arguments. In both cases load_default_section() is applied
    afterwards.
    """

    log.explain_topic("Loading config")
    parser = configparser.ConfigParser(interpolation=None)

    command = args.command
    if command is not None:
        log.explain("CLI command specified, loading config from its arguments")
        if command:
            command(args, parser)
    else:
        log.explain("No CLI command specified, loading config from file")
        Config.load_parser(parser, path=args.config)

    load_default_section(args, parser)
    return parser
|  |  | ||||||
|  |  | ||||||
def load_config(args: argparse.Namespace) -> Config:
    """
    Load the Config for this run.

    If loading fails with a ConfigLoadError or ParserLoadError, the error is
    logged and the process exits with status 1.
    """

    try:
        parser = load_config_parser(args)
        return Config(parser)
    except ConfigLoadError as err:
        # The reason is logged separately as a continuation of the error.
        log.error(str(err))
        log.error_contd(err.reason)
    except ParserLoadError as err:
        log.error(str(err))

    # Only reached after one of the handlers above has logged its error.
    sys.exit(1)
|  |  | ||||||
|  |  | ||||||
def configure_logging_from_args(args: argparse.Namespace) -> None:
    """
    Apply the logging related CLI arguments to the global log object.

    Arguments that are None are skipped and leave the corresponding log
    setting untouched.
    """

    cli_overrides = (
        ("output_explain", args.explain),
        ("output_status", args.status),
        ("output_not_deleted", args.show_not_deleted),
        ("output_report", args.report),
    )
    for attribute, value in cli_overrides:
        if value is not None:
            setattr(log, attribute, value)

    # We want to prevent any unnecessary output if we're printing the config to
    # stdout, otherwise it would not be a valid config file.
    if args.dump_config_to == "-":
        for attribute in ("output_explain", "output_status", "output_report"):
            setattr(log, attribute, False)
|  |  | ||||||
|  |  | ||||||
def configure_logging_from_config(args: argparse.Namespace, config: Config) -> None:
    """
    Apply logging settings from the config's default section.

    Only settings that were not given on the command line (i. e. whose CLI
    argument is None) are taken from the config. A ConfigOptionError is
    logged and leads to an exit with status 1.
    """

    # In configure_logging_from_args(), all normal logging is already disabled
    # whenever we dump the config. We don't want to override that decision with
    # values from the config file.
    if args.dump_config_to == "-":
        return

    try:
        defaults = config.default_section
        if args.explain is None:
            log.output_explain = defaults.explain()
        if args.status is None:
            log.output_status = defaults.status()
        if args.report is None:
            log.output_report = defaults.report()
        if args.show_not_deleted is None:
            log.output_not_deleted = defaults.show_not_deleted()
    except ConfigOptionError as err:
        log.error(str(err))
        sys.exit(1)
|  |  | ||||||
|  |  | ||||||
def dump_config(args: argparse.Namespace, config: Config) -> None:
    """
    Write the config to the destination selected on the command line.

    ``--dump-config`` calls config.dump() without a path, ``--dump-config-to -``
    dumps to stdout and any other ``--dump-config-to`` value is used as the
    target path. Specifying both options, or a ConfigDumpError while dumping,
    is logged and leads to an exit with status 1.
    """

    log.explain_topic("Dumping config")

    target = args.dump_config_to
    if args.dump_config and target is not None:
        log.error("--dump-config and --dump-config-to can't be specified at the same time")
        sys.exit(1)

    try:
        if args.dump_config:
            config.dump()
        elif target == "-":
            config.dump_to_stdout()
        else:
            config.dump(Path(target))
    except ConfigDumpError as err:
        log.error(str(err))
        log.error_contd(err.reason)
        sys.exit(1)
|  |  | ||||||
|  |  | ||||||
def main() -> None:
    """
    Command line entry point: parse arguments, load the config and run PFERD.

    If one of the config dumping options is given, the config is dumped and
    the process exits without crawling. Errors while setting up or running
    are logged and lead to an exit with status 1; a report is printed after
    a normal run, after an interrupt and after an unexpected exception.
    """

    args = PARSER.parse_args()

    # Configuring logging happens in two stages because CLI args have
    # precedence over config file options and loading the config already
    # produces some kinds of log messages (usually only explain()-s).
    configure_logging_from_args(args)

    config = load_config(args)

    # Now, after loading the config file, we can apply its logging settings in
    # all places that were not already covered by CLI args.
    configure_logging_from_config(args, config)

    if args.dump_config or args.dump_config_to is not None:
        dump_config(args, config)
        sys.exit()

    try:
        pferd = Pferd(config, args.crawler, args.skip)
    except PferdLoadError as e:
        log.unlock()
        log.error(str(e))
        sys.exit(1)

    try:
        if os.name == "nt":
            # A "workaround" for the windows event loop somehow crashing after
            # asyncio.run() completes. See:
            # https://bugs.python.org/issue39232
            # https://github.com/encode/httpx/issues/914#issuecomment-780023632
            # TODO Fix this properly
            #
            # The loop is created explicitly because asyncio.get_event_loop()
            # is deprecated when no loop is current and raises a RuntimeError
            # on recent Python versions.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(pferd.run(args.debug_transforms))
            loop.run_until_complete(asyncio.sleep(1))
            loop.close()
        else:
            asyncio.run(pferd.run(args.debug_transforms))
    except (ConfigOptionError, AuthLoadError) as e:
        log.unlock()
        log.error(str(e))
        sys.exit(1)
    except RuleParseError as e:
        log.unlock()
        e.pretty_print()
        sys.exit(1)
    except KeyboardInterrupt:
        log.unlock()
        log.explain_topic("Interrupted, exiting immediately")
        log.explain("Open files and connections are left for the OS to clean up")
        pferd.print_report()
        # TODO Clean up tmp files
        # And when those files *do* actually get cleaned up properly,
        # reconsider if this should really exit with 1
        sys.exit(1)
    except Exception:
        # Top-level boundary: report anything unexpected instead of crashing
        # with a bare traceback, then still show what was done so far.
        log.unlock()
        log.unexpected_exception()
        pferd.print_report()
        sys.exit(1)
    else:
        pferd.print_report()
|  |  | ||||||
|  |  | ||||||
|  | if __name__ == "__main__": | ||||||
|  |     main() | ||||||
							
								
								
									
										29
									
								
								PFERD/auth/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								PFERD/auth/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | |||||||
|  | from configparser import SectionProxy | ||||||
|  | from typing import Callable, Dict | ||||||
|  |  | ||||||
|  | from ..config import Config | ||||||
|  | from .authenticator import Authenticator, AuthError, AuthLoadError, AuthSection  # noqa: F401 | ||||||
|  | from .credential_file import CredentialFileAuthenticator, CredentialFileAuthSection | ||||||
|  | from .keyring import KeyringAuthenticator, KeyringAuthSection | ||||||
|  | from .pass_ import PassAuthenticator, PassAuthSection | ||||||
|  | from .simple import SimpleAuthenticator, SimpleAuthSection | ||||||
|  | from .tfa import TfaAuthenticator | ||||||
|  |  | ||||||
# Common call signature of every factory stored in AUTHENTICATORS.
AuthConstructor = Callable[
    [
        str,  # Name (without the "auth:" prefix)
        SectionProxy,  # Authenticator's section of global config
        Config,  # Global config
    ],
    Authenticator,
]

# Factories for all known authenticators, keyed by a short identifier
# (presumably the value of an auth section's "type" option - verify against
# the caller). Each factory wraps the raw section in the matching section
# class where the authenticator takes one.
AUTHENTICATORS: Dict[str, AuthConstructor] = {
    "credential-file": lambda n, s, c: CredentialFileAuthenticator(
        n, CredentialFileAuthSection(s), c
    ),
    "keyring": lambda n, s, c: KeyringAuthenticator(n, KeyringAuthSection(s)),
    "pass": lambda n, s, c: PassAuthenticator(n, PassAuthSection(s)),
    "simple": lambda n, s, c: SimpleAuthenticator(n, SimpleAuthSection(s)),
    "tfa": lambda n, s, c: TfaAuthenticator(n),
}
							
								
								
									
										80
									
								
								PFERD/auth/authenticator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								PFERD/auth/authenticator.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | |||||||
|  | from abc import ABC, abstractmethod | ||||||
|  | from typing import Tuple | ||||||
|  |  | ||||||
|  | from ..config import Section | ||||||
|  |  | ||||||
|  |  | ||||||
class AuthLoadError(Exception):
    """Raised when an authenticator cannot be set up (e. g. bad credential file)."""
|  |  | ||||||
|  |  | ||||||
class AuthError(Exception):
    """Raised when an authenticator cannot provide (valid) credentials."""
|  |  | ||||||
|  |  | ||||||
class AuthSection(Section):
    """Config section of an authenticator."""

    def type(self) -> str:
        """
        Return the section's "type" option.

        A missing option is reported via missing_value().
        """

        auth_type = self.s.get("type")
        if auth_type is not None:
            return auth_type

        # NOTE(review): missing_value() presumably raises; the return below
        # only mirrors the fall-through of the straightforward formulation.
        self.missing_value("type")
        return auth_type
|  |  | ||||||
|  |  | ||||||
class Authenticator(ABC):
    """
    Base class for everything that can supply a username and a password.

    Subclasses must implement credentials(). The invalidate_* methods raise
    an AuthError by default and may be overridden by authenticators that can
    recover from invalid credentials.
    """

    def __init__(self, name: str) -> None:
        """
        Initialize an authenticator from its name.

        If you are writing your own constructor for your own authenticator,
        make sure to call this constructor first (via super().__init__).

        May throw an AuthLoadError.
        """

        self.name = name

    @abstractmethod
    async def credentials(self) -> Tuple[str, str]:
        """
        Return the credentials as a (username, password) tuple.
        """

    async def username(self) -> str:
        """
        Return only the username part of credentials().
        """

        return (await self.credentials())[0]

    async def password(self) -> str:
        """
        Return only the password part of credentials().
        """

        return (await self.credentials())[1]

    def invalidate_credentials(self) -> None:
        """
        Tell the authenticator that some or all of its credentials are invalid.

        Authenticators should override this function if they have a way to
        deal with this issue that is likely to result in valid credentials
        (e. g. prompting the user).
        """

        raise AuthError("Invalid credentials")

    def invalidate_username(self) -> None:
        """
        Tell the authenticator that specifically its username is invalid.

        Authenticators should override this function if they have a way to
        deal with this issue that is likely to result in valid credentials
        (e. g. prompting the user).
        """

        raise AuthError("Invalid username")

    def invalidate_password(self) -> None:
        """
        Tell the authenticator that specifically its password is invalid.

        Authenticators should override this function if they have a way to
        deal with this issue that is likely to result in valid credentials
        (e. g. prompting the user).
        """

        raise AuthError("Invalid password")
							
								
								
									
										46
									
								
								PFERD/auth/credential_file.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								PFERD/auth/credential_file.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,46 @@ | |||||||
|  | from pathlib import Path | ||||||
|  | from typing import Tuple | ||||||
|  |  | ||||||
|  | from ..config import Config | ||||||
|  | from ..utils import fmt_real_path | ||||||
|  | from .authenticator import Authenticator, AuthLoadError, AuthSection | ||||||
|  |  | ||||||
|  |  | ||||||
class CredentialFileAuthSection(AuthSection):
    """Config section of a credential file authenticator."""

    def path(self) -> Path:
        """
        Return the section's "path" option as a Path.

        A missing option is reported via missing_value().
        """

        configured = self.s.get("path")
        if configured is None:
            self.missing_value("path")
        return Path(configured)
|  |  | ||||||
|  |  | ||||||
class CredentialFileAuthenticator(Authenticator):
    """
    Authenticator that reads a fixed username and password from a file.

    The file must consist of exactly two lines, the first starting with
    "username=" and the second starting with "password=".
    """

    def __init__(self, name: str, section: CredentialFileAuthSection, config: Config) -> None:
        """
        Read the credentials from the file configured in the section.

        The configured path is joined onto the working directory of the
        config's default section. Raises an AuthLoadError if the file can't
        be read, is not valid UTF-8 or does not have the expected format.
        """

        super().__init__(name)

        path = config.default_section.working_dir() / section.path()
        try:
            with open(path, encoding="utf-8") as f:
                lines = list(f)
        except UnicodeDecodeError as e:
            # Chain the cause like the OSError branch does, so the original
            # decoding error is kept as the explicit cause.
            raise AuthLoadError(
                f"Credential file at {fmt_real_path(path)} is not encoded using UTF-8"
            ) from e
        except OSError as e:
            raise AuthLoadError(f"No credential file at {fmt_real_path(path)}") from e

        if len(lines) != 2:
            raise AuthLoadError("Credential file must be two lines long")
        [uline, pline] = lines
        # The first of two lines always ends with a newline; the last line
        # only does if the file has a trailing newline.
        uline = uline[:-1]
        if pline.endswith("\n"):
            pline = pline[:-1]

        if not uline.startswith("username="):
            raise AuthLoadError("First line must start with 'username='")
        if not pline.startswith("password="):
            raise AuthLoadError("Second line must start with 'password='")

        # Everything after the "username=" / "password=" prefix is the value.
        self._username = uline[len("username="):]
        self._password = pline[len("password="):]

    async def credentials(self) -> Tuple[str, str]:
        """
        Return the (username, password) tuple read from the credential file.
        """

        return self._username, self._password
							
								
								
									
										65
									
								
								PFERD/auth/keyring.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										65
									
								
								PFERD/auth/keyring.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,65 @@ | |||||||
|  | from typing import Optional, Tuple, cast | ||||||
|  |  | ||||||
|  | import keyring | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from ..utils import agetpass, ainput | ||||||
|  | from ..version import NAME | ||||||
|  | from .authenticator import Authenticator, AuthError, AuthSection | ||||||
|  |  | ||||||
|  |  | ||||||
class KeyringAuthSection(AuthSection):
    """Config section of a keyring authenticator."""

    def username(self) -> Optional[str]:
        """
        Return the section's "username" option, or None if it is not set.
        """

        configured = self.s.get("username")
        return configured

    def keyring_name(self) -> str:
        """
        Return the section's "keyring_name" option, falling back to NAME.
        """

        chosen = self.s.get("keyring_name", fallback=NAME)
        return cast(str, chosen)
|  |  | ||||||
|  |  | ||||||
class KeyringAuthenticator(Authenticator):
    """
    Authenticator that stores and looks up the password via the keyring module.

    The username is taken from the config section if present and requested
    from the user otherwise. A password that is not found in the keyring (or
    was invalidated) is requested from the user and then saved to the keyring.
    """

    def __init__(self, name: str, section: KeyringAuthSection) -> None:
        """
        Initialize the authenticator from its name and config section.
        """

        super().__init__(name)

        configured_username = section.username()
        self._username = configured_username
        # A username coming from the config can't be replaced by prompting.
        self._username_fixed = configured_username is not None
        self._keyring_name = section.keyring_name()

        self._password: Optional[str] = None
        self._password_invalidated = False

    async def credentials(self) -> Tuple[str, str]:
        """
        Return the (username, password) tuple, prompting the user if necessary.
        """

        # Ask for the username if neither the config nor an earlier prompt
        # provided one.
        if self._username is None:
            async with log.exclusive_output():
                self._username = await ainput("Username: ")

        # Consult the keyring first, unless the password was invalidated - in
        # that case the user should be prompted again instead.
        may_use_keyring = not self._password_invalidated
        if self._password is None and may_use_keyring:
            self._password = keyring.get_password(self._keyring_name, self._username)

        # Still no password: it is not stored in the keyring (or must not be
        # taken from there), so read it from the user and store it.
        if self._password is None:
            async with log.exclusive_output():
                self._password = await agetpass("Password: ")
                keyring.set_password(self._keyring_name, self._username, self._password)

        self._password_invalidated = False
        return self._username, self._password

    def invalidate_credentials(self) -> None:
        """
        Forget the password and, unless it comes from the config, the username.
        """

        if not self._username_fixed:
            self.invalidate_username()
        self.invalidate_password()

    def invalidate_username(self) -> None:
        """
        Forget the username so that it is requested again.

        Raises an AuthError if the username was fixed by the config.
        """

        if self._username_fixed:
            raise AuthError("Configured username is invalid")
        self._username = None

    def invalidate_password(self) -> None:
        """
        Forget the password and skip the keyring lookup on the next request.
        """

        self._password = None
        self._password_invalidated = True
							
								
								
									
										98
									
								
								PFERD/auth/pass_.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								PFERD/auth/pass_.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,98 @@ | |||||||
|  | import re | ||||||
|  | import subprocess | ||||||
|  | from typing import List, Tuple | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from .authenticator import Authenticator, AuthError, AuthSection | ||||||
|  |  | ||||||
|  |  | ||||||
class PassAuthSection(AuthSection):
    """Config section of a pass authenticator."""

    def passname(self) -> str:
        """
        Return the section's "passname" option.

        A missing option is reported via missing_value().
        """

        if (value := self.s.get("passname")) is None:
            self.missing_value("passname")
        return value

    def username_prefixes(self) -> List[str]:
        """
        Return the lowercased prefixes that mark a username line.

        Taken from the comma-separated "username_prefixes" option, defaulting
        to "login,username,user".
        """

        return self._prefix_list("username_prefixes", "login,username,user")

    def password_prefixes(self) -> List[str]:
        """
        Return the lowercased prefixes that mark a password line.

        Taken from the comma-separated "password_prefixes" option, defaulting
        to "password,pass,secret".
        """

        return self._prefix_list("password_prefixes", "password,pass,secret")

    def _prefix_list(self, option: str, default: str) -> List[str]:
        """
        Split a comma-separated option into normalized prefixes.
        """

        raw = self.s.get(option, default)
        # Surrounding whitespace is stripped so that values written like
        # "login, user" work: the prefixes are compared against line prefixes
        # consisting of letters only, which " user" could never equal.
        return [prefix.strip().lower() for prefix in raw.split(",")]
|  |  | ||||||
|  |  | ||||||
class PassAuthenticator(Authenticator):
    """Authenticator that reads a username and password from a ``pass`` entry.

    The output of ``pass show <passname>`` is split into lines. Lines of the
    form ``prefix: value`` are looked up through the configured username and
    password prefixes; all other lines serve as positional fallbacks.
    """

    PREFIXED_LINE_RE = r"([a-zA-Z]+):\s?(.*)"  # to be used with fullmatch

    def __init__(self, name: str, section: PassAuthSection) -> None:
        super().__init__(name)

        self._passname = section.passname()
        self._username_prefixes = section.username_prefixes()
        self._password_prefixes = section.password_prefixes()

    async def credentials(self) -> Tuple[str, str]:
        """Run ``pass show`` and extract ``(username, password)`` from its output.

        Raises:
            AuthError: if ``pass`` cannot be started or exits with an error, if
                a prefix occurs more than once, or if the username or password
                cannot be determined from the output.
        """
        log.explain_topic("Obtaining credentials from pass")

        try:
            log.explain(f"Calling 'pass show {self._passname}'")
            result = subprocess.check_output(["pass", "show", self._passname], text=True)
        except subprocess.CalledProcessError as e:
            raise AuthError(f"Failed to get password info from {self._passname}: {e}") from e
        except OSError as e:
            # Raised when the executable cannot be started at all, e.g. a
            # FileNotFoundError because pass is not installed.
            raise AuthError(f"Failed to run 'pass show {self._passname}': {e}") from e

        # NOTE(review): the explain messages below contain the secret values
        # verbatim; confirm that this is acceptable for explain output.
        prefixed = {}
        unprefixed = []
        for line in result.strip().splitlines():
            if match := re.fullmatch(self.PREFIXED_LINE_RE, line):
                # Prefixes are compared case-insensitively.
                prefix = match.group(1).lower()
                value = match.group(2)
                log.explain(f"Found prefixed line {line!r} with prefix {prefix!r}, value {value!r}")
                if prefix in prefixed:
                    raise AuthError(f"Prefix {prefix} specified multiple times")
                prefixed[prefix] = value
            else:
                log.explain(f"Found unprefixed line {line!r}")
                unprefixed.append(line)

        # The first configured prefix that is present wins.
        username = None
        for prefix in self._username_prefixes:
            log.explain(f"Looking for username at prefix {prefix!r}")
            if prefix in prefixed:
                username = prefixed[prefix]
                log.explain(f"Found username {username!r}")
                break

        password = None
        for prefix in self._password_prefixes:
            log.explain(f"Looking for password at prefix {prefix!r}")
            if prefix in prefixed:
                password = prefixed[prefix]
                log.explain(f"Found password {password!r}")
                break

        if password is None and username is None:
            log.explain("No username and password found so far")
            log.explain("Using first unprefixed line as password")
            log.explain("Using second unprefixed line as username")
        elif password is None:
            log.explain("No password found so far")
            log.explain("Using first unprefixed line as password")
        elif username is None:
            log.explain("No username found so far")
            log.explain("Using first unprefixed line as username")

        # Fallbacks consume unprefixed lines in order: password first, then
        # username.
        if password is None:
            if not unprefixed:
                log.explain("Not enough unprefixed lines left")
                raise AuthError("Password could not be determined")
            password = unprefixed.pop(0)
            log.explain(f"Found password {password!r}")

        if username is None:
            if not unprefixed:
                log.explain("Not enough unprefixed lines left")
                raise AuthError("Username could not be determined")
            username = unprefixed.pop(0)
            log.explain(f"Found username {username!r}")

        return username, password
							
								
								
									
										62
									
								
								PFERD/auth/simple.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								PFERD/auth/simple.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,62 @@ | |||||||
|  | from typing import Optional, Tuple | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from ..utils import agetpass, ainput | ||||||
|  | from .authenticator import Authenticator, AuthError, AuthSection | ||||||
|  |  | ||||||
|  |  | ||||||
class SimpleAuthSection(AuthSection):
    """Accessors for the optional credentials of a ``simple`` auth section."""

    def username(self) -> Optional[str]:
        """Look up the ``username`` option (no fallback value is supplied)."""
        configured_username = self.s.get("username")
        return configured_username

    def password(self) -> Optional[str]:
        """Look up the ``password`` option (no fallback value is supplied)."""
        configured_password = self.s.get("password")
        return configured_password
|  |  | ||||||
|  |  | ||||||
class SimpleAuthenticator(Authenticator):
    """Authenticator that uses configured credentials and prompts for missing ones.

    A username or password that comes from the config section is "fixed": it
    is never prompted for and cannot be invalidated. Values that are not
    configured are asked for interactively and kept until they are invalidated.
    """

    def __init__(self, name: str, section: SimpleAuthSection) -> None:
        super().__init__(name)

        self._username = section.username()
        self._password = section.password()

        # Test the values just read from the config. The previous check used
        # ``self.username`` / ``self.password``, names this class never
        # assigns, so it did not look at the configured values at all.
        self._username_fixed = self._username is not None
        self._password_fixed = self._password is not None

    async def credentials(self) -> Tuple[str, str]:
        """Return ``(username, password)``, prompting for whichever part is missing."""
        if self._username is not None and self._password is not None:
            return self._username, self._password

        async with log.exclusive_output():
            if self._username is None:
                self._username = await ainput("Username: ")
            else:
                # Show the known username so the password prompt has context.
                print(f"Username: {self._username}")

            if self._password is None:
                self._password = await agetpass("Password: ")

            # Intentionally returned inside the context manager so we know
            # they're both not None
            return self._username, self._password

    def invalidate_credentials(self) -> None:
        """Forget every non-configured credential.

        Raises:
            AuthError: if both username and password come from the config, in
                which case there is nothing that could be asked for again.
        """
        if self._username_fixed and self._password_fixed:
            raise AuthError("Configured credentials are invalid")

        if not self._username_fixed:
            self._username = None
        if not self._password_fixed:
            self._password = None

    def invalidate_username(self) -> None:
        """Forget the username, or raise ``AuthError`` if it was configured."""
        if self._username_fixed:
            raise AuthError("Configured username is invalid")
        else:
            self._username = None

    def invalidate_password(self) -> None:
        """Forget the password, or raise ``AuthError`` if it was configured."""
        if self._password_fixed:
            raise AuthError("Configured password is invalid")
        else:
            self._password = None
							
								
								
									
										30
									
								
								PFERD/auth/tfa.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								PFERD/auth/tfa.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | |||||||
|  | from typing import Tuple | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from ..utils import ainput | ||||||
|  | from .authenticator import Authenticator, AuthError | ||||||
|  |  | ||||||
|  |  | ||||||
class TfaAuthenticator(Authenticator):
    """Authenticator that asks for a TFA code interactively and has no username."""

    # Message shared by every username-related operation, none of which is supported.
    _NO_USERNAME = "TFA authenticator does not support usernames"

    def __init__(self, name: str) -> None:
        super().__init__(name)

    async def username(self) -> str:
        """Always raise ``AuthError``: there is no username."""
        raise AuthError(self._NO_USERNAME)

    async def password(self) -> str:
        """Prompt for a TFA code while holding exclusive output and return it."""
        async with log.exclusive_output():
            return await ainput("TFA code: ")

    async def credentials(self) -> Tuple[str, str]:
        """Always raise ``AuthError``: a credential pair would need a username."""
        raise AuthError(self._NO_USERNAME)

    def invalidate_username(self) -> None:
        """Always raise ``AuthError``: there is no username to invalidate."""
        raise AuthError(self._NO_USERNAME)

    def invalidate_password(self) -> None:
        """Do nothing; no code is stored on the instance."""

    def invalidate_credentials(self) -> None:
        """Do nothing; no code is stored on the instance."""
							
								
								
									
										14
									
								
								PFERD/cli/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								PFERD/cli/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | |||||||
|  | # isort: skip_file | ||||||
|  |  | ||||||
|  | # The order of imports matters because each command module registers itself | ||||||
|  | # with the parser from ".parser" and the import order affects the order in | ||||||
|  | # which they appear in the help. Because of this, isort is disabled for this | ||||||
|  | # file. Also, since we're reexporting or just using the side effect of | ||||||
|  | # importing itself, we get a few linting warnings, which we're disabling as | ||||||
|  | # well. | ||||||
|  |  | ||||||
|  | from . import command_local  # noqa: F401 imported but unused | ||||||
|  | from . import command_ilias_web  # noqa: F401 imported but unused | ||||||
|  | from . import command_kit_ilias_web  # noqa: F401 imported but unused | ||||||
|  | from . import command_kit_ipd  # noqa: F401 imported but unused | ||||||
|  | from .parser import PARSER, ParserLoadError, load_default_section  # noqa: F401 imported but unused | ||||||
							
								
								
									
										56
									
								
								PFERD/cli/command_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								PFERD/cli/command_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,56 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from .common_ilias_args import configure_common_group_args, load_common | ||||||
|  | from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||||
|  |  | ||||||
# Name of the subcommand; also written to the config as the crawler type.
COMMAND_NAME = "ilias-web"

SUBPARSER = SUBPARSERS.add_parser(COMMAND_NAME, parents=[CRAWLER_PARSER])

GROUP = SUBPARSER.add_argument_group(
    description=f"arguments for the '{COMMAND_NAME}' crawler",
    title=f"{COMMAND_NAME} crawler arguments",
)

# Options that only the generic ILIAS command has.
GROUP.add_argument(
    "--base-url", type=str, metavar="BASE_URL",
    help="The base url of the ilias instance",
)
GROUP.add_argument(
    "--client-id", type=str, metavar="CLIENT_ID",
    help="The client id of the ilias instance",
)

# Arguments shared with the other ILIAS web command.
configure_common_group_args(GROUP)
|  |  | ||||||
|  |  | ||||||
def load(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """Create the ``crawl:ilias`` config section from the parsed CLI arguments.

    Args:
        args: namespace produced by the subparser of this command.
        parser: config parser that receives the new section; the auth section
            is added to it by ``load_common``.
    """
    log.explain(f"Creating config for command '{COMMAND_NAME}'")

    parser["crawl:ilias"] = {}
    section = parser["crawl:ilias"]
    load_crawler(args, section)

    section["type"] = COMMAND_NAME
    # argparse stores the value of "--base-url" under "base_url"; there is no
    # "ilias_url" attribute on the namespace.
    if args.base_url is not None:
        section["base_url"] = args.base_url
    if args.client_id is not None:
        section["client_id"] = args.client_id

    load_common(section, args, parser)


# Run load() when this subcommand is selected.
SUBPARSER.set_defaults(command=load)
							
								
								
									
										37
									
								
								PFERD/cli/command_kit_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								PFERD/cli/command_kit_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,37 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from .common_ilias_args import configure_common_group_args, load_common | ||||||
|  | from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||||
|  |  | ||||||
# Name of the subcommand; also written to the config as the crawler type.
COMMAND_NAME = "kit-ilias-web"

SUBPARSER = SUBPARSERS.add_parser(COMMAND_NAME, parents=[CRAWLER_PARSER])

GROUP = SUBPARSER.add_argument_group(
    description=f"arguments for the '{COMMAND_NAME}' crawler",
    title=f"{COMMAND_NAME} crawler arguments",
)

# This command only takes the arguments shared with the other ILIAS web command.
configure_common_group_args(GROUP)
|  |  | ||||||
|  |  | ||||||
def load(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """Create the ``crawl:ilias`` config section from the parsed CLI arguments."""
    log.explain(f"Creating config for command '{COMMAND_NAME}'")

    section_name = "crawl:ilias"
    parser[section_name] = {}
    section = parser[section_name]

    # General crawler options first, then the type, then the shared ILIAS options.
    load_crawler(args, section)
    section["type"] = COMMAND_NAME
    load_common(section, args, parser)


# Run load() when this subcommand is selected.
SUBPARSER.set_defaults(command=load)
							
								
								
									
										54
									
								
								PFERD/cli/command_kit_ipd.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								PFERD/cli/command_kit_ipd.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,54 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||||
|  |  | ||||||
SUBPARSER = SUBPARSERS.add_parser("kit-ipd", parents=[CRAWLER_PARSER])

GROUP = SUBPARSER.add_argument_group(
    description="arguments for the 'kit-ipd' crawler",
    title="kit ipd crawler arguments",
)

GROUP.add_argument(
    "--link-regex", type=str, metavar="REGEX",
    help="href-matching regex to identify downloadable files",
)

# Positional arguments, in the order they are expected on the command line.
GROUP.add_argument("target", type=str, metavar="TARGET", help="url to crawl")
GROUP.add_argument("output", type=Path, metavar="OUTPUT", help="output directory")
|  |  | ||||||
|  |  | ||||||
def load(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """Create the ``crawl:kit-ipd`` config section from the parsed CLI arguments."""
    log.explain("Creating config for command 'kit-ipd'")

    section_name = "crawl:kit-ipd"
    parser[section_name] = {}
    section = parser[section_name]
    load_crawler(args, section)

    section["type"] = "kit-ipd"
    section["target"] = str(args.target)
    section["output_dir"] = str(args.output)

    # Only a non-empty regex is written to the config.
    link_regex = args.link_regex
    if link_regex:
        section["link_regex"] = str(link_regex)


# Run load() when this subcommand is selected.
SUBPARSER.set_defaults(command=load)
							
								
								
									
										70
									
								
								PFERD/cli/command_local.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										70
									
								
								PFERD/cli/command_local.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,70 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | from ..logging import log | ||||||
|  | from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||||
|  |  | ||||||
SUBPARSER = SUBPARSERS.add_parser("local", parents=[CRAWLER_PARSER])

GROUP = SUBPARSER.add_argument_group(
    description="arguments for the 'local' crawler",
    title="local crawler arguments",
)

# Positional arguments, in the order they are expected on the command line.
GROUP.add_argument("target", type=Path, metavar="TARGET", help="directory to crawl")
GROUP.add_argument("output", type=Path, metavar="OUTPUT", help="output directory")

# Optional settings that simulate delays and limited download speed.
GROUP.add_argument(
    "--crawl-delay", type=float, metavar="SECONDS",
    help="artificial delay to simulate for crawl requests",
)
GROUP.add_argument(
    "--download-delay", type=float, metavar="SECONDS",
    help="artificial delay to simulate for download requests",
)
GROUP.add_argument(
    "--download-speed", type=int, metavar="BYTES_PER_SECOND",
    help="download speed to simulate",
)
|  |  | ||||||
|  |  | ||||||
def load(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """Create the ``crawl:local`` config section from the parsed CLI arguments."""
    log.explain("Creating config for command 'local'")

    section_name = "crawl:local"
    parser[section_name] = {}
    section = parser[section_name]
    load_crawler(args, section)

    section["type"] = "local"
    section["target"] = str(args.target)
    section["output_dir"] = str(args.output)

    # Optional settings are only written when given on the command line.
    for option in ("crawl_delay", "download_delay", "download_speed"):
        value = getattr(args, option)
        if value is not None:
            section[option] = str(value)


# Run load() when this subcommand is selected.
SUBPARSER.set_defaults(command=load)
							
								
								
									
										104
									
								
								PFERD/cli/common_ilias_args.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										104
									
								
								PFERD/cli/common_ilias_args.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,104 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | from ..crawl.ilias.file_templates import Links | ||||||
|  | from .parser import BooleanOptionalAction, ParserLoadError, show_value_error | ||||||
|  |  | ||||||
|  |  | ||||||
def configure_common_group_args(group: argparse._ArgumentGroup) -> None:
    """Register the arguments shared by the KIT and generic ILIAS web commands.

    Adds the two positionals (``target``, then ``output``) followed by the
    authentication, link, video, forum and timeout options to *group*.
    """
    group.add_argument(
        "target",
        type=str,
        metavar="TARGET",
        help="course id, 'desktop', or ILIAS URL to crawl"
    )
    group.add_argument(
        "output",
        type=Path,
        metavar="OUTPUT",
        help="output directory"
    )
    group.add_argument(
        "--username", "-u",
        type=str,
        metavar="USERNAME",
        help="user name for authentication"
    )
    group.add_argument(
        "--keyring",
        action=BooleanOptionalAction,
        help="use the system keyring to store and retrieve passwords"
    )
    group.add_argument(
        "--credential-file",
        type=Path,
        metavar="PATH",
        help="read username and password from a credential file"
    )
    group.add_argument(
        "--links",
        # Wrapped so that a ValueError from the conversion is reported by argparse.
        type=show_value_error(Links.from_string),
        metavar="OPTION",
        help="how to represent external links"
    )
    group.add_argument(
        "--link-redirect-delay",
        type=int,
        metavar="SECONDS",
        help="time before 'fancy' links redirect to their target (-1 to disable)"
    )
    group.add_argument(
        "--videos",
        action=BooleanOptionalAction,
        help="crawl and download videos"
    )
    group.add_argument(
        "--forums",
        action=BooleanOptionalAction,
        help="crawl and download forum posts"
    )
    group.add_argument(
        "--http-timeout", "-t",
        type=float,
        metavar="SECONDS",
        help="timeout for all HTTP requests"
    )
|  |  | ||||||
|  |  | ||||||
def load_common(
    section: configparser.SectionProxy,
    args: argparse.Namespace,
    parser: configparser.ConfigParser,
) -> None:
    """Fill in the config shared by the generic and KIT ILIAS web commands.

    Writes the crawler options into *section* and creates the ``auth:ilias``
    section in *parser*. Raises ``ParserLoadError`` when ``--credential-file``
    is combined with ``--username`` or ``--keyring``.
    """
    auth_name = "auth:ilias"

    section["target"] = str(args.target)
    section["output_dir"] = str(args.output)
    section["auth"] = auth_name

    # Optional crawler settings are only written when given on the command line.
    if args.links is not None:
        section["links"] = str(args.links.value)
    if args.link_redirect_delay is not None:
        section["link_redirect_delay"] = str(args.link_redirect_delay)
    for flag in ("videos", "forums"):
        enabled = getattr(args, flag)
        if enabled is not None:
            section[flag] = "yes" if enabled else "no"
    if args.http_timeout is not None:
        section["http_timeout"] = str(args.http_timeout)

    parser[auth_name] = {}
    auth_section = parser[auth_name]

    if args.credential_file is None:
        auth_section["type"] = "keyring" if args.keyring else "simple"
    else:
        # A credential file provides both values, so it excludes the other sources.
        if args.username is not None:
            raise ParserLoadError("--credential-file and --username can't be used together")
        if args.keyring:
            raise ParserLoadError("--credential-file and --keyring can't be used together")
        auth_section["type"] = "credential-file"
        auth_section["path"] = str(args.credential_file)

    if args.username is not None:
        auth_section["username"] = args.username
							
								
								
									
										245
									
								
								PFERD/cli/parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										245
									
								
								PFERD/cli/parser.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,245 @@ | |||||||
|  | import argparse | ||||||
|  | import configparser | ||||||
|  | from argparse import ArgumentTypeError | ||||||
|  | from pathlib import Path | ||||||
|  | from typing import Any, Callable, List, Optional, Sequence, Union | ||||||
|  |  | ||||||
|  | from ..output_dir import OnConflict, Redownload | ||||||
|  | from ..version import NAME, VERSION | ||||||
|  |  | ||||||
|  |  | ||||||
class ParserLoadError(Exception):
    """Raised when command line arguments cannot be turned into a config."""
|  |  | ||||||
|  |  | ||||||
|  | # TODO Replace with argparse version when updating to 3.9? | ||||||
class BooleanOptionalAction(argparse.Action):
    """Action that turns a single ``--flag`` into a ``--flag`` / ``--no-flag`` pair.

    ``--flag`` stores ``True`` and ``--no-flag`` stores ``False``; when neither
    is given the destination keeps its default.
    """

    def __init__(
            self,
            option_strings: List[str],
            dest: Any,
            default: Any = None,
            type: Any = None,
            choices: Any = None,
            required: Any = False,
            help: Any = None,
            metavar: Any = None,
    ):
        # Exactly one positive long option is accepted; the negated form is derived from it.
        if len(option_strings) != 1:
            raise ValueError("There must be exactly one option string")
        self.name = option_strings[0]
        if not self.name.startswith("--"):
            raise ValueError(f"{self.name!r} doesn't start with '--'")
        if self.name.startswith("--no-"):
            raise ValueError(f"{self.name!r} starts with '--no-'")

        negated_name = "--no-" + self.name[2:]

        super().__init__(
            option_strings=[self.name, negated_name],
            dest=dest,
            nargs=0,  # the option never consumes a value
            default=default,
            type=type,
            choices=choices,
            required=required,
            help=help,
            metavar=metavar,
        )

    def __call__(
            self,
            parser: argparse.ArgumentParser,
            namespace: argparse.Namespace,
            values: Union[str, Sequence[Any], None],
            option_string: Optional[str] = None,
    ) -> None:
        """Store ``True`` for the positive option string and ``False`` for the negated one."""
        if not option_string or option_string not in self.option_strings:
            return
        enabled = not option_string.startswith("--no-")
        setattr(namespace, self.dest, enabled)

    def format_usage(self) -> str:
        """Return the combined usage form, e.g. ``--[no-]flag``."""
        return f"--[no-]{self.name[2:]}"
|  |  | ||||||
|  |  | ||||||
def show_value_error(inner: Callable[[str], Any]) -> Callable[[str], Any]:
    """
    Wrap a conversion function so that its ValueErrors become ArgumentTypeErrors.

    Some validation functions (like the from_string in our enums) raise a ValueError,
    while argparse only pretty-prints ArgumentTypeErrors. Other exceptions raised by
    the wrapped function are passed through unchanged.
    """
    def wrapper(input: str) -> Any:
        try:
            converted = inner(input)
        except ValueError as error:
            raise ArgumentTypeError(error)
        return converted

    return wrapper
|  |  | ||||||
|  |  | ||||||
# Parser meant to be passed as a parent to the crawler subcommand parsers, so
# it does not add its own help option.
CRAWLER_PARSER = argparse.ArgumentParser(add_help=False)
CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group(
    description="arguments common to all crawlers",
    title="general crawler arguments",
)

CRAWLER_PARSER_GROUP.add_argument(
    "--redownload", "-r",
    metavar="OPTION",
    type=show_value_error(Redownload.from_string),
    help="when to download a file that's already present locally",
)
CRAWLER_PARSER_GROUP.add_argument(
    "--on-conflict",
    metavar="OPTION",
    type=show_value_error(OnConflict.from_string),
    help="what to do when local and remote files or directories differ",
)
CRAWLER_PARSER_GROUP.add_argument(
    "--transform", "-T",
    metavar="RULE",
    type=str,
    action="append",
    help="add a single transformation rule. Can be specified multiple times",
)
CRAWLER_PARSER_GROUP.add_argument(
    "--tasks", "-n",
    metavar="N",
    type=int,
    help="maximum number of concurrent tasks (crawling, downloading)",
)
CRAWLER_PARSER_GROUP.add_argument(
    "--downloads", "-N",
    metavar="N",
    type=int,
    help="maximum number of tasks that may download data at the same time",
)
CRAWLER_PARSER_GROUP.add_argument(
    "--task-delay", "-d",
    metavar="SECONDS",
    type=float,
    help="time the crawler should wait between subsequent tasks",
)
CRAWLER_PARSER_GROUP.add_argument(
    "--windows-paths",
    action=BooleanOptionalAction,
    help="whether to repair invalid paths on windows",
)
|  |  | ||||||
|  |  | ||||||
def load_crawler(
        args: argparse.Namespace,
        section: configparser.SectionProxy,
) -> None:
    """Copy the general crawler options from *args* into *section*.

    Options whose value is ``None`` are skipped, leaving the corresponding
    key of *section* untouched.
    """
    redownload = args.redownload
    if redownload is not None:
        section["redownload"] = redownload.value

    on_conflict = args.on_conflict
    if on_conflict is not None:
        section["on_conflict"] = on_conflict.value

    rules = args.transform
    if rules is not None:
        # The value starts with a newline, so every rule sits on its own line.
        section["transform"] = "\n" + "\n".join(rules)

    # Plain numeric options are stored as their string representation.
    for key in ("tasks", "downloads", "task_delay"):
        number = getattr(args, key)
        if number is not None:
            section[key] = str(number)

    windows_paths = args.windows_paths
    if windows_paths is not None:
        section["windows_paths"] = "yes" if windows_paths else "no"
|  |  | ||||||
|  |  | ||||||
# Top-level parser. "command" defaults to None and is overridden by the
# subcommand modules via set_defaults(command=load).
PARSER = argparse.ArgumentParser()
PARSER.set_defaults(command=None)

PARSER.add_argument(
    "--version",
    action="version",
    version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
)
PARSER.add_argument("--config", "-c", type=Path, metavar="PATH", help="custom config file")
PARSER.add_argument(
    "--dump-config",
    action="store_true",
    help="dump current configuration to the default config path and exit",
)
PARSER.add_argument(
    "--dump-config-to",
    metavar="PATH",
    help="dump current configuration to a file and exit."
    " Use '-' as path to print to stdout instead",
)
PARSER.add_argument(
    "--debug-transforms",
    action="store_true",
    help="apply transform rules to files of previous run",
)

# Crawler selection; both options may be repeated.
PARSER.add_argument(
    "--crawler", "-C",
    metavar="NAME",
    type=str,
    action="append",
    help="only execute a single crawler."
    " Can be specified multiple times to execute multiple crawlers",
)
PARSER.add_argument(
    "--skip", "-S",
    metavar="NAME",
    type=str,
    action="append",
    help="don't execute this particular crawler."
    " Can be specified multiple times to skip multiple crawlers",
)

PARSER.add_argument("--working-dir", type=Path, metavar="PATH", help="custom working directory")

# Output settings, each available as --flag / --no-flag.
PARSER.add_argument(
    "--explain",
    action=BooleanOptionalAction,
    help="log and explain in detail what PFERD is doing",
)
PARSER.add_argument(
    "--status",
    action=BooleanOptionalAction,
    help="print status updates while PFERD is crawling",
)
PARSER.add_argument(
    "--report",
    action=BooleanOptionalAction,
    help="print a report of all local changes before exiting",
)
|  | PARSER.add_argument( | ||||||
|  |     "--share-cookies", | ||||||
|  |     action=BooleanOptionalAction, | ||||||
|  |     help="whether crawlers should share cookies where applicable" | ||||||
|  | ) | ||||||
|  | PARSER.add_argument( | ||||||
|  |     "--show-not-deleted", | ||||||
|  |     action=BooleanOptionalAction, | ||||||
|  |     help="print messages in status and report when PFERD did not delete a local only file" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  |  | ||||||
def load_default_section(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """
    Copy the global command line arguments into the parser's default section.

    Arguments whose parsed value is None are skipped. The working directory
    is stored via str(), the boolean switches as "yes" or "no".
    """

    defaults = parser[parser.default_section]

    if args.working_dir is not None:
        defaults["working_dir"] = str(args.working_dir)

    # Attribute names on args double as the config keys
    for switch in ("explain", "status", "report", "share_cookies", "show_not_deleted"):
        chosen = getattr(args, switch)
        if chosen is None:
            continue
        defaults[switch] = "yes" if chosen else "no"
|  |  | ||||||
|  |  | ||||||
# Subparser container listed under the "crawlers" heading in the help output;
# presumably each crawler type registers its own subcommand on it elsewhere.
SUBPARSERS = PARSER.add_subparsers(title="crawlers")
							
								
								
									
										193
									
								
								PFERD/config.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										193
									
								
								PFERD/config.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,193 @@ | |||||||
|  | import asyncio | ||||||
|  | import os | ||||||
|  | import sys | ||||||
|  | from configparser import ConfigParser, SectionProxy | ||||||
|  | from pathlib import Path | ||||||
|  | from typing import Any, List, NoReturn, Optional, Tuple | ||||||
|  |  | ||||||
|  | from rich.markup import escape | ||||||
|  |  | ||||||
|  | from .logging import log | ||||||
|  | from .utils import fmt_real_path, prompt_yes_no | ||||||
|  |  | ||||||
|  |  | ||||||
class ConfigLoadError(Exception):
    """
    Raised when the config could not be loaded from a file.

    The exception message only names the file. The cause is available
    separately through the reason attribute.
    """

    def __init__(self, path: Path, reason: str):
        message = f"Failed to load config from {fmt_real_path(path)}"
        super().__init__(message)

        self.path, self.reason = path, reason
|  |  | ||||||
|  |  | ||||||
class ConfigOptionError(Exception):
    """
    Raised when a config option is missing or holds an invalid value.

    Besides the formatted message, the section name, the key and the
    description are kept as attributes.
    """

    def __init__(self, section: str, key: str, desc: str):
        message = f"Section {section!r}, key {key!r}: {desc}"
        super().__init__(message)

        self.section, self.key, self.desc = section, key, desc
|  |  | ||||||
|  |  | ||||||
class ConfigDumpError(Exception):
    """
    Raised when the config could not be written to a file.

    The exception message only names the file. The cause is available
    separately through the reason attribute.
    """

    def __init__(self, path: Path, reason: str):
        message = f"Failed to dump config to {fmt_real_path(path)}"
        super().__init__(message)

        self.path, self.reason = path, reason
|  |  | ||||||
|  |  | ||||||
class Section:
    """
    Common base of the crawler and auth section classes.

    Wraps a config section (available as self.s) and offers helpers that
    report problems with individual options as ConfigOptionError.
    """

    def __init__(self, section: SectionProxy):
        self.s = section

    def error(self, key: str, desc: str) -> NoReturn:
        """
        Raise a ConfigOptionError for the given key of this section.
        """

        section_name = self.s.name
        raise ConfigOptionError(section_name, key, desc)

    def invalid_value(
            self,
            key: str,
            value: Any,
            reason: Optional[str],
    ) -> NoReturn:
        """
        Report an invalid value, optionally followed by the reason.
        """

        desc = (
            f"Invalid value {value!r}"
            if reason is None
            else f"Invalid value {value!r}: {reason}"
        )
        self.error(key, desc)

    def missing_value(self, key: str) -> NoReturn:
        """
        Report that a required option is absent.
        """

        self.error(key, "Missing value")
|  |  | ||||||
|  |  | ||||||
class DefaultSection(Section):
    """
    Typed accessors for the global options of the default section.

    Every accessor falls back to a built-in default when the option is not
    set in the config.
    """

    def _switch(self, key: str, default: bool) -> bool:
        # Boolean lookup shared by the on/off options below
        return self.s.getboolean(key, fallback=default)

    def working_dir(self) -> Path:
        # TODO Change to working dir instead of manually prepending it to paths
        configured = self.s.get("working_dir", ".")
        return Path(configured).expanduser()

    def explain(self) -> bool:
        return self._switch("explain", False)

    def status(self) -> bool:
        return self._switch("status", True)

    def report(self) -> bool:
        return self._switch("report", True)

    def show_not_deleted(self) -> bool:
        return self._switch("show_not_deleted", True)

    def share_cookies(self) -> bool:
        return self._switch("share_cookies", True)
|  |  | ||||||
|  |  | ||||||
class Config:
    """
    Wrapper around the ConfigParser that holds the loaded configuration.

    Provides loading from and dumping to files as well as access to the
    default section and to the crawl and auth sections.
    """

    @staticmethod
    def _default_path() -> Path:
        # Config location depends on the platform reported by os.name
        per_platform = {
            "posix": "~/.config/PFERD/pferd.cfg",
            "nt": "~/AppData/Roaming/PFERD/pferd.cfg",
        }
        location = per_platform.get(os.name, "~/.pferd.cfg")
        return Path(location).expanduser()

    def __init__(self, parser: ConfigParser):
        self._parser = parser
        self._default_section = DefaultSection(parser[parser.default_section])

    @property
    def default_section(self) -> DefaultSection:
        return self._default_section

    @staticmethod
    def load_parser(parser: ConfigParser, path: Optional[Path] = None) -> None:
        """
        Read the config file at path (or at the default path) into parser.

        May throw a ConfigLoadError.
        """

        if not path:
            log.explain("Using default path")
            path = Config._default_path()
        else:
            log.explain("Path specified on CLI")
        log.explain(f"Loading {fmt_real_path(path)}")

        # parser.read would silently skip a missing file, so the file is
        # opened explicitly and handed to parser.read_file instead.
        try:
            with open(path, encoding="utf-8") as config_file:
                parser.read_file(config_file, source=str(path))
        except FileNotFoundError:
            raise ConfigLoadError(path, "File does not exist")
        except IsADirectoryError:
            raise ConfigLoadError(path, "That's a directory, not a file")
        except PermissionError:
            raise ConfigLoadError(path, "Insufficient permissions")
        except UnicodeDecodeError:
            raise ConfigLoadError(path, "File is not encoded using UTF-8")

    def _write_parser(self, path: Path, mode: str) -> None:
        # Serialize the current configuration to path using the given open mode
        with open(path, mode, encoding="utf-8") as config_file:
            self._parser.write(config_file)

    def dump(self, path: Optional[Path] = None) -> None:
        """
        Write the configuration to path (or to the default path), asking for
        confirmation before an existing file is overwritten.

        May throw a ConfigDumpError.
        """

        if not path:
            log.explain("Using default path")
            path = self._default_path()
        else:
            log.explain("Using custom path")

        log.explain(f"Dumping to {fmt_real_path(path)}")
        log.print(f"[bold bright_cyan]Dumping[/] to {escape(fmt_real_path(path))}")

        try:
            path.parent.mkdir(parents=True, exist_ok=True)
        except PermissionError:
            raise ConfigDumpError(path, "Could not create parent directory")

        try:
            # Never overwrite an existing file without asking first.
            try:
                # Mode "x" creates the file exclusively and fails if it
                # already exists.
                self._write_parser(path, "x")
            except FileExistsError:
                print("That file already exists.")
                overwrite = asyncio.run(prompt_yes_no("Overwrite it?", default=False))
                if not overwrite:
                    raise ConfigDumpError(path, "File already exists")
                self._write_parser(path, "w")
        except IsADirectoryError:
            raise ConfigDumpError(path, "That's a directory, not a file")
        except PermissionError:
            raise ConfigDumpError(path, "Insufficient permissions")

    def dump_to_stdout(self) -> None:
        self._parser.write(sys.stdout)

    def _sections_with_prefix(self, prefix: str) -> List[Tuple[str, SectionProxy]]:
        # All (name, section) pairs whose section name starts with prefix
        return [
            (name, proxy)
            for name, proxy in self._parser.items()
            if name.startswith(prefix)
        ]

    def crawl_sections(self) -> List[Tuple[str, SectionProxy]]:
        return self._sections_with_prefix("crawl:")

    def auth_sections(self) -> List[Tuple[str, SectionProxy]]:
        return self._sections_with_prefix("auth:")
							
								
								
									
										27
									
								
								PFERD/crawl/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								PFERD/crawl/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | |||||||
|  | from configparser import SectionProxy | ||||||
|  | from typing import Callable, Dict | ||||||
|  |  | ||||||
|  | from ..auth import Authenticator | ||||||
|  | from ..config import Config | ||||||
|  | from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401 | ||||||
|  | from .ilias import IliasWebCrawler, IliasWebCrawlerSection, KitIliasWebCrawler, KitIliasWebCrawlerSection | ||||||
|  | from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection | ||||||
|  | from .local_crawler import LocalCrawler, LocalCrawlerSection | ||||||
|  |  | ||||||
# Signature shared by every factory stored in CRAWLERS.
CrawlerConstructor = Callable[[
    str,                       # Name (without the "crawl:" prefix)
    SectionProxy,              # Crawler's section of global config
    Config,                    # Global config
    Dict[str, Authenticator],  # Loaded authenticators by name
], Crawler]

# Registry of crawler factories by type name. Each factory wraps the raw
# section in the matching section class before constructing the crawler.
# The "local" and "kit-ipd" factories ignore the authenticators argument.
# Presumably the key is matched against a crawl section's "type" option;
# confirm against the caller.
CRAWLERS: Dict[str, CrawlerConstructor] = {
    "local": lambda n, s, c, a:
        LocalCrawler(n, LocalCrawlerSection(s), c),
    "ilias-web": lambda n, s, c, a:
        IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a),
    "kit-ilias-web": lambda n, s, c, a:
        KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
    "kit-ipd": lambda n, s, c, a:
        KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
}
							
								
								
									
										409
									
								
								PFERD/crawl/crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										409
									
								
								PFERD/crawl/crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,409 @@ | |||||||
|  | import asyncio | ||||||
|  | import os | ||||||
|  | from abc import ABC, abstractmethod | ||||||
|  | from collections.abc import Awaitable, Coroutine | ||||||
|  | from datetime import datetime | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar | ||||||
|  |  | ||||||
|  | from ..auth import Authenticator | ||||||
|  | from ..config import Config, Section | ||||||
|  | from ..deduplicator import Deduplicator | ||||||
|  | from ..limiter import Limiter | ||||||
|  | from ..logging import ProgressBar, log | ||||||
|  | from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload | ||||||
|  | from ..report import MarkConflictError, MarkDuplicateError, Report | ||||||
|  | from ..transformer import Transformer | ||||||
|  | from ..utils import ReusableAsyncContextManager, fmt_path | ||||||
|  |  | ||||||
|  |  | ||||||
class CrawlWarning(Exception):
    """
    A problem that @noncritical and @anoncritical log as a warning instead
    of letting it propagate.
    """
|  |  | ||||||
|  |  | ||||||
class CrawlError(Exception):
    """
    A problem that is not downgraded to a warning: @noncritical and
    @anoncritical record it as an error and re-raise it.
    """
|  |  | ||||||
|  |  | ||||||
Wrapped = TypeVar("Wrapped", bound=Callable[..., None])


def noncritical(f: Wrapped) -> Wrapped:
    """
    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    CrawlWarning, OutputDirError, MarkDuplicateError and MarkConflictError are
    added to the crawler's report as a warning, logged and then swallowed.
    Any other exception is added to the report as an error and re-raised.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    The returned wrapper keeps the name, docstring and other metadata of the
    decorated function (via functools.wraps).

    Warning: Must only be applied to member functions of the Crawler class!
    Calling the wrapper without a Crawler as first argument raises a
    RuntimeError.
    """

    # Local import so this block does not depend on the module's import list
    from functools import wraps

    @wraps(f)
    def wrapper(*args: Any, **kwargs: Any) -> None:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@noncritical must only applied to Crawler methods")

        crawler = args[0]

        try:
            f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            crawler.report.add_warning(str(e))
            log.warn(str(e))
            crawler.error_free = False
        except Exception as e:
            # Unexpected errors are recorded but still propagate to the caller
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

    return wrapper  # type: ignore
|  |  | ||||||
|  |  | ||||||
AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]])


def anoncritical(f: AWrapped) -> AWrapped:
    """
    An async version of @noncritical.

    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    CrawlWarning, OutputDirError, MarkDuplicateError and MarkConflictError are
    logged, added to the crawler's report as a warning and then swallowed; the
    wrapper returns None in that case. Any other exception is added to the
    report as an error and re-raised. Without an exception, the awaited result
    of the decorated function is returned.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    The returned wrapper keeps the name, docstring and other metadata of the
    decorated function (via functools.wraps).

    Warning: Must only be applied to member functions of the Crawler class!
    Calling the wrapper without a Crawler as first argument raises a
    RuntimeError.
    """

    # Local import so this block does not depend on the module's import list
    from functools import wraps

    @wraps(f)
    async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@anoncritical must only applied to Crawler methods")

        crawler = args[0]

        try:
            return await f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            log.warn(str(e))
            crawler.error_free = False
            crawler.report.add_warning(str(e))
        except Exception as e:
            # Unexpected errors are recorded but still propagate to the caller
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

        return None

    return wrapper  # type: ignore
|  |  | ||||||
|  |  | ||||||
class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
    """
    Async context manager handed out for a path that should be crawled.

    Entering it enters the limiter's limit_crawl() context and yields the
    crawl progress bar for the path.
    """

    def __init__(self, limiter: Limiter, path: PurePath):
        super().__init__()

        self._path = path
        self._limiter = limiter

    @property
    def path(self) -> PurePath:
        return self._path

    async def _on_aenter(self) -> ProgressBar:
        stack = self._stack

        # The "Crawled" status callback is registered before anything else
        # is entered. Assuming the stack unwinds in reverse order (as
        # contextlib's exit stacks do), it runs last on exit.
        stack.callback(lambda: log.status("[bold cyan]", "Crawled", fmt_path(self._path)))
        await stack.enter_async_context(self._limiter.limit_crawl())

        return stack.enter_context(
            log.crawl_bar("[bold bright_cyan]", "Crawling", fmt_path(self._path))
        )
|  |  | ||||||
|  |  | ||||||
class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
    """
    Async context manager handed out for a path that should be downloaded.

    Entering it enters the limiter's limit_download() context and the file
    sink token, then yields the download progress bar together with the
    sink obtained from the token.
    """

    def __init__(self, limiter: Limiter, fs_token: FileSinkToken, path: PurePath):
        super().__init__()

        self._path = path
        self._fs_token = fs_token
        self._limiter = limiter

    @property
    def path(self) -> PurePath:
        return self._path

    async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
        stack = self._stack

        await stack.enter_async_context(self._limiter.limit_download())
        file_sink = await stack.enter_async_context(self._fs_token)

        # The "Downloaded ..." message is printed in the output dir, not here
        progress = stack.enter_context(
            log.download_bar("[bold bright_cyan]", "Downloading", fmt_path(self._path))
        )

        return progress, file_sink
|  |  | ||||||
|  |  | ||||||
class CrawlerSection(Section):
    """
    Validated accessors for the options of a crawler's config section.

    Missing required options and values outside their allowed range are
    reported through the missing_value and invalid_value helpers of Section.
    """

    def type(self) -> str:
        kind = self.s.get("type")
        if kind is not None:
            return kind
        self.missing_value("type")

    def skip(self) -> bool:
        return self.s.getboolean("skip", fallback=False)

    def output_dir(self, name: str) -> Path:
        # Without an explicit output_dir, the section name minus its "crawl:"
        # prefix is used as directory.
        fallback_dir = name.removeprefix("crawl:")
        configured = self.s.get("output_dir", fallback_dir)
        return Path(configured).expanduser()

    def redownload(self) -> Redownload:
        raw = self.s.get("redownload", "never-smart")
        try:
            return Redownload.from_string(raw)
        except ValueError as err:
            reason = str(err).capitalize()
            self.invalid_value("redownload", raw, reason)

    def on_conflict(self) -> OnConflict:
        raw = self.s.get("on_conflict", "prompt")
        try:
            return OnConflict.from_string(raw)
        except ValueError as err:
            reason = str(err).capitalize()
            self.invalid_value("on_conflict", raw, reason)

    def transform(self) -> str:
        return self.s.get("transform", "")

    def tasks(self) -> int:
        limit = self.s.getint("tasks", fallback=1)
        if limit <= 0:
            self.invalid_value("tasks", limit, "Must be greater than 0")
        return limit

    def downloads(self) -> int:
        # The task limit is validated first; it is also the default and the
        # upper bound for the download limit.
        task_limit = self.tasks()
        limit = self.s.getint("downloads", fallback=None)
        if limit is None:
            return task_limit
        if limit <= 0:
            self.invalid_value("downloads", limit, "Must be greater than 0")
        if limit > task_limit:
            self.invalid_value("downloads", limit, "Must not be greater than tasks")
        return limit

    def task_delay(self) -> float:
        delay = self.s.getfloat("task_delay", fallback=0.0)
        if delay < 0:
            self.invalid_value("task_delay", delay, "Must not be negative")
        return delay

    def windows_paths(self) -> bool:
        # Defaults to True only when running on Windows
        return self.s.getboolean("windows_paths", fallback=(os.name == "nt"))

    def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
        auth_name = self.s.get("auth")
        if auth_name is None:
            self.missing_value("auth")
        chosen = authenticators.get(auth_name)
        if chosen is None:
            self.invalid_value("auth", auth_name, "No such auth section exists")
        return chosen
|  |  | ||||||
|  |  | ||||||
|  | class Crawler(ABC): | ||||||
|  |     def __init__( | ||||||
|  |             self, | ||||||
|  |             name: str, | ||||||
|  |             section: CrawlerSection, | ||||||
|  |             config: Config, | ||||||
|  |     ) -> None: | ||||||
|  |         """ | ||||||
|  |         Initialize a crawler from its name and its section in the config file. | ||||||
|  |  | ||||||
|  |         If you are writing your own constructor for your own crawler, make sure | ||||||
|  |         to call this constructor first (via super().__init__). | ||||||
|  |  | ||||||
|  |         May throw a CrawlerLoadException. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.name = name | ||||||
|  |         self.error_free = True | ||||||
|  |  | ||||||
|  |         self._limiter = Limiter( | ||||||
|  |             task_limit=section.tasks(), | ||||||
|  |             download_limit=section.downloads(), | ||||||
|  |             task_delay=section.task_delay(), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         self._deduplicator = Deduplicator(section.windows_paths()) | ||||||
|  |         self._transformer = Transformer(section.transform()) | ||||||
|  |  | ||||||
|  |         self._output_dir = OutputDirectory( | ||||||
|  |             config.default_section.working_dir() / section.output_dir(name), | ||||||
|  |             section.redownload(), | ||||||
|  |             section.on_conflict(), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def report(self) -> Report: | ||||||
|  |         return self._output_dir.report | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def prev_report(self) -> Optional[Report]: | ||||||
|  |         return self._output_dir.prev_report | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def output_dir(self) -> OutputDirectory: | ||||||
|  |         return self._output_dir | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]: | ||||||
|  |         """ | ||||||
|  |         Similar to asyncio.gather. However, in the case of an exception, all | ||||||
|  |         still running tasks are cancelled and the exception is rethrown. | ||||||
|  |  | ||||||
|  |         This should always be preferred over asyncio.gather in crawler code so | ||||||
|  |         that an exception like CrawlError may actually stop the crawler. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         tasks = [asyncio.ensure_future(aw) for aw in awaitables] | ||||||
|  |         result = asyncio.gather(*tasks) | ||||||
|  |         try: | ||||||
|  |             return await result | ||||||
|  |         except:  # noqa: E722 | ||||||
|  |             for task in tasks: | ||||||
|  |                 task.cancel() | ||||||
|  |             raise | ||||||
|  |  | ||||||
|  |     async def crawl(self, path: PurePath) -> Optional[CrawlToken]: | ||||||
|  |         log.explain_topic(f"Decision: Crawl {fmt_path(path)}") | ||||||
|  |         path = self._deduplicator.mark(path) | ||||||
|  |         self._output_dir.report.found(path) | ||||||
|  |  | ||||||
|  |         if self._transformer.transform(path) is None: | ||||||
|  |             log.explain("Answer: No") | ||||||
|  |             log.status("[bold bright_black]", "Ignored", fmt_path(path)) | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         log.explain("Answer: Yes") | ||||||
|  |         return CrawlToken(self._limiter, path) | ||||||
|  |  | ||||||
|  |     def should_try_download( | ||||||
|  |             self, | ||||||
|  |             path: PurePath, | ||||||
|  |             *, | ||||||
|  |             etag_differs: Optional[bool] = None, | ||||||
|  |             mtime: Optional[datetime] = None, | ||||||
|  |             redownload: Optional[Redownload] = None, | ||||||
|  |             on_conflict: Optional[OnConflict] = None, | ||||||
|  |     ) -> bool: | ||||||
|  |         log.explain_topic(f"Decision: Should Download {fmt_path(path)}") | ||||||
|  |  | ||||||
|  |         if self._transformer.transform(path) is None: | ||||||
|  |             log.explain("Answer: No (ignored)") | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |         should_download = self._output_dir.should_try_download( | ||||||
|  |             path, | ||||||
|  |             etag_differs=etag_differs, | ||||||
|  |             mtime=mtime, | ||||||
|  |             redownload=redownload, | ||||||
|  |             on_conflict=on_conflict | ||||||
|  |         ) | ||||||
|  |         if should_download: | ||||||
|  |             log.explain("Answer: Yes") | ||||||
|  |             return True | ||||||
|  |         else: | ||||||
|  |             log.explain("Answer: No") | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |     async def download( | ||||||
|  |             self, | ||||||
|  |             path: PurePath, | ||||||
|  |             *, | ||||||
|  |             etag_differs: Optional[bool] = None, | ||||||
|  |             mtime: Optional[datetime] = None, | ||||||
|  |             redownload: Optional[Redownload] = None, | ||||||
|  |             on_conflict: Optional[OnConflict] = None, | ||||||
|  |     ) -> Optional[DownloadToken]: | ||||||
|  |         log.explain_topic(f"Decision: Download {fmt_path(path)}") | ||||||
|  |         path = self._deduplicator.mark(path) | ||||||
|  |         self._output_dir.report.found(path) | ||||||
|  |  | ||||||
|  |         transformed_path = self._transformer.transform(path) | ||||||
|  |         if transformed_path is None: | ||||||
|  |             log.explain("Answer: No") | ||||||
|  |             log.status("[bold bright_black]", "Ignored", fmt_path(path)) | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         fs_token = await self._output_dir.download( | ||||||
|  |             path, | ||||||
|  |             transformed_path, | ||||||
|  |             etag_differs=etag_differs, | ||||||
|  |             mtime=mtime, | ||||||
|  |             redownload=redownload, | ||||||
|  |             on_conflict=on_conflict | ||||||
|  |         ) | ||||||
|  |         if fs_token is None: | ||||||
|  |             log.explain("Answer: No") | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         log.explain("Answer: Yes") | ||||||
|  |         return DownloadToken(self._limiter, fs_token, path) | ||||||
|  |  | ||||||
|  |     async def _cleanup(self) -> None: | ||||||
|  |         log.explain_topic("Decision: Clean up files") | ||||||
|  |         if self.error_free: | ||||||
|  |             log.explain("No warnings or errors occurred during this run") | ||||||
|  |             log.explain("Answer: Yes") | ||||||
|  |             await self._output_dir.cleanup() | ||||||
|  |         else: | ||||||
|  |             log.explain("Warnings or errors occurred during this run") | ||||||
|  |             log.explain("Answer: No") | ||||||
|  |  | ||||||
|  |     @anoncritical | ||||||
|  |     async def run(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Start the crawling process. Call this function if you want to use a | ||||||
|  |         crawler. | ||||||
|  |  | ||||||
|  |         Prepares the output directory, loads the previous report, runs the | ||||||
|  |         crawler-specific [_run], cleans up and finally stores the new report. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         # Everything below runs inside the progress display context of the logger | ||||||
|  |         with log.show_progress(): | ||||||
|  |             self._output_dir.prepare() | ||||||
|  |             self._output_dir.load_prev_report() | ||||||
|  |             await self._run() | ||||||
|  |             # _cleanup only touches the output directory if the run was error free | ||||||
|  |             await self._cleanup() | ||||||
|  |             self._output_dir.store_report() | ||||||
|  |  | ||||||
|  |     @abstractmethod | ||||||
|  |     async def _run(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Overwrite this function if you are writing a crawler. | ||||||
|  |  | ||||||
|  |         This function must not return before all crawling is complete. To crawl | ||||||
|  |         multiple things concurrently, asyncio.gather can be used. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         pass | ||||||
|  |  | ||||||
|  |     def debug_transforms(self) -> None: | ||||||
|  |         self._output_dir.load_prev_report() | ||||||
|  |  | ||||||
|  |         if not self.prev_report: | ||||||
|  |             log.warn("Couldn't find or load old report") | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         seen: Set[PurePath] = set() | ||||||
|  |         for known in sorted(self.prev_report.found_paths): | ||||||
|  |             looking_at = list(reversed(known.parents)) + [known] | ||||||
|  |             for path in looking_at: | ||||||
|  |                 if path in seen: | ||||||
|  |                     continue | ||||||
|  |  | ||||||
|  |                 log.explain_topic(f"Transforming {fmt_path(path)}") | ||||||
|  |                 self._transformer.transform(path) | ||||||
|  |                 seen.add(path) | ||||||
							
								
								
									
										281
									
								
								PFERD/crawl/http_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										281
									
								
								PFERD/crawl/http_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,281 @@ | |||||||
|  | import asyncio | ||||||
|  | import http.cookies | ||||||
|  | import ssl | ||||||
|  | from datetime import datetime | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import Any, Dict, List, Optional, Tuple, cast | ||||||
|  |  | ||||||
|  | import aiohttp | ||||||
|  | import certifi | ||||||
|  | from aiohttp.client import ClientTimeout | ||||||
|  | from bs4 import Tag | ||||||
|  |  | ||||||
|  | from ..auth import Authenticator | ||||||
|  | from ..config import Config | ||||||
|  | from ..logging import log | ||||||
|  | from ..utils import fmt_real_path | ||||||
|  | from ..version import NAME, VERSION | ||||||
|  | from .crawler import Crawler, CrawlerSection | ||||||
|  |  | ||||||
|  | ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags" | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class HttpCrawlerSection(CrawlerSection): | ||||||
|  |     def http_timeout(self) -> float: | ||||||
|  |         return self.s.getfloat("http_timeout", fallback=30) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class HttpCrawler(Crawler): | ||||||
|  |     COOKIE_FILE = PurePath(".cookies") | ||||||
|  |  | ||||||
|  |     def __init__( | ||||||
|  |             self, | ||||||
|  |             name: str, | ||||||
|  |             section: HttpCrawlerSection, | ||||||
|  |             config: Config, | ||||||
|  |             shared_auth: Optional[Authenticator] = None, | ||||||
|  |     ) -> None: | ||||||
|  |         super().__init__(name, section, config) | ||||||
|  |  | ||||||
|  |         # Incremented after every successful authentication, see [authenticate] | ||||||
|  |         self._authentication_id = 0 | ||||||
|  |         # Held while authenticating and while reading the current auth id | ||||||
|  |         self._authentication_lock = asyncio.Lock() | ||||||
|  |         # Incremented in [_current_auth_id] and logged at the end of [run] | ||||||
|  |         self._request_count = 0 | ||||||
|  |         # Used for the connect and read timeouts of the HTTP session created in [run] | ||||||
|  |         self._http_timeout = section.http_timeout() | ||||||
|  |  | ||||||
|  |         self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) | ||||||
|  |         # Stays None unless [share_cookies] sets up cookie sharing for [shared_auth] | ||||||
|  |         self._shared_cookie_jar_paths: Optional[List[Path]] = None | ||||||
|  |         self._shared_auth = shared_auth | ||||||
|  |  | ||||||
|  |         # NOTE(review): presumably this keeps crawled files from overwriting the cookie | ||||||
|  |         # file - confirm against the output directory implementation | ||||||
|  |         self._output_dir.register_reserved(self.COOKIE_FILE) | ||||||
|  |  | ||||||
|  |     async def _current_auth_id(self) -> int: | ||||||
|  |         """ | ||||||
|  |         Returns the id for the current authentication, i.e. an identifier for the last | ||||||
|  |         successful call to [authenticate]. | ||||||
|  |  | ||||||
|  |         This method must be called before any request that might authenticate is made, so the | ||||||
|  |         HttpCrawler can properly track when [authenticate] can return early and when actual | ||||||
|  |         authentication is necessary. | ||||||
|  |         """ | ||||||
|  |         # We acquire the lock here to ensure we wait for any concurrent authenticate to finish. | ||||||
|  |         # This should reduce the amount of requests we make: If an authentication is in progress | ||||||
|  |         # all future requests wait for authentication to complete. | ||||||
|  |         async with self._authentication_lock: | ||||||
|  |             self._request_count += 1 | ||||||
|  |             return self._authentication_id | ||||||
|  |  | ||||||
|  |     async def authenticate(self, caller_auth_id: int) -> None: | ||||||
|  |         """ | ||||||
|  |         Starts the authentication process. The main work is offloaded to _authenticate, which | ||||||
|  |         you should overwrite in a subclass if needed. This method should *NOT* be overwritten. | ||||||
|  |  | ||||||
|  |         The [caller_auth_id] should be the result of a [_current_auth_id] call made *before* | ||||||
|  |         the request was made. This ensures that authentication is not performed needlessly. | ||||||
|  |         """ | ||||||
|  |         async with self._authentication_lock: | ||||||
|  |             log.explain_topic("Authenticating") | ||||||
|  |             # Another thread successfully called authenticate in-between | ||||||
|  |             # We do not want to perform auth again, so we return here. We can | ||||||
|  |             # assume the other thread suceeded as authenticate will throw an error | ||||||
|  |             # if it failed and aborts the crawl process. | ||||||
|  |             if caller_auth_id != self._authentication_id: | ||||||
|  |                 log.explain( | ||||||
|  |                     "Authentication skipped due to auth id mismatch." | ||||||
|  |                     "A previous authentication beat us to the race." | ||||||
|  |                 ) | ||||||
|  |                 return | ||||||
|  |             log.explain("Calling crawler-specific authenticate") | ||||||
|  |             await self._authenticate() | ||||||
|  |             self._authentication_id += 1 | ||||||
|  |             # Saving the cookies after the first auth ensures we won't need to re-authenticate | ||||||
|  |             # on the next run, should this one be aborted or crash | ||||||
|  |             self._save_cookies() | ||||||
|  |  | ||||||
|  |     async def _authenticate(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Performs authentication. This method must only return normally if authentication suceeded. | ||||||
|  |         In all other cases it must either retry internally or throw a terminal exception. | ||||||
|  |         """ | ||||||
|  |         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation") | ||||||
|  |  | ||||||
|  |     def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None: | ||||||
|  |         if not self._shared_auth: | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         if self._shared_auth in shared: | ||||||
|  |             self._shared_cookie_jar_paths = shared[self._shared_auth] | ||||||
|  |         else: | ||||||
|  |             self._shared_cookie_jar_paths = [] | ||||||
|  |             shared[self._shared_auth] = self._shared_cookie_jar_paths | ||||||
|  |  | ||||||
|  |         self._shared_cookie_jar_paths.append(self._cookie_jar_path) | ||||||
|  |  | ||||||
|  |     def _load_cookies_from_file(self, path: Path) -> None: | ||||||
|  |         jar: Any = http.cookies.SimpleCookie() | ||||||
|  |         with open(path, encoding="utf-8") as f: | ||||||
|  |             for i, line in enumerate(f): | ||||||
|  |                 # Names of headers are case insensitive | ||||||
|  |                 if line[:11].lower() == "set-cookie:": | ||||||
|  |                     jar.load(line[11:]) | ||||||
|  |                 else: | ||||||
|  |                     log.explain(f"Line {i} doesn't start with 'Set-Cookie:', ignoring it") | ||||||
|  |         self._cookie_jar.update_cookies(jar) | ||||||
|  |  | ||||||
|  |     def _save_cookies_to_file(self, path: Path) -> None: | ||||||
|  |         jar: Any = http.cookies.SimpleCookie() | ||||||
|  |         for morsel in self._cookie_jar: | ||||||
|  |             jar[morsel.key] = morsel | ||||||
|  |         with open(path, "w", encoding="utf-8") as f: | ||||||
|  |             f.write(jar.output(sep="\n")) | ||||||
|  |             f.write("\n")  # A trailing newline is just common courtesy | ||||||
|  |  | ||||||
|  |     def _load_cookies(self) -> None: | ||||||
|  |         log.explain_topic("Loading cookies") | ||||||
|  |  | ||||||
|  |         cookie_jar_path: Optional[Path] = None | ||||||
|  |  | ||||||
|  |         if self._shared_cookie_jar_paths is None: | ||||||
|  |             log.explain("Not sharing any cookies") | ||||||
|  |             cookie_jar_path = self._cookie_jar_path | ||||||
|  |         else: | ||||||
|  |             log.explain("Sharing cookies") | ||||||
|  |             max_mtime: Optional[float] = None | ||||||
|  |             for path in self._shared_cookie_jar_paths: | ||||||
|  |                 if not path.is_file(): | ||||||
|  |                     log.explain(f"{fmt_real_path(path)} is not a file") | ||||||
|  |                     continue | ||||||
|  |                 mtime = path.stat().st_mtime | ||||||
|  |                 if max_mtime is None or mtime > max_mtime: | ||||||
|  |                     log.explain(f"{fmt_real_path(path)} has newest mtime so far") | ||||||
|  |                     max_mtime = mtime | ||||||
|  |                     cookie_jar_path = path | ||||||
|  |                 else: | ||||||
|  |                     log.explain(f"{fmt_real_path(path)} has older mtime") | ||||||
|  |  | ||||||
|  |         if cookie_jar_path is None: | ||||||
|  |             log.explain("Couldn't find a suitable cookie file") | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}") | ||||||
|  |         try: | ||||||
|  |             self._load_cookies_from_file(cookie_jar_path) | ||||||
|  |         except Exception as e: | ||||||
|  |             log.explain("Failed to load cookies") | ||||||
|  |             log.explain(str(e)) | ||||||
|  |  | ||||||
|  |     def _save_cookies(self) -> None: | ||||||
|  |         log.explain_topic("Saving cookies") | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}") | ||||||
|  |             self._save_cookies_to_file(self._cookie_jar_path) | ||||||
|  |         except Exception as e: | ||||||
|  |             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}") | ||||||
|  |             log.warn(str(e)) | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     def get_folder_structure_from_heading_hierarchy(file_link: Tag, drop_h1: bool = False) -> PurePath: | ||||||
|  |         """ | ||||||
|  |         Retrieves the hierarchy of headings associated with the give file link and constructs a folder | ||||||
|  |         structure from them. | ||||||
|  |  | ||||||
|  |         <h1> level headings usually only appear once and serve as the page title, so they would introduce | ||||||
|  |         redundant nesting. To avoid this, <h1> headings are ignored via the drop_h1 parameter. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         def find_associated_headings(tag: Tag, level: int) -> PurePath: | ||||||
|  |             if level == 0 or (level == 1 and drop_h1): | ||||||
|  |                 return PurePath() | ||||||
|  |  | ||||||
|  |             level_heading = cast(Optional[Tag], tag.find_previous(name=f"h{level}")) | ||||||
|  |  | ||||||
|  |             if level_heading is None: | ||||||
|  |                 return find_associated_headings(tag, level - 1) | ||||||
|  |  | ||||||
|  |             folder_name = level_heading.get_text().strip() | ||||||
|  |             return find_associated_headings(level_heading, level - 1) / folder_name | ||||||
|  |  | ||||||
|  |         # start at level <h3> because paragraph-level headings are usually too granular for folder names | ||||||
|  |         return find_associated_headings(file_link, 3) | ||||||
|  |  | ||||||
|  |     def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]: | ||||||
|  |         """ | ||||||
|  |         If available, retrieves the entity tag for a given path which was stored in the previous report. | ||||||
|  |         """ | ||||||
|  |         if not self._output_dir.prev_report: | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         etags = self._output_dir.prev_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {} | ||||||
|  |         return etags.get(str(path)) | ||||||
|  |  | ||||||
|  |     def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None: | ||||||
|  |         """ | ||||||
|  |         Adds an entity tag for a given path to the report's custom values. | ||||||
|  |         """ | ||||||
|  |         if not etag: | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         etags = self._output_dir.report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {} | ||||||
|  |         etags[str(path)] = etag | ||||||
|  |         self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags) | ||||||
|  |  | ||||||
|  |     async def _request_resource_version(self, resource_url: str) -> Tuple[Optional[str], Optional[datetime]]: | ||||||
|  |         """ | ||||||
|  |         Requests the ETag and Last-Modified headers of a resource via a HEAD request. | ||||||
|  |         If no entity tag / modification date can be obtained, the according value will be None. | ||||||
|  |         """ | ||||||
|  |         try: | ||||||
|  |             async with self.session.head(resource_url) as resp: | ||||||
|  |                 if resp.status != 200: | ||||||
|  |                     return None, None | ||||||
|  |  | ||||||
|  |                 etag_header = resp.headers.get("ETag") | ||||||
|  |                 last_modified_header = resp.headers.get("Last-Modified") | ||||||
|  |                 last_modified = None | ||||||
|  |  | ||||||
|  |                 if last_modified_header: | ||||||
|  |                     try: | ||||||
|  |                         # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives | ||||||
|  |                         datetime_format = "%a, %d %b %Y %H:%M:%S GMT" | ||||||
|  |                         last_modified = datetime.strptime(last_modified_header, datetime_format) | ||||||
|  |                     except ValueError: | ||||||
|  |                         # last_modified remains None | ||||||
|  |                         pass | ||||||
|  |  | ||||||
|  |                 return etag_header, last_modified | ||||||
|  |         except aiohttp.ClientError: | ||||||
|  |             return None, None | ||||||
|  |  | ||||||
|  |     async def run(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Runs the crawler with an HTTP session available as [self.session]. | ||||||
|  |  | ||||||
|  |         Cookies are loaded before the session is created and saved again once the | ||||||
|  |         crawl has finished. The session only exists for the duration of the crawl. | ||||||
|  |         """ | ||||||
|  |         # Per-run state: the request counter starts over and cookies go into a fresh jar | ||||||
|  |         self._request_count = 0 | ||||||
|  |         self._cookie_jar = aiohttp.CookieJar() | ||||||
|  |         self._load_cookies() | ||||||
|  |  | ||||||
|  |         async with aiohttp.ClientSession( | ||||||
|  |                 headers={"User-Agent": f"{NAME}/{VERSION}"}, | ||||||
|  |                 cookie_jar=self._cookie_jar, | ||||||
|  |                 connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())), | ||||||
|  |                 timeout=ClientTimeout( | ||||||
|  |                     # Total timeout: 15 * 60 seconds, i.e. 15 minutes. | ||||||
|  |                     # NOTE(review): This comment used to say "30 minutes", justified as "enough to | ||||||
|  |                     # transfer a 600 MB file over a 3 Mib/s connection". The value below is only | ||||||
|  |                     # half of that - confirm whether 15 or 30 minutes is intended. | ||||||
|  |                     # Allowing an arbitrary value could be annoying for overnight batch jobs | ||||||
|  |                     total=15 * 60, | ||||||
|  |                     connect=self._http_timeout, | ||||||
|  |                     sock_connect=self._http_timeout, | ||||||
|  |                     sock_read=self._http_timeout, | ||||||
|  |                 ), | ||||||
|  |                 # See https://github.com/aio-libs/aiohttp/issues/6626 | ||||||
|  |                 # Without this aiohttp will mangle the redirect header from Shibboleth, invalidating the | ||||||
|  |                 # passed signature. Shibboleth will not accept the broken signature and authentication will | ||||||
|  |                 # fail. | ||||||
|  |                 requote_redirect_url=False | ||||||
|  |         ) as session: | ||||||
|  |             self.session = session | ||||||
|  |             try: | ||||||
|  |                 await super().run() | ||||||
|  |             finally: | ||||||
|  |                 # Remove the attribute again, the session is closed once the block is left | ||||||
|  |                 del self.session | ||||||
|  |         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}") | ||||||
|  |  | ||||||
|  |         # They are saved in authenticate, but a final save won't hurt | ||||||
|  |         self._save_cookies() | ||||||
							
								
								
									
										9
									
								
								PFERD/crawl/ilias/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								PFERD/crawl/ilias/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,9 @@ | |||||||
|  | from .kit_ilias_web_crawler import (IliasWebCrawler, IliasWebCrawlerSection, KitIliasWebCrawler, | ||||||
|  |                                     KitIliasWebCrawlerSection) | ||||||
|  |  | ||||||
|  | __all__ = [ | ||||||
|  |     "IliasWebCrawler", | ||||||
|  |     "IliasWebCrawlerSection", | ||||||
|  |     "KitIliasWebCrawler", | ||||||
|  |     "KitIliasWebCrawlerSection", | ||||||
|  | ] | ||||||
							
								
								
									
										40
									
								
								PFERD/crawl/ilias/async_helper.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								PFERD/crawl/ilias/async_helper.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,40 @@ | |||||||
|  | import asyncio | ||||||
|  | from typing import Any, Callable, Optional | ||||||
|  |  | ||||||
|  | import aiohttp | ||||||
|  |  | ||||||
|  | from ...logging import log | ||||||
|  | from ..crawler import AWrapped, CrawlError, CrawlWarning | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: | ||||||
|  |     def decorator(f: AWrapped) -> AWrapped: | ||||||
|  |         async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: | ||||||
|  |             last_exception: Optional[BaseException] = None | ||||||
|  |             for round in range(attempts): | ||||||
|  |                 try: | ||||||
|  |                     return await f(*args, **kwargs) | ||||||
|  |                 except aiohttp.ContentTypeError:  # invalid content type | ||||||
|  |                     raise CrawlWarning("ILIAS returned an invalid content type") | ||||||
|  |                 except aiohttp.TooManyRedirects: | ||||||
|  |                     raise CrawlWarning("Got stuck in a redirect loop") | ||||||
|  |                 except aiohttp.ClientPayloadError as e:  # encoding or not enough bytes | ||||||
|  |                     last_exception = e | ||||||
|  |                 except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc. | ||||||
|  |                     last_exception = e | ||||||
|  |                 except asyncio.exceptions.TimeoutError as e:  # explicit http timeouts in HttpCrawler | ||||||
|  |                     last_exception = e | ||||||
|  |                 log.explain_topic(f"Retrying operation {name}. Retries left: {attempts - 1 - round}") | ||||||
|  |                 log.explain(f"Last exception: {last_exception!r}") | ||||||
|  |  | ||||||
|  |             if last_exception: | ||||||
|  |                 message = f"Error in I/O Operation: {last_exception!r}" | ||||||
|  |                 if failure_is_error: | ||||||
|  |                     raise CrawlError(message) from last_exception | ||||||
|  |                 else: | ||||||
|  |                     raise CrawlWarning(message) from last_exception | ||||||
|  |             raise CrawlError("Impossible return in ilias _iorepeat") | ||||||
|  |  | ||||||
|  |         return wrapper  # type: ignore | ||||||
|  |  | ||||||
|  |     return decorator | ||||||
							
								
								
									
										292
									
								
								PFERD/crawl/ilias/file_templates.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										292
									
								
								PFERD/crawl/ilias/file_templates.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,292 @@ | |||||||
|  | from enum import Enum | ||||||
|  | from typing import Optional, cast | ||||||
|  |  | ||||||
|  | import bs4 | ||||||
|  |  | ||||||
|  | from PFERD.utils import soupify | ||||||
|  |  | ||||||
|  | _link_template_plain = "{{link}}" | ||||||
|  | _link_template_fancy = """ | ||||||
|  | <!DOCTYPE html> | ||||||
|  | <html lang="en"> | ||||||
|  |     <head> | ||||||
|  |         <meta charset="UTF-8"> | ||||||
|  |         <title>ILIAS - Link: {{name}}</title> | ||||||
|  |         <meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" /> | ||||||
|  |     </head> | ||||||
|  |  | ||||||
|  |     <style> | ||||||
|  |     * { | ||||||
|  |         box-sizing: border-box; | ||||||
|  |     } | ||||||
|  |     .center-flex { | ||||||
|  |         display: flex; | ||||||
|  |         align-items: center; | ||||||
|  |         justify-content: center; | ||||||
|  |     } | ||||||
|  |     body { | ||||||
|  |         padding: 0; | ||||||
|  |         margin: 0; | ||||||
|  |         background-color: #f0f0f0; | ||||||
|  |         font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif; | ||||||
|  |         height: 100vh; | ||||||
|  |     } | ||||||
|  |     .row { | ||||||
|  |         background-color: white; | ||||||
|  |         min-width: 500px; | ||||||
|  |         max-width: 90vw; | ||||||
|  |         display: flex; | ||||||
|  |         padding: 1em; | ||||||
|  |     } | ||||||
|  |     .logo { | ||||||
|  |         flex: 0 1; | ||||||
|  |         margin-right: 1em; | ||||||
|  |         fill: #009682; | ||||||
|  |     } | ||||||
|  |     .tile { | ||||||
|  |         flex: 1 0; | ||||||
|  |         display: flex; | ||||||
|  |         flex-direction: column; | ||||||
|  |         justify-content: center; | ||||||
|  |     } | ||||||
|  |     .top-row { | ||||||
|  |         padding-bottom: 5px; | ||||||
|  |         font-size: 15px; | ||||||
|  |     } | ||||||
|  |     a { | ||||||
|  |         color: #009682; | ||||||
|  |         text-decoration: none; | ||||||
|  |     } | ||||||
|  |     a:hover { | ||||||
|  |         text-decoration: underline; | ||||||
|  |     } | ||||||
|  |     .bottom-row { | ||||||
|  |         font-size: 13px; | ||||||
|  |     } | ||||||
|  |     .menu-button { | ||||||
|  |         border: 1px solid black; | ||||||
|  |         margin-left: 4em; | ||||||
|  |         width: 25px; | ||||||
|  |         height: 25px; | ||||||
|  |         flex: 0 0 25px; | ||||||
|  |         background-color: #b3e0da; | ||||||
|  |         font-size: 13px; | ||||||
|  |         color: #222; | ||||||
|  |     } | ||||||
|  |     </style> | ||||||
|  |     <body class="center-flex"> | ||||||
|  |         <div class="row"> | ||||||
|  |             <div class="logo center-flex"> | ||||||
|  |                 <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24"> | ||||||
|  |                     <path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 
.225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/> | ||||||
|  |                 </svg> | ||||||
|  |             </div> | ||||||
|  |             <div class="tile"> | ||||||
|  |                 <div class="top-row"> | ||||||
|  |                     <a href="{{link}}">{{name}}</a> | ||||||
|  |                 </div> | ||||||
|  |                 <div class="bottom-row">{{description}}</div> | ||||||
|  |             </div> | ||||||
|  |             <div class="menu-button center-flex"> ⯆ </div> | ||||||
|  |         </div> | ||||||
|  |     </body> | ||||||
|  | </html> | ||||||
|  | """.strip()  # noqa: E501 line too long | ||||||
|  |  | ||||||
|  | _link_template_internet_shortcut = """ | ||||||
|  | [InternetShortcut] | ||||||
|  | URL={{link}} | ||||||
|  | """.strip() | ||||||
|  |  | ||||||
|  | _learning_module_template = """ | ||||||
|  | <!DOCTYPE html> | ||||||
|  | <html lang="en"> | ||||||
|  |     <head> | ||||||
|  |         <meta charset="UTF-8"> | ||||||
|  |         <title>{{name}}</title> | ||||||
|  |     </head> | ||||||
|  |  | ||||||
|  |     <style> | ||||||
|  |     * { | ||||||
|  |         box-sizing: border-box; | ||||||
|  |     } | ||||||
|  |     .center-flex { | ||||||
|  |         display: flex; | ||||||
|  |         align-items: center; | ||||||
|  |         justify-content: center; | ||||||
|  |     } | ||||||
|  |     .nav { | ||||||
|  |         display: flex; | ||||||
|  |         justify-content: space-between; | ||||||
|  |     } | ||||||
|  |     </style> | ||||||
|  |     <body class="center-flex"> | ||||||
|  | {{body}} | ||||||
|  |     </body> | ||||||
|  | </html> | ||||||
|  | """ | ||||||
|  |  | ||||||
# HTML skeleton for a single forum thread.
# Placeholders filled in by forum_thread_template():
#   {{name}}    - thread name, appended to "ILIAS - Forum: " in the <title>
#   {{heading}} - prettified heading markup, first child of <body>
#   {{content}} - prettified post list markup, placed after the heading
# The embedded CSS targets class names/ids such as ilFrmPostContent,
# ilFrmPostRow and ilFrmPostList that are expected to appear in the inserted
# markup. Surrounding whitespace is stripped from the final string.
_forum_thread_template = """
<!DOCTYPE html>
<html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>ILIAS - Forum: {{name}}</title>
        <style>
            * {
                box-sizing: border-box;
            }
            body {
                font-family: 'Open Sans', Verdana, Arial, Helvetica, sans-serif;
                padding: 8px;
            }
            ul, ol, p {
                margin: 1.2em 0;
            }
            p {
                margin-top: 8px;
                margin-bottom: 8px;
            }
            a {
                color: #00876c;
                text-decoration: none;
                cursor: pointer;
            }
            a:hover {
                text-decoration: underline;
            }
            body > p:first-child > span:first-child {
                font-size: 1.6em;
            }
            body > p:first-child > span:first-child ~ span.default {
                display: inline-block;
                font-size: 1.2em;
                padding-bottom: 8px;
            }
            .ilFrmPostContent {
                margin-top: 8px;
                max-width: 64em;
            }
            .ilFrmPostContent > *:first-child {
                margin-top: 0px;
            }
            .ilFrmPostTitle {
                margin-top: 24px;
                color: #00876c;
                font-weight: bold;
            }
            #ilFrmPostList {
                list-style: none;
                padding-left: 0;
            }
            li.ilFrmPostRow {
                padding: 3px 0 3px 3px;
                margin-bottom: 24px;
                border-left: 6px solid #dddddd;
            }
            .ilFrmPostRow > div {
                display: flex;
            }
            .ilFrmPostImage img {
                margin: 0 !important;
                padding: 6px 9px 9px 6px;
            }
            .ilUserIcon {
                width: 115px;
            }
            .small {
                text-decoration: none;
                font-size: 0.75rem;
                color: #6f6f6f;
            }
        </style>
    </head>
    <body>
    {{heading}}
    {{content}}
    </body>
</html>
""".strip()  # noqa: E501 line too long
|  |  | ||||||
|  |  | ||||||
def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str:
    """
    Render a learning-module page as a standalone HTML document.

    The passed ``body`` is modified in place: fullscreen modal elements are
    removed and the top/bottom navigation bars are replaced by a simple bar
    linking to the previous and next page.

    :param body: the page content; mutated by this function
    :param name: page name, used as the document title
    :param prev: link target for the "previous" link, or None/empty for no link
    :param next: link target for the "next" link, or None/empty for no link
    :return: the complete HTML document as a string
    """
    # Local import so this block does not depend on the file's import header.
    import html

    # Seems to be comments, ignore those.
    for elem in body.select(".il-copg-mob-fullscreen-modal"):
        elem.decompose()

    def nav_link(target: Optional[str], selector: str) -> str:
        # A link is only rendered if we have a target AND the original page
        # had a matching navigation element to take the label from.
        original = body.select_one(selector)
        if not target or original is None:
            return "<span></span>"
        label = original.get_text().strip()
        # Both values come from scraped content, so escape them before
        # splicing them into markup that is parsed again below.
        return f'<a href="{html.escape(target, quote=True)}">{html.escape(label)}</a>'

    # The labels must be read before the navigation bars are replaced below.
    left = nav_link(prev, ".ilc_page_lnav_LeftNavigation")
    right = nav_link(next, ".ilc_page_rnav_RightNavigation")

    nav_html = f"""
        <div class="nav">
            {left}
            {right}
        </div>
    """

    for selector in (".ilc_page_tnav_TopNavigation", ".ilc_page_bnav_BottomNavigation"):
        if old_nav := body.select_one(selector):
            # Parse a fresh fragment per replacement; a node can only live in one place.
            old_nav.replace_with(soupify(nav_html.encode()))

    body_str = cast(str, body.prettify())
    # Substitute the (short, escaped) title first and the body last, so that
    # page content is never re-scanned for placeholders.
    return _learning_module_template.replace("{{name}}", html.escape(name)).replace("{{body}}", body_str)
|  |  | ||||||
|  |  | ||||||
def forum_thread_template(name: str, url: str, heading: bs4.Tag, content: bs4.Tag) -> str:
    """
    Render a forum thread as a standalone HTML document.

    If the heading contains a <b> element, it is wrapped (in place) in a link
    pointing to ``url``. The thread name, heading and content are then filled
    into the forum thread template, in that order.
    """
    title = cast(Optional[bs4.Tag], heading.find(name="b"))
    if title:
        title.wrap(bs4.Tag(name="a", attrs={"href": url}))

    rendered = _forum_thread_template.replace("{{name}}", name)
    rendered = rendered.replace("{{heading}}", cast(str, heading.prettify()))
    rendered = rendered.replace("{{content}}", cast(str, content.prettify()))
    return rendered
|  |  | ||||||
|  |  | ||||||
class Links(Enum):
    """How link elements should be written to disk."""

    IGNORE = "ignore"
    PLAINTEXT = "plaintext"
    FANCY = "fancy"
    INTERNET_SHORTCUT = "internet-shortcut"

    def template(self) -> Optional[str]:
        """Return the file template for this link style, or None for IGNORE."""
        if self == Links.FANCY:
            return _link_template_fancy
        elif self == Links.PLAINTEXT:
            return _link_template_plain
        elif self == Links.INTERNET_SHORTCUT:
            return _link_template_internet_shortcut
        elif self == Links.IGNORE:
            return None
        raise ValueError("Missing switch case")

    def extension(self) -> Optional[str]:
        """Return the file extension for this link style, or None for IGNORE."""
        if self == Links.FANCY:
            return ".html"
        elif self == Links.PLAINTEXT:
            return ".txt"
        elif self == Links.INTERNET_SHORTCUT:
            return ".url"
        elif self == Links.IGNORE:
            return None
        raise ValueError("Missing switch case")

    @staticmethod
    def from_string(string: str) -> "Links":
        """
        Parse a link style from its string value.

        :raises ValueError: if the string is not the value of any member; the
            message lists the accepted values
        """
        try:
            return Links(string)
        except ValueError:
            # Build the list from the enum itself so the message cannot drift
            # out of sync with the accepted values again.
            allowed = ", ".join(repr(link.value) for link in Links)
            raise ValueError(f"must be one of {allowed}") from None
							
								
								
									
										108
									
								
								PFERD/crawl/ilias/ilias_html_cleaner.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										108
									
								
								PFERD/crawl/ilias/ilias_html_cleaner.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,108 @@ | |||||||
|  | from typing import cast | ||||||
|  |  | ||||||
|  | from bs4 import BeautifulSoup, Comment, Tag | ||||||
|  |  | ||||||
# CSS that insert_base_markup() places in a <style> element in the document
# head. It styles a few ILIAS content classes (ilc_*) as well as the helper
# classes and element names that clean() introduces (accordion-head, h3,
# article).
_STYLE_TAG_CONTENT = """
    .ilc_text_block_Information {
      background-color: #f5f7fa;
    }
    div.ilc_text_block_Standard {
      margin-bottom: 10px;
      margin-top: 10px;
    }
    span.ilc_text_inline_Strong {
      font-weight: bold;
    }

    .row-flex {
      display: flex;
    }
    .row-flex-wrap {
      flex-wrap: wrap;
    }

    .accordion-head {
      background-color: #f5f7fa;
      padding: 0.5rem 0;
    }

    h3 {
      margin-top: 0.5rem;
      margin-bottom: 1rem;
    }

    br.visible-break {
      margin-bottom: 1rem;
    }

    article {
      margin: 0.5rem 0;
    }

    img {
        background-color: white;
    }

    body {
      padding: 1em;
      grid-template-columns: 1fr min(60rem, 90%) 1fr;
      line-height: 1.2;
    }
"""
|  |  | ||||||
# CSS classes whose elements clean() renames to <article>.
_ARTICLE_WORTHY_CLASSES = [
    "ilc_text_block_Information",
    "ilc_section_Attention",
    "ilc_section_Link",
]
|  |  | ||||||
|  |  | ||||||
def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Prepend a <head> element to the soup and return the same soup.

    The head receives, in order: a UTF-8 charset <meta>, a <link> to the
    simple.css stylesheet and a <style> element holding the compatibility
    rules from _STYLE_TAG_CONTENT.
    """
    head_tag = soup.new_tag("head")
    soup.insert(0, head_tag)

    # Force UTF-8 encoding
    charset_meta = soup.new_tag("meta", charset="utf-8")

    # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
    stylesheet_link = soup.new_tag("link", rel="stylesheet", href="https://cdn.simplecss.org/simple.css")

    # Basic style tags for compat
    compat_style: Tag = soup.new_tag("style")
    compat_style.append(_STYLE_TAG_CONTENT)

    for child in (charset_meta, stylesheet_link, compat_style):
        head_tag.append(child)

    return soup
|  |  | ||||||
|  |  | ||||||
def clean(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Simplify ILIAS page markup in place and return the same soup.

    Steps, in order:
    - elements carrying one of _ARTICLE_WORTHY_CLASSES become <article>
    - existing <h3> become <div>, then <h1> become <h3> (the order matters,
      otherwise the freshly demoted headings would be turned into divs)
    - accordion captions become <h3 class="... accordion-head">
    - standard paragraphs that are empty or hold only a comment are removed
    - figures wrapping highlighted videos are removed
    - separator sections get an <hr> prepended
    """
    for block in cast(list[Tag], soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES)):
        block.name = "article"

    for block in cast(list[Tag], soup.find_all("h3")):
        block.name = "div"

    for block in cast(list[Tag], soup.find_all("h1")):
        block.name = "h3"

    for block in cast(list[Tag], soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap")):
        block.name = "h3"
        block["class"] += ["accordion-head"]  # type: ignore

    for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"):
        children = list(dummy.children)
        # Remove paragraphs with no content at all, or whose only child is a
        # comment. Checking emptiness first avoids indexing into an empty
        # list, and the isinstance check must look at the node itself (not at
        # its type object) to ever match.
        if not children or (len(children) == 1 and isinstance(children[0], Comment)):
            dummy.decompose()

    # Delete video figures, as they can not be internalized anyway
    for video in soup.select(".ilc_media_cont_MediaContainerHighlighted .ilPageVideo"):
        if figure := video.find_parent("figure"):
            figure.decompose()

    for hrule_imposter in cast(list[Tag], soup.find_all(class_="ilc_section_Separator")):
        hrule_imposter.insert(0, soup.new_tag("hr"))

    return soup
							
								
								
									
										1061
									
								
								PFERD/crawl/ilias/ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1061
									
								
								PFERD/crawl/ilias/ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										1609
									
								
								PFERD/crawl/ilias/kit_ilias_html.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1609
									
								
								PFERD/crawl/ilias/kit_ilias_html.py
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										37
									
								
								PFERD/crawl/ilias/kit_ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								PFERD/crawl/ilias/kit_ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,37 @@ | |||||||
|  | from typing import Dict, Literal | ||||||
|  |  | ||||||
|  | from ...auth import Authenticator | ||||||
|  | from ...config import Config | ||||||
|  | from .ilias_web_crawler import IliasWebCrawler, IliasWebCrawlerSection | ||||||
|  | from .shibboleth_login import ShibbolethLogin | ||||||
|  |  | ||||||
# Base URL of the KIT ILIAS instance; used both as the section's base URL and
# as the ILIAS URL handed to the Shibboleth login helper.
_ILIAS_URL = "https://ilias.studium.kit.edu"
|  |  | ||||||
|  |  | ||||||
class KitShibbolethBackgroundLoginSuccessful:
    """
    Empty marker type without attributes or behaviour.

    NOTE(review): nothing in this module references it; presumably it is kept
    for other modules that import it - confirm before removing.
    """
    pass
|  |  | ||||||
|  |  | ||||||
class KitIliasWebCrawlerSection(IliasWebCrawlerSection):
    """Crawler config section with the KIT-specific values hard-coded."""

    def base_url(self) -> str:
        """Return the fixed KIT ILIAS base URL."""
        return _ILIAS_URL

    def login(self) -> Literal["shibboleth"]:
        """Return the login method, which is always "shibboleth" here."""
        return "shibboleth"
|  |  | ||||||
|  |  | ||||||
class KitIliasWebCrawler(IliasWebCrawler):
    """ILIAS web crawler bound to the KIT ILIAS instance and Shibboleth login."""

    def __init__(
        self,
        name: str,
        section: KitIliasWebCrawlerSection,
        config: Config,
        authenticators: Dict[str, Authenticator],
    ):
        """
        Initialise the base crawler, then set up the Shibboleth login helper.

        All arguments are forwarded unchanged to IliasWebCrawler.__init__.
        """
        super().__init__(name, section, config, authenticators)

        # presumably self._auth is set by the base class constructor - verify
        # against IliasWebCrawler. The TFA authenticator is whatever the
        # section resolves from the configured authenticators.
        self._shibboleth_login = ShibbolethLogin(
            _ILIAS_URL,
            self._auth,
            section.tfa_auth(authenticators),
        )
							
								
								
									
										129
									
								
								PFERD/crawl/ilias/shibboleth_login.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										129
									
								
								PFERD/crawl/ilias/shibboleth_login.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,129 @@ | |||||||
|  | from typing import Any, Optional, cast | ||||||
|  |  | ||||||
|  | import aiohttp | ||||||
|  | import yarl | ||||||
|  | from bs4 import BeautifulSoup, Tag | ||||||
|  |  | ||||||
|  | from ...auth import Authenticator, TfaAuthenticator | ||||||
|  | from ...logging import log | ||||||
|  | from ...utils import soupify | ||||||
|  | from ..crawler import CrawlError | ||||||
|  |  | ||||||
|  |  | ||||||
class ShibbolethLogin:
    """
    Login via shibboleth system.
    """

    def __init__(
        self, ilias_url: str, authenticator: Authenticator, tfa_authenticator: Optional[Authenticator]
    ) -> None:
        """
        :param ilias_url: base URL of the ILIAS instance (no trailing slash)
        :param authenticator: provides the username/password credentials
        :param tfa_authenticator: provides the two-factor token; if None, a
            TfaAuthenticator is created on demand when a token is required
        """
        self._ilias_url = ilias_url
        self._auth = authenticator
        self._tfa_auth = tfa_authenticator

    async def login(self, sess: aiohttp.ClientSession) -> None:
        """
        Performs the ILIAS Shibboleth authentication dance and saves the login
        cookies it receives.

        This function should only be called whenever it is detected that you're
        not logged in. The cookies obtained should be good for a few minutes,
        maybe even an hour or two.

        :raises CrawlError: if the identity provider asks to review the
            released attributes (entitlements), which needs a browser
        """

        # Equivalent: Click on "Mit KIT-Account anmelden" button in
        # https://ilias.studium.kit.edu/login.php
        url = f"{self._ilias_url}/shib_login.php"
        async with sess.get(url) as response:
            shib_url = response.url
            if str(shib_url).startswith(self._ilias_url):
                log.explain(
                    "ILIAS recognized our shib token and logged us in in the background, returning"
                )
                return
            soup: BeautifulSoup = soupify(await response.read())

        # Attempt to login using credentials, if necessary
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
            form = cast(Tag, soup.find("form", {"method": "post"}))
            action = cast(str, form["action"])

            # Equivalent: Enter credentials in
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
            url = str(shib_url.origin()) + action
            username, password = await self._auth.credentials()
            data = {
                "_eventId_proceed": "",
                "j_username": username,
                "j_password": password,
                "fudis_web_authn_assertion_input": "",
            }
            if csrf_token_input := form.find("input", {"name": "csrf_token"}):
                data["csrf_token"] = csrf_token_input["value"]  # type: ignore
            soup = await _post(sess, url, data)

            if soup.find(id="attributeRelease"):
                raise CrawlError(
                    "ILIAS Shibboleth entitlements changed! "
                    "Please log in once in your browser and review them"
                )

            if self._tfa_required(soup):
                soup = await self._authenticate_tfa(sess, soup, shib_url)

            if not self._login_successful(soup):
                self._auth.invalidate_credentials()

        # Equivalent: Being redirected via JS automatically
        # (or clicking "Continue" if you have JS disabled)
        relay_state = cast(Tag, soup.find("input", {"name": "RelayState"}))
        saml_response = cast(Tag, soup.find("input", {"name": "SAMLResponse"}))
        final_form = cast(Tag, soup.find("form", {"method": "post"}))
        url = cast(str, final_form["action"])
        data = {  # using the info obtained in the while loop above
            "RelayState": cast(str, relay_state["value"]),
            "SAMLResponse": cast(str, saml_response["value"]),
        }
        # Only the cookies set by this request matter; the context manager
        # makes sure the response (and its connection) is released again.
        async with sess.post(url, data=data):
            pass

    async def _authenticate_tfa(
        self, session: aiohttp.ClientSession, soup: BeautifulSoup, shib_url: yarl.URL
    ) -> BeautifulSoup:
        """
        Submit the two-factor token to the form found in ``soup``.

        :param session: session used for the POST request
        :param soup: the page asking for the token
        :param shib_url: URL of the identity provider; its origin is combined
            with the form's action to build the submit URL
        :return: the parsed page returned after submitting the token
        """
        if not self._tfa_auth:
            self._tfa_auth = TfaAuthenticator("ilias-anon-tfa")

        # Searching the form here so that this fails before asking for
        # the token rather than after asking.
        form = cast(Tag, soup.find("form", {"method": "post"}))
        action = cast(str, form["action"])

        tfa_token = await self._tfa_auth.password()

        # Equivalent: Enter token in
        # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
        url = str(shib_url.origin()) + action
        data = {
            "_eventId_proceed": "",
            "fudis_otp_input": tfa_token,
        }
        if csrf_token_input := form.find("input", {"name": "csrf_token"}):
            data["csrf_token"] = csrf_token_input["value"]  # type: ignore
        return await _post(session, url, data)

    @staticmethod
    def _login_successful(soup: BeautifulSoup) -> bool:
        """Return True if the page carries both the RelayState and SAMLResponse inputs."""
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        return relay_state is not None and saml_response is not None

    @staticmethod
    def _tfa_required(soup: BeautifulSoup) -> bool:
        """Return True if the page contains the two-factor form (id "fudiscr-form")."""
        return soup.find(id="fudiscr-form") is not None
|  |  | ||||||
|  |  | ||||||
async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
    """POST ``data`` to ``url`` and return the response body parsed by soupify."""
    async with session.post(url, data=data) as response:
        return soupify(await response.read())
							
								
								
									
										188
									
								
								PFERD/crawl/kit_ipd_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										188
									
								
								PFERD/crawl/kit_ipd_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,188 @@ | |||||||
|  | import os | ||||||
|  | import re | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from datetime import datetime | ||||||
|  | from pathlib import PurePath | ||||||
|  | from typing import Any, Awaitable, Generator, Iterable, List, Optional, Pattern, Tuple, Union, cast | ||||||
|  | from urllib.parse import urljoin | ||||||
|  |  | ||||||
|  | from bs4 import BeautifulSoup, Tag | ||||||
|  |  | ||||||
|  | from ..config import Config | ||||||
|  | from ..logging import ProgressBar, log | ||||||
|  | from ..output_dir import FileSink | ||||||
|  | from ..utils import soupify | ||||||
|  | from .crawler import CrawlError | ||||||
|  | from .http_crawler import HttpCrawler, HttpCrawlerSection | ||||||
|  |  | ||||||
|  |  | ||||||
class KitIpdCrawlerSection(HttpCrawlerSection):
    """Config section for the KIT IPD crawler."""

    def target(self) -> str:
        """Return the configured "target" URL, which must start with https://."""
        url = self.s.get("target")
        if not url:
            self.missing_value("target")

        is_https = url.startswith("https://")
        if not is_https:
            self.invalid_value("target", url, "Should be a URL")

        return url

    def link_regex(self) -> Pattern[str]:
        """Return the compiled "link_regex" option, defaulting to common document/source suffixes."""
        default_pattern = r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$"
        return re.compile(self.s.get("link_regex", default_pattern))
|  |  | ||||||
|  |  | ||||||
@dataclass
class KitIpdFile:
    """A downloadable file found on the crawled page."""

    # File name the download is stored under
    name: str
    # URL the file is downloaded from
    url: str

    def explain(self) -> None:
        """Write this file's name and URL to the explain log."""
        log.explain(f"File {self.name!r} (href={self.url!r})")
|  |  | ||||||
|  |  | ||||||
@dataclass
class KitIpdFolder:
    """A folder grouping files and nested folders."""

    # Folder name, used as a path component below its parent
    name: str
    # Direct children: files and sub-folders
    entries: List[Union[KitIpdFile, "KitIpdFolder"]]

    def explain(self) -> None:
        """Write this folder, followed by all of its entries, to the explain log."""
        log.explain_topic(f"Folder {self.name!r}")
        for entry in self.entries:
            entry.explain()
|  |  | ||||||
|  |  | ||||||
class KitIpdCrawler(HttpCrawler):
    """
    Crawler for a single web page: downloads every linked file whose href
    matches the configured regex, sorted into folders derived from the page's
    heading hierarchy.
    """

    def __init__(
            self,
            name: str,
            section: KitIpdCrawlerSection,
            config: Config,
    ):
        """Initialise the base crawler and read target URL and link regex from the section."""
        super().__init__(name, section, config)
        self._url = section.target()
        self._file_regex = section.link_regex()

    async def _run(self) -> None:
        """
        Crawl the root: fetch all items from the page and schedule folder
        crawls and file downloads, then wait for all of them.
        """
        maybe_cl = await self.crawl(PurePath("."))
        if not maybe_cl:
            return

        tasks: List[Awaitable[None]] = []

        async with maybe_cl:
            for item in await self._fetch_items():
                item.explain()
                if isinstance(item, KitIpdFolder):
                    tasks.append(self._crawl_folder(PurePath("."), item))
                else:
                    log.explain_topic(f"Orphan file {item.name!r} (href={item.url!r})")
                    log.explain("Attributing it to root folder")
                    # do this here to at least be sequential and not parallel (rate limiting is hard, as the
                    # crawl abstraction does not hold for these requests)
                    etag, mtime = await self._request_resource_version(item.url)
                    tasks.append(self._download_file(PurePath("."), item, etag, mtime))

        await self.gather(tasks)

    async def _crawl_folder(self, parent: PurePath, folder: KitIpdFolder) -> None:
        """
        Recursively schedule downloads for everything inside ``folder``.

        :param parent: path of the folder containing ``folder``
        :param folder: the folder to process; its path is ``parent / folder.name``
        """
        path = parent / folder.name
        if not await self.crawl(path):
            return

        tasks = []
        for entry in folder.entries:
            if isinstance(entry, KitIpdFolder):
                tasks.append(self._crawl_folder(path, entry))
            else:
                # do this here to at least be sequential and not parallel (rate limiting is hard, as the crawl
                # abstraction does not hold for these requests)
                etag, mtime = await self._request_resource_version(entry.url)
                tasks.append(self._download_file(path, entry, etag, mtime))

        await self.gather(tasks)

    async def _download_file(
        self,
        parent: PurePath,
        file: KitIpdFile,
        etag: Optional[str],
        mtime: Optional[datetime]
    ) -> None:
        """
        Download ``file`` into ``parent`` if the download is accepted.

        :param parent: folder path the file is stored in
        :param file: the file to download
        :param etag: ETag obtained for the resource, if any
        :param mtime: modification time obtained for the resource, if any
        """
        element_path = parent / file.name

        # etag_differs is None when no previous ETag is known, otherwise
        # whether the stored ETag differs from the current one.
        prev_etag = self._get_previous_etag_from_report(element_path)
        etag_differs = None if prev_etag is None else prev_etag != etag

        maybe_dl = await self.download(element_path, etag_differs=etag_differs, mtime=mtime)
        if not maybe_dl:
            # keep storing the known file's etag
            if prev_etag:
                self._add_etag_to_report(element_path, prev_etag)
            return

        async with maybe_dl as (bar, sink):
            await self._stream_from_url(file.url, element_path, sink, bar)

    async def _fetch_items(self) -> Iterable[Union[KitIpdFile, KitIpdFolder]]:
        """
        Fetch the target page and build a folder tree from its file links.

        Each matching link is placed into the folder path computed from the
        heading hierarchy around it; missing folders are created on the way.

        :return: the entries of the root folder (files and folders)
        """
        page, url = await self.get_page()
        elements: List[Tag] = self._find_file_links(page)

        # do not add unnecessary nesting for a single <h1> heading
        drop_h1: bool = len(page.find_all(name="h1")) <= 1

        folder_tree: KitIpdFolder = KitIpdFolder(".", [])
        for element in elements:
            parent = HttpCrawler.get_folder_structure_from_heading_hierarchy(element, drop_h1)
            file = self._extract_file(element, url)

            # Walk down from the root, creating folders that do not exist yet.
            current_folder: KitIpdFolder = folder_tree
            for folder_name in parent.parts:
                # helps the type checker to verify that current_folder is indeed a folder
                def subfolders() -> Generator[KitIpdFolder, Any, None]:
                    return (entry for entry in current_folder.entries if isinstance(entry, KitIpdFolder))

                if not any(entry.name == folder_name for entry in subfolders()):
                    current_folder.entries.append(KitIpdFolder(folder_name, []))
                current_folder = next(entry for entry in subfolders() if entry.name == folder_name)

            current_folder.entries.append(file)

        return folder_tree.entries

    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
        """Build a KitIpdFile from a link tag; the name is the basename of the absolute URL."""
        url = self._abs_url_from_link(url, link)
        name = os.path.basename(url)
        return KitIpdFile(name, url)

    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> list[Tag]:
        """Return all <a> tags below ``tag`` whose href matches the configured regex."""
        return cast(list[Tag], tag.find_all(name="a", attrs={"href": self._file_regex}))

    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
        """Resolve the link's href against ``url`` (the page URL)."""
        return urljoin(url, cast(str, link_tag.get("href")))

    async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
        """
        Stream the resource at ``url`` into ``sink`` and record its ETag.

        :param url: URL to download; redirects are not followed
        :param path: path the ETag is recorded under in the report
        :param sink: destination the data is written to
        :param bar: progress bar advanced by the number of bytes written
        :raises CrawlError: if the server answers with status 403
        """
        async with self.session.get(url, allow_redirects=False) as resp:
            if resp.status == 403:
                raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
            if resp.content_length:
                bar.set_total(resp.content_length)

            async for data in resp.content.iter_chunked(1024):
                sink.file.write(data)
                bar.advance(len(data))

            sink.done()

            self._add_etag_to_report(path, resp.headers.get("ETag"))

    async def get_page(self) -> Tuple[BeautifulSoup, str]:
        """
        Fetch and parse the target page.

        :return: the parsed page and the final URL of the request as a string
        """
        async with self.session.get(self._url) as request:
            # The web page for Algorithmen für Routenplanung contains some
            # weird comments that beautifulsoup doesn't parse correctly. This
            # hack enables those pages to be crawled, and should hopefully not
            # cause issues on other pages.
            content = (await request.read()).decode("utf-8")
            # NOTE(review): "." does not match newlines here, so only comments
            # contained in a single line are removed - confirm this is enough.
            content = re.sub(r"<!--.*?-->", "", content)
            return soupify(content.encode("utf-8")), str(request.url)
							
								
								
									
										117
									
								
								PFERD/crawl/local_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										117
									
								
								PFERD/crawl/local_crawler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,117 @@ | |||||||
|  | import asyncio | ||||||
|  | import datetime | ||||||
|  | import random | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import Optional | ||||||
|  |  | ||||||
|  | from ..config import Config | ||||||
|  | from .crawler import Crawler, CrawlerSection, anoncritical | ||||||
|  |  | ||||||
|  |  | ||||||
class LocalCrawlerSection(CrawlerSection):
    """Accessors for the config values used by the local crawler."""

    def target(self) -> Path:
        """Return the ``target`` option as a path with ``~`` expanded.

        A missing option is reported through ``self.missing_value``.
        """
        raw = self.s.get("target")
        if raw is None:
            self.missing_value("target")
        return Path(raw).expanduser()

    def crawl_delay(self) -> float:
        """Return the ``crawl_delay`` option (default ``0.0``); negatives are rejected."""
        delay = self.s.getfloat("crawl_delay", fallback=0.0)
        if delay < 0:
            self.invalid_value("crawl_delay", delay, "Must not be negative")
        return delay

    def download_delay(self) -> float:
        """Return the ``download_delay`` option (default ``0.0``); negatives are rejected."""
        delay = self.s.getfloat("download_delay", fallback=0.0)
        if delay < 0:
            self.invalid_value("download_delay", delay, "Must not be negative")
        return delay

    def download_speed(self) -> Optional[int]:
        """Return the ``download_speed`` option, or ``None`` if it is not set.

        Values of zero or below are reported through ``self.invalid_value``.
        """
        speed = self.s.getint("download_speed")
        if speed is None or speed > 0:
            return speed
        self.invalid_value("download_speed", speed, "Must be greater than 0")
        return speed
|  |  | ||||||
|  |  | ||||||
class LocalCrawler(Crawler):
    """Crawler that walks a local directory tree and copies its files.

    Optional random delays and a throttled copy speed can be configured
    through the crawler's config section.
    """

    def __init__(
            self,
            name: str,
            section: LocalCrawlerSection,
            config: Config,
    ):
        """Read target path, delays and speed limit from ``section``.

        The target is resolved relative to the working directory of the
        default config section.
        """
        super().__init__(name, section, config)

        self._target = config.default_section.working_dir() / section.target()
        self._crawl_delay = section.crawl_delay()
        self._download_delay = section.download_delay()
        self._download_speed = section.download_speed()

        if self._download_speed:
            # Roughly ten blocks per second (each block is followed by a
            # sleep of about block_size / download_speed seconds). The block
            # size must never become 0: read(0) returns b"", which the copy
            # loop treats as end-of-file, so speeds below 10 would otherwise
            # produce empty files.
            self._block_size = max(1, self._download_speed // 10)
        else:
            self._block_size = 1024**2  # 1 MiB

    async def _run(self) -> None:
        """Start crawling at the configured target with an empty relative path."""
        await self._crawl_path(self._target, PurePath())

    @anoncritical
    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
        """Dispatch ``path`` to the directory or file handler.

        Paths that are neither a directory nor a regular file are ignored.
        """
        if path.is_dir():
            await self._crawl_dir(path, pure)
        elif path.is_file():
            await self._crawl_file(path, pure)

    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
        """Schedule a crawl for every entry of the directory ``path``."""
        cl = await self.crawl(pure)
        if not cl:
            return

        tasks = []

        async with cl:
            # Random delay between half of and the full configured crawl delay.
            await asyncio.sleep(random.uniform(
                0.5 * self._crawl_delay,
                self._crawl_delay,
            ))

            for child in path.iterdir():
                pure_child = cl.path / child.name
                tasks.append(self._crawl_path(child, pure_child))

        # The child crawls are awaited only after the ``cl`` context is left.
        await self.gather(tasks)

    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        """Copy the file ``path`` into the sink obtained for ``pure``.

        The file's modification time is passed to ``self.download``; nothing
        is copied when that call returns a falsy value.
        """
        stat = path.stat()
        mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
        dl = await self.download(pure, mtime=mtime)
        if not dl:
            return

        async with dl as (bar, sink):
            # Random delay between half of and the full configured download delay.
            await asyncio.sleep(random.uniform(
                0.5 * self._download_delay,
                self._download_delay,
            ))

            bar.set_total(stat.st_size)

            with open(path, "rb") as f:
                while True:
                    data = f.read(self._block_size)
                    if not data:
                        break

                    sink.file.write(data)
                    bar.advance(len(data))

                    if self._download_speed:
                        # Sleep for the time one block should take at the
                        # configured speed, with +-20% jitter.
                        delay = self._block_size / self._download_speed
                        delay = random.uniform(0.8 * delay, 1.2 * delay)
                        await asyncio.sleep(delay)

                sink.done()
							
								
								
									
										85
									
								
								PFERD/deduplicator.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										85
									
								
								PFERD/deduplicator.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,85 @@ | |||||||
|  | from pathlib import PurePath | ||||||
|  | from typing import Iterator, Set | ||||||
|  |  | ||||||
|  | from .logging import log | ||||||
|  | from .utils import fmt_path | ||||||
|  |  | ||||||
|  |  | ||||||
def name_variants(path: PurePath) -> Iterator[PurePath]:
    """Yield an endless series of numbered alternatives for ``path``.

    The number (starting at 1) is inserted between stem and suffix. It is
    separated by a space if the stem already contains a space, and by an
    underscore otherwise.
    """
    stem, suffix, parent = path.stem, path.suffix, path.parent

    separator = "_"
    if " " in stem:
        separator = " "

    counter = 0
    while True:
        counter += 1
        yield parent / f"{stem}{separator}{counter}{suffix}"
|  |  | ||||||
|  |  | ||||||
class Deduplicator:
    """Hands out unique paths and optionally makes them Windows-compatible.

    Every path returned by ``mark`` (and each of its parent directories) is
    remembered; a later request for an already known path gets a numbered
    variant instead.
    """

    # Characters that are replaced by "_" in path elements.
    FORBIDDEN_CHARS = '<>:"/\\|?*' + "".join([chr(i) for i in range(0, 32)])
    # Reserved device names, stored in upper case and compared
    # case-insensitively.
    FORBIDDEN_NAMES = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
    }

    def __init__(self, windows_paths: bool) -> None:
        """Create a deduplicator; ``windows_paths`` enables the Windows fixups."""
        self._windows_paths = windows_paths

        self._known: Set[PurePath] = set()

    def _add(self, path: PurePath) -> None:
        """Remember ``path`` and all of its parent directories as taken."""
        self._known.add(path)

        # The last parent is just "."
        for parent in list(path.parents)[:-1]:
            self._known.add(parent)

    def _fixup_element(self, name: str) -> str:
        """Return ``name`` adjusted so it is a valid Windows path element."""
        # For historical reasons, windows paths have some odd restrictions that
        # we're trying to avoid. See:
        # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file

        for char in self.FORBIDDEN_CHARS:
            name = name.replace(char, "_")

        path = PurePath(name)
        # Reserved names are forbidden regardless of case ("con.txt" is just
        # as problematic as "CON.txt"), so compare the upper-cased stem while
        # keeping the original spelling in the result.
        if path.stem.upper() in self.FORBIDDEN_NAMES:
            name = f"{path.stem}_{path.suffix}"

        if name.endswith((" ", ".")):
            name += "_"

        return name

    def _fixup_for_windows(self, path: PurePath) -> PurePath:
        """Apply ``_fixup_element`` to every part of ``path`` and log any change."""
        new_path = PurePath(*[self._fixup_element(elem) for elem in path.parts])
        if new_path != path:
            log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility")
        return new_path

    def fixup_path(self, path: PurePath) -> PurePath:
        """Fixes up the path for windows, if enabled. Returns the path unchanged otherwise."""
        if self._windows_paths:
            return self._fixup_for_windows(path)
        return path

    def mark(self, path: PurePath) -> PurePath:
        """Reserve ``path`` and return the path that was actually reserved.

        If ``path`` (after the optional Windows fixup) is already known, the
        first unused candidate from ``name_variants`` is reserved and returned
        instead.
        """
        if self._windows_paths:
            path = self._fixup_for_windows(path)

        if path not in self._known:
            self._add(path)
            return path

        log.explain(f"Path {fmt_path(path)} is already taken, finding a new name")

        for variant in name_variants(path):
            if variant in self._known:
                log.explain(f"Path {fmt_path(variant)} is taken as well")
                continue

            log.explain(f"Found unused path {fmt_path(variant)}")
            self._add(variant)
            return variant

        # The "name_variants" iterator returns infinitely many paths
        raise RuntimeError("Unreachable")
							
								
								
									
										60
									
								
								PFERD/ffm.py
									
									
									
									
									
								
							
							
						
						
									
										60
									
								
								PFERD/ffm.py
									
									
									
									
									
								
							| @@ -1,60 +0,0 @@ | |||||||
| # Fakultät für Mathematik (FfM) |  | ||||||
|  |  | ||||||
| import logging |  | ||||||
| import pathlib |  | ||||||
| import re |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from .organizer import Organizer |  | ||||||
| from .utils import stream_to_path |  | ||||||
|  |  | ||||||
| __all__ = ["FfM"] |  | ||||||
| logger = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
class FfM:
    """Synchronizer for PDF files linked on pages of www.math.kit.edu."""

    BASE_URL = "http://www.math.kit.edu/"
    # The dots of the host name are escaped so that only the literal host
    # matches. Group 1 is the file name after the last slash.
    LINK_RE = re.compile(r"^https?://www\.math\.kit\.edu/.*/(.*\.pdf)$")

    def __init__(self, base_path):
        """Remember ``base_path`` and open a fresh ``requests`` session."""
        self.base_path = base_path

        self._session = requests.Session()

    def synchronize(self, urlpart, to_dir, transform=lambda x: x):
        """Download all PDFs linked on ``BASE_URL + urlpart`` into ``to_dir``.

        ``transform`` maps the found file name (a ``PurePath``) to its target
        path; returning ``None`` skips the file.
        """
        logger.info(f"    Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.")

        sync_path = pathlib.Path(self.base_path, to_dir)

        orga = Organizer(self.base_path, sync_path)
        orga.clean_temp_dir()

        self._crawl(orga, urlpart, transform)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    def _crawl(self, orga, urlpart, transform):
        """Fetch the page and hand every matching PDF link to ``orga``."""
        url = self.BASE_URL + urlpart
        r = self._session.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        for found in soup.find_all("a", href=self.LINK_RE):
            url = found["href"]
            filename = self.LINK_RE.match(url).group(1).replace("/", ".")
            logger.debug(f"Found file {filename} at {url}")

            old_path = pathlib.PurePath(filename)
            new_path = transform(old_path)
            if new_path is None:
                continue
            logger.debug(f"Transformed from {old_path} to {new_path}")

            temp_path = orga.temp_file()
            self._download(url, temp_path)
            orga.add_file(temp_path, new_path)

    def _download(self, url, to_path):
        """Stream the response for ``url`` to ``to_path``."""
        with self._session.get(url, stream=True) as r:
            stream_to_path(r, to_path)
							
								
								
									
										109
									
								
								PFERD/ilias.py
									
									
									
									
									
								
							
							
						
						
									
										109
									
								
								PFERD/ilias.py
									
									
									
									
									
								
							| @@ -1,109 +0,0 @@ | |||||||
| # ILIAS |  | ||||||
|  |  | ||||||
| import logging |  | ||||||
| import pathlib |  | ||||||
| import re |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
|  |  | ||||||
| from .ilias_authenticators import ShibbolethAuthenticator |  | ||||||
| from .organizer import Organizer |  | ||||||
|  |  | ||||||
| __all__ = ["Ilias"] |  | ||||||
| logger = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
class Ilias:
    """Synchronizer that mirrors the files of an ILIAS folder tree."""

    # Group 1 is the download target id of a file link.
    FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)")
    # Group 1 is the numeric ref_id of a folder link.
    DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)")

    def __init__(self, base_path, cookie_file):
        """Store ``base_path`` and set up the authenticator with its cookie file."""
        self.base_path = base_path

        self._auth = ShibbolethAuthenticator(base_path / cookie_file)

    def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
        """Crawl the folder ``ref_id`` and download its files into ``to_dir``.

        ``filter`` decides for each found directory path whether it is
        searched. ``transform`` maps each found file path to its target path;
        returning ``None`` skips the file.
        """
        logger.info(f"    Synchronizing ref_id {ref_id} to {to_dir} using the Ilias synchronizer.")

        orga = Organizer(self.base_path, pathlib.Path(self.base_path, to_dir))
        orga.clean_temp_dir()

        root = pathlib.PurePath()
        found = self._crawl(root, f"fold_{ref_id}", filter)
        self._download(orga, found, transform)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    def _crawl(self, dir_path, dir_id, filter_):
        """Return ``(path, file_id)`` pairs for all files below ``dir_id``."""
        soup = self._auth.get_webpage(dir_id)

        collected = []

        for name, file_id in self._find_files(soup):
            file_path = dir_path / name
            collected.append((file_path, file_id))
            logger.debug(f"Found file {file_path}")

        for name, sub_id in self._find_dirs(soup):
            sub_path = dir_path / name
            logger.debug(f"Found dir {sub_path}")
            if not filter_(sub_path):
                logger.info(f"Not searching {sub_path}")
                continue

            logger.info(f"Searching {sub_path}")
            collected.extend(self._crawl(sub_path, sub_id, filter_))

        return collected

    def _download(self, orga, files, transform):
        """Download the given files in sorted order, skipping untransformable ones."""
        for path, file_id in sorted(files):
            to_path = transform(path)
            if to_path is None:
                continue

            temp_path = orga.temp_file()
            self._auth.download_file(file_id, temp_path)
            orga.add_file(temp_path, to_path)

    def _find_files(self, soup):
        """Return ``(file_name, file_id)`` pairs for the file links in ``soup``.

        Names that occur more than once on the page get a
        `` (duplicate N)`` marker before the extension.
        """
        results = []
        seen = set()

        attrs = {"class": "il_ContainerItemTitle", "href": self.FILE_RE}
        for link in soup.find_all("a", attrs):
            stem = link.string.strip().replace("/", ".")
            properties = link.parent.parent.parent.find("div", {"class": "il_ItemProperties"})
            ext = properties.find("span").string.strip()
            file_id = re.search(self.FILE_RE, link.get("href")).group(1)

            candidate = f"{stem}.{ext}"
            counter = 0
            while candidate in seen:
                counter += 1
                candidate = f"{stem} (duplicate {counter}).{ext}"

            results.append((candidate, file_id))
            seen.add(candidate)

        return results

    def _find_dirs(self, soup):
        """Return ``(dir_name, dir_id)`` pairs for the folder links in ``soup``.

        A page containing an alert box yields no directories at all.
        """
        if soup.find_all("div", {"class": "alert", "role": "alert"}):
            return []

        attrs = {"class": "il_ContainerItemTitle", "href": self.DIR_RE}
        return [
            (
                link.string.strip().replace("/", "."),
                f"fold_{re.search(self.DIR_RE, link.get('href')).group(1)}",
            )
            for link in soup.find_all("a", attrs)
        ]
| @@ -1,176 +0,0 @@ | |||||||
| # This file is called IliasAuthenticators because there are multiple mechanisms |  | ||||||
| # for authenticating with Ilias (even though only the Shibboleth is currently |  | ||||||
| # implemented). Most of what the ShibbolethAuthenticator currently does is |  | ||||||
| # not Shibboleth specific; this mess would have to be cleaned up before |  | ||||||
| # actually implementing any other authentication method. |  | ||||||
| # |  | ||||||
| # I think the only other method is the password prompt when clicking the log in |  | ||||||
| # button. |  | ||||||
|  |  | ||||||
| import getpass |  | ||||||
| import http.cookiejar |  | ||||||
| import logging |  | ||||||
| import time |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from .utils import ContentTypeException, stream_to_path |  | ||||||
|  |  | ||||||
| __all__ = ["ShibbolethAuthenticator"] |  | ||||||
| logger = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
class ShibbolethAuthenticator:
    """Session wrapper that logs in to ILIAS via Shibboleth when necessary."""

    ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php"

    # Media types that are accepted as file downloads.
    ALLOWED_CONTENT_TYPES = [
        "application/pdf",
        "application/zip",
        "application/msword",
        "text/xml",
        "text/plain",
        "image/jpeg",
        "image/png",
    ]

    def __init__(self, cookie_file) -> None:
        """Create a session that loads and stores its cookies in ``cookie_file``."""
        # Because LWPCookieJar insists on the path being str-like instead of
        # Path-like.
        cookie_file = str(cookie_file)

        cookies = http.cookiejar.LWPCookieJar(cookie_file)
        try:
            logger.info(f"Loading old cookies from {cookie_file!r}")
            cookies.load(ignore_discard=True)
        except (FileNotFoundError, http.cookiejar.LoadError):
            # logger.warn is a deprecated alias of logger.warning.
            logger.warning(f"No (valid) cookie file found at {cookie_file!r}, ignoring...")

        self._session = requests.Session()
        self._session.cookies = cookies

    @staticmethod
    def _media_type(content_type):
        """Return the bare, lower-cased media type of a Content-Type value.

        Parameters such as ``; charset=UTF-8`` are dropped so that the value
        can be compared against plain media types.
        """
        return content_type.split(";", 1)[0].strip().lower()

    def _authenticate(self):
        """
        Performs the ILIAS Shibboleth authentication dance and saves the login
        cookies it receives.

        This function should only be called whenever it is detected that you're
        not logged in. The cookies obtained should be good for a few minutes,
        maybe even an hour or two.
        """

        # Equivalent: Click on "Mit KIT-Account anmelden" button in
        # https://ilias.studium.kit.edu/login.php
        logger.debug("Begin authentication process with ILIAS")
        url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
        data = {
            "sendLogin": "1",
            "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
            "target": "/shib_login.php",
            "home_organization_selection": "Mit KIT-Account anmelden",
        }
        r = self._session.post(url, data=data)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        # Attempt to login using credentials, if necessary
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
            form = soup.find("form", {"class": "form2", "method": "post"})
            action = form["action"]

            print("Please enter Shibboleth credentials.")
            username = getpass.getpass(prompt="Username: ")
            password = getpass.getpass(prompt="Password: ")

            # Equivalent: Enter credentials in
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
            logger.debug("Attempt to log in to Shibboleth using credentials")
            url = "https://idp.scc.kit.edu" + action
            data = {
                "_eventId_proceed": "",
                "j_username": username,
                "j_password": password,
            }
            r = self._session.post(url, data=data)
            soup = bs4.BeautifulSoup(r.text, "html.parser")

            if not self._login_successful(soup):
                print("Incorrect credentials.")

        # Saving progress
        logger.info("Saving cookies (successfully authenticated with Shibboleth)")
        self._session.cookies.save(ignore_discard=True)

        # Equivalent: Being redirected via JS automatically
        # (or clicking "Continue" if you have JS disabled)
        logger.debug("Redirect back to ILIAS with login information")
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
        data = {  # using the info obtained in the while loop above
            "RelayState": relay_state["value"],
            "SAMLResponse": saml_response["value"],
        }
        self._session.post(url, data=data)

        # Saving progress
        logger.info("Saving cookies (successfully authenticated with ILIAS)")
        self._session.cookies.save(ignore_discard=True)

    def _login_successful(self, soup):
        """Return whether ``soup`` contains both the RelayState and SAMLResponse inputs."""
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        return relay_state is not None and saml_response is not None

    def _is_logged_in(self, soup):
        """Return whether ``soup`` contains the ``userlog`` list item."""
        userlog = soup.find("li", {"id": "userlog"})
        return userlog is not None

    def get_webpage(self, object_id):
        """Return the parsed ILIAS page for ``object_id``, authenticating as needed."""
        params = {"target": object_id}

        while True:
            logger.debug(f"Getting {self.ILIAS_GOTO} {params}")
            r = self._session.get(self.ILIAS_GOTO, params=params)
            soup = bs4.BeautifulSoup(r.text, "html.parser")

            if self._is_logged_in(soup):
                return soup
            else:
                logger.info("Not logged in, authenticating...")
                self._authenticate()

    def get_webpage_by_refid(self, ref_id):
        """Return the parsed page of the folder with the given ``ref_id``."""
        return self.get_webpage(f"fold_{ref_id}")

    def _download(self, url, params, to_path):
        """Try to download ``url`` to ``to_path``.

        Returns ``True`` if a file was written and ``False`` if an HTML page
        was received while not logged in. Raises ``ContentTypeException`` for
        an HTML page received while logged in and for any other media type.
        """
        with self._session.get(url, params=params, stream=True) as r:
            content_type = r.headers["content-type"]
            # Compare without parameters so that e.g.
            # "text/html; charset=UTF-8" is still recognised as text/html.
            media_type = self._media_type(content_type)

            if media_type in self.ALLOWED_CONTENT_TYPES:
                # Yay, we got the file :)
                stream_to_path(r, to_path)
                return True
            elif media_type == "text/html":
                # Dangit, we're probably not logged in.
                soup = bs4.BeautifulSoup(r.text, "html.parser")
                if self._is_logged_in(soup):
                    raise ContentTypeException(
                            "Attempting to download a web page, not a file")
                return False
            else:
                # What *did* we get?
                raise ContentTypeException(
                        f"Unknown file of type {content_type}")

    def download_file(self, file_id, to_path):
        """Download the file ``file_id`` to ``to_path``, authenticating as needed."""
        params = {"target": file_id}

        while True:
            success = self._download(self.ILIAS_GOTO, params, to_path)

            if success:
                return
            else:
                logger.info("Not logged in, authenticating...")
                self._authenticate()
							
								
								
									
										97
									
								
								PFERD/limiter.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								PFERD/limiter.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,97 @@ | |||||||
|  | import asyncio | ||||||
|  | import time | ||||||
|  | from contextlib import asynccontextmanager | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from typing import AsyncIterator, Optional | ||||||
|  |  | ||||||
|  |  | ||||||
@dataclass
class Slot:
    """Bookkeeping for one task slot of a ``Limiter``."""

    # True while a task occupies the slot.
    active: bool = False
    # time.monotonic() value of the most recent release, None if the slot has
    # never been released.
    last_left: Optional[float] = None


class Limiter:
    """Limits the number of concurrent tasks and concurrent downloads.

    Every task (crawl or download) occupies one of ``task_limit`` slots; at
    most ``download_limit`` of them may be downloads at the same time. A slot
    that has just been released is reused only after ``task_delay`` seconds.
    """

    def __init__(
            self,
            task_limit: int,
            download_limit: int,
            task_delay: float
    ):
        """Validate the limits and create the slots.

        Raises:
            ValueError: if a limit is below 1, the download limit exceeds the
                task limit, or the delay is negative.
        """
        if task_limit <= 0:
            raise ValueError("task limit must be at least 1")
        if download_limit <= 0:
            raise ValueError("download limit must be at least 1")
        if download_limit > task_limit:
            raise ValueError("download limit can't be greater than task limit")
        if task_delay < 0:
            raise ValueError("Task delay must not be negative")

        self._slots = [Slot() for _ in range(task_limit)]
        # Number of downloads that may still be started.
        self._downloads = download_limit
        self._delay = task_delay

        self._condition = asyncio.Condition()

    def _acquire_slot(self) -> Optional[Slot]:
        """Mark the first inactive slot as active and return it, or ``None``."""
        for slot in self._slots:
            if not slot.active:
                slot.active = True
                return slot

        return None

    async def _wait_for_slot_delay(self, slot: Slot) -> None:
        """Sleep until ``task_delay`` seconds have passed since ``slot`` was released."""
        if slot.last_left is not None:
            # The monotonic clock is used so that adjustments of the system
            # clock can neither stretch nor skip the delay.
            delay = slot.last_left + self._delay - time.monotonic()
            if delay > 0:
                await asyncio.sleep(delay)

    def _release_slot(self, slot: Slot) -> None:
        """Record the release time of ``slot`` and mark it inactive."""
        slot.last_left = time.monotonic()
        slot.active = False

    @asynccontextmanager
    async def limit_crawl(self) -> AsyncIterator[None]:
        """Occupy a task slot for the duration of the ``async with`` body."""
        slot: Slot
        async with self._condition:
            while True:
                if found_slot := self._acquire_slot():
                    slot = found_slot
                    break
                await self._condition.wait()

        # The delay is awaited outside the condition so that other tasks are
        # not blocked while this one sleeps.
        await self._wait_for_slot_delay(slot)

        try:
            yield
        finally:
            async with self._condition:
                self._release_slot(slot)
                self._condition.notify_all()

    @asynccontextmanager
    async def limit_download(self) -> AsyncIterator[None]:
        """Occupy a task slot and a download permit for the ``async with`` body."""
        slot: Slot
        async with self._condition:
            while True:
                if self._downloads <= 0:
                    await self._condition.wait()
                    continue

                if found_slot := self._acquire_slot():
                    slot = found_slot
                    self._downloads -= 1
                    break

                await self._condition.wait()

        await self._wait_for_slot_delay(slot)

        try:
            yield
        finally:
            async with self._condition:
                self._release_slot(slot)
                self._downloads += 1
                self._condition.notify_all()
							
								
								
									
										290
									
								
								PFERD/logging.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										290
									
								
								PFERD/logging.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,290 @@ | |||||||
|  | import asyncio | ||||||
|  | import sys | ||||||
|  | import traceback | ||||||
|  | from contextlib import AbstractContextManager, asynccontextmanager, contextmanager | ||||||
|  | from typing import AsyncIterator, Iterator, List, Optional | ||||||
|  |  | ||||||
|  | from rich.console import Console, Group | ||||||
|  | from rich.live import Live | ||||||
|  | from rich.markup import escape | ||||||
|  | from rich.panel import Panel | ||||||
|  | from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID, TextColumn, TimeRemainingColumn, | ||||||
|  |                            TransferSpeedColumn) | ||||||
|  | from rich.table import Column | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ProgressBar: | ||||||
|  |     def __init__(self, progress: Progress, taskid: TaskID): | ||||||
|  |         self._progress = progress | ||||||
|  |         self._taskid = taskid | ||||||
|  |  | ||||||
|  |     def advance(self, amount: float = 1) -> None: | ||||||
|  |         self._progress.advance(self._taskid, advance=amount) | ||||||
|  |  | ||||||
|  |     def set_total(self, total: float) -> None: | ||||||
|  |         self._progress.update(self._taskid, total=total) | ||||||
|  |         self._progress.start_task(self._taskid) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class Log: | ||||||
|  |     STATUS_WIDTH = 11 | ||||||
|  |  | ||||||
|  |     def __init__(self) -> None: | ||||||
|  |         self.console = Console(highlight=False) | ||||||
|  |  | ||||||
|  |         self._crawl_progress = Progress( | ||||||
|  |             TextColumn("{task.description}", table_column=Column(ratio=1)), | ||||||
|  |             BarColumn(), | ||||||
|  |             TimeRemainingColumn(), | ||||||
|  |             expand=True, | ||||||
|  |         ) | ||||||
|  |         self._download_progress = Progress( | ||||||
|  |             TextColumn("{task.description}", table_column=Column(ratio=1)), | ||||||
|  |             TransferSpeedColumn(), | ||||||
|  |             DownloadColumn(), | ||||||
|  |             BarColumn(), | ||||||
|  |             TimeRemainingColumn(), | ||||||
|  |             expand=True, | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         self._live = Live(console=self.console, transient=True) | ||||||
|  |         self._update_live() | ||||||
|  |  | ||||||
|  |         self._showing_progress = False | ||||||
|  |         self._progress_suspended = False | ||||||
|  |         self._lock = asyncio.Lock() | ||||||
|  |         self._lines: List[str] = [] | ||||||
|  |  | ||||||
|  |         # Whether different parts of the output are enabled or disabled | ||||||
|  |         self.output_explain = False | ||||||
|  |         self.output_status = True | ||||||
|  |         self.output_not_deleted = True | ||||||
|  |         self.output_report = True | ||||||
|  |  | ||||||
|  |     def _update_live(self) -> None: | ||||||
|  |         elements = [] | ||||||
|  |         if self._crawl_progress.task_ids: | ||||||
|  |             elements.append(self._crawl_progress) | ||||||
|  |         if self._download_progress.task_ids: | ||||||
|  |             elements.append(self._download_progress) | ||||||
|  |  | ||||||
|  |         group = Group(*elements) | ||||||
|  |         self._live.update(group) | ||||||
|  |  | ||||||
|  |     @contextmanager | ||||||
|  |     def show_progress(self) -> Iterator[None]: | ||||||
|  |         if self._showing_progress: | ||||||
|  |             raise RuntimeError("Calling 'show_progress' while already showing progress") | ||||||
|  |  | ||||||
|  |         self._showing_progress = True | ||||||
|  |         try: | ||||||
|  |             with self._live: | ||||||
|  |                 yield | ||||||
|  |         finally: | ||||||
|  |             self._showing_progress = False | ||||||
|  |  | ||||||
|  |     @asynccontextmanager | ||||||
|  |     async def exclusive_output(self) -> AsyncIterator[None]: | ||||||
|  |         if not self._showing_progress: | ||||||
|  |             raise RuntimeError("Calling 'exclusive_output' while not showing progress") | ||||||
|  |  | ||||||
|  |         async with self._lock: | ||||||
|  |             self._progress_suspended = True | ||||||
|  |             self._live.stop() | ||||||
|  |             try: | ||||||
|  |                 yield | ||||||
|  |             finally: | ||||||
|  |                 self._live.start() | ||||||
|  |                 self._progress_suspended = False | ||||||
|  |                 for line in self._lines: | ||||||
|  |                     self.print(line) | ||||||
|  |                 self._lines = [] | ||||||
|  |  | ||||||
|  |     def unlock(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Get rid of an exclusive output state. | ||||||
|  |  | ||||||
|  |         This function is meant to let PFERD print log messages after the event | ||||||
|  |         loop was forcibly stopped and if it will not be started up again. After | ||||||
|  |         this is called, it is not safe to use any functions except the logging | ||||||
|  |         functions (print, warn, ...). | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self._progress_suspended = False | ||||||
|  |         for line in self._lines: | ||||||
|  |             self.print(line) | ||||||
|  |  | ||||||
|  |     def print(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print a normal message. Allows markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self._progress_suspended: | ||||||
|  |             self._lines.append(text) | ||||||
|  |         else: | ||||||
|  |             self.console.print(text) | ||||||
|  |  | ||||||
|  |     # TODO Print errors (and warnings?) to stderr | ||||||
|  |  | ||||||
|  |     def warn(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print a warning message. Allows no markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.print(f"[bold bright_red]Warning[/] {escape(text)}") | ||||||
|  |  | ||||||
|  |     def warn_contd(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print further lines of a warning message. Allows no markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.print(f"{escape(text)}") | ||||||
|  |  | ||||||
|  |     def error(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print an error message. Allows no markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.print(f"[bold bright_red]Error[/] [red]{escape(text)}") | ||||||
|  |  | ||||||
|  |     def error_contd(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print further lines of an error message. Allows no markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self.print(f"[red]{escape(text)}") | ||||||
|  |  | ||||||
|  |     def unexpected_exception(self) -> None: | ||||||
|  |         """ | ||||||
|  |         Call this in an "except" clause to log an unexpected exception. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         t, v, tb = sys.exc_info() | ||||||
|  |         if t is None or v is None or tb is None: | ||||||
|  |             # We're not currently handling an exception, so somebody probably | ||||||
|  |             # called this function where they shouldn't. | ||||||
|  |             self.error("Something unexpected happened") | ||||||
|  |             self.error_contd("") | ||||||
|  |             for line in traceback.format_stack(): | ||||||
|  |                 self.error_contd(line[:-1])  # Without the newline | ||||||
|  |             self.error_contd("") | ||||||
|  |         else: | ||||||
|  |             self.error("An unexpected exception occurred") | ||||||
|  |             self.error_contd("") | ||||||
|  |             self.error_contd(traceback.format_exc()) | ||||||
|  |  | ||||||
|  |         # Our print function doesn't take types other than strings, but the | ||||||
|  |         # underlying rich.print function does. This call is a special case | ||||||
|  |         # anyways, and we're calling it internally, so this should be fine. | ||||||
|  |         self.print(Panel.fit(""" | ||||||
|  | Please copy your program output and send it to the PFERD maintainers, either | ||||||
|  | directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new | ||||||
|  |         """.strip()))  # type: ignore | ||||||
|  |  | ||||||
|  |     def explain_topic(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print a top-level explain text. Allows no markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_explain: | ||||||
|  |             self.print(f"[yellow]{escape(text)}") | ||||||
|  |  | ||||||
|  |     def explain(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print an indented explain text. Allows no markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_explain: | ||||||
|  |             self.print(f"  {escape(text)}") | ||||||
|  |  | ||||||
|  |     def status(self, style: str, action: str, text: str, suffix: str = "") -> None: | ||||||
|  |         """ | ||||||
|  |         Print a status update while crawling. Allows markup in the "style" | ||||||
|  |         argument which will be applied to the "action" string. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_status: | ||||||
|  |             action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||||
|  |             self.print(f"{style}{action}[/] {escape(text)} {suffix}") | ||||||
|  |  | ||||||
|  |     def not_deleted(self, style: str, action: str, text: str, suffix: str = "") -> None: | ||||||
|  |         """ | ||||||
|  |         Print a message for a local only file that wasn't | ||||||
|  |         deleted while crawling. Allows markup in the "style" | ||||||
|  |         argument which will be applied to the "action" string. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_status and self.output_not_deleted: | ||||||
|  |             action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||||
|  |             self.print(f"{style}{action}[/] {escape(text)} {suffix}") | ||||||
|  |  | ||||||
|  |     def report(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print a report after crawling. Allows markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_report: | ||||||
|  |             self.print(text) | ||||||
|  |  | ||||||
|  |     def report_not_deleted(self, text: str) -> None: | ||||||
|  |         """ | ||||||
|  |         Print a report for a local only file that wasn't deleted after crawling. Allows markup. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if self.output_report and self.output_not_deleted: | ||||||
|  |             self.print(text) | ||||||
|  |  | ||||||
|  |     @contextmanager | ||||||
|  |     def _bar( | ||||||
|  |             self, | ||||||
|  |             progress: Progress, | ||||||
|  |             description: str, | ||||||
|  |             total: Optional[float], | ||||||
|  |     ) -> Iterator[ProgressBar]: | ||||||
|  |         if total is None: | ||||||
|  |             # Indeterminate progress bar | ||||||
|  |             taskid = progress.add_task(description, start=False) | ||||||
|  |         else: | ||||||
|  |             taskid = progress.add_task(description, total=total) | ||||||
|  |         self._update_live() | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             yield ProgressBar(progress, taskid) | ||||||
|  |         finally: | ||||||
|  |             progress.remove_task(taskid) | ||||||
|  |             self._update_live() | ||||||
|  |  | ||||||
|  |     def crawl_bar( | ||||||
|  |             self, | ||||||
|  |             style: str, | ||||||
|  |             action: str, | ||||||
|  |             text: str, | ||||||
|  |             total: Optional[float] = None, | ||||||
|  |     ) -> AbstractContextManager[ProgressBar]: | ||||||
|  |         """ | ||||||
|  |         Allows markup in the "style" argument which will be applied to the | ||||||
|  |         "action" string. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||||
|  |         description = f"{style}{action}[/] {text}" | ||||||
|  |         return self._bar(self._crawl_progress, description, total) | ||||||
|  |  | ||||||
|  |     def download_bar( | ||||||
|  |             self, | ||||||
|  |             style: str, | ||||||
|  |             action: str, | ||||||
|  |             text: str, | ||||||
|  |             total: Optional[float] = None, | ||||||
|  |     ) -> AbstractContextManager[ProgressBar]: | ||||||
|  |         """ | ||||||
|  |         Allows markup in the "style" argument which will be applied to the | ||||||
|  |         "action" string. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||||
|  |         description = f"{style}{action}[/] {text}" | ||||||
|  |         return self._bar(self._download_progress, description, total) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # Module-level Log instance that other modules import | ||||||
|  | log = Log() | ||||||
							
								
								
									
										107
									
								
								PFERD/norbert.py
									
									
									
									
									
								
							
							
						
						
									
										107
									
								
								PFERD/norbert.py
									
									
									
									
									
								
							| @@ -1,107 +0,0 @@ | |||||||
| # Norberts Prog-Tuts |  | ||||||
|  |  | ||||||
| import logging |  | ||||||
| import pathlib |  | ||||||
| import re |  | ||||||
| import zipfile |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from .organizer import Organizer |  | ||||||
| from .utils import rename, stream_to_path |  | ||||||
|  |  | ||||||
| __all__ = ["Norbert"] |  | ||||||
| logger = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
| class Norbert: |  | ||||||
|     BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/" |  | ||||||
|     LINK_RE = re.compile(r"^progtut/.*/(.*\.zip)$") |  | ||||||
|  |  | ||||||
|     def __init__(self, base_path): |  | ||||||
|         self.base_path = base_path |  | ||||||
|  |  | ||||||
|         self._session = requests.Session() |  | ||||||
|  |  | ||||||
|     def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True): |  | ||||||
|         logger.info(f"    Synchronizing to {to_dir} using the Norbert synchronizer.") |  | ||||||
|  |  | ||||||
|         sync_path = pathlib.Path(self.base_path, to_dir) |  | ||||||
|         orga = Organizer(self.base_path, sync_path) |  | ||||||
|  |  | ||||||
|         orga.clean_temp_dir() |  | ||||||
|  |  | ||||||
|         files = self._crawl() |  | ||||||
|         self._download(orga, files, transform, unzip) |  | ||||||
|  |  | ||||||
|         orga.clean_sync_dir() |  | ||||||
|         orga.clean_temp_dir() |  | ||||||
|  |  | ||||||
|     def _crawl(self): |  | ||||||
|         url = self.BASE_URL |  | ||||||
|         r = self._session.get(url) |  | ||||||
|  |  | ||||||
|         # replace undecodeable characters with a placeholder |  | ||||||
|         #text = r.raw.decode("utf-8", "replace") |  | ||||||
|  |  | ||||||
|         text = r.text |  | ||||||
|         soup = bs4.BeautifulSoup(text, "html.parser") |  | ||||||
|  |  | ||||||
|         files = [] |  | ||||||
|  |  | ||||||
|         for found in soup.find_all("a", href=self.LINK_RE): |  | ||||||
|             url = found["href"] |  | ||||||
|             full_url = self.BASE_URL + url |  | ||||||
|  |  | ||||||
|             filename = re.search(self.LINK_RE, url).group(1) |  | ||||||
|             path = pathlib.PurePath(filename) |  | ||||||
|  |  | ||||||
|             logger.debug(f"Found zip file {filename} at {full_url}") |  | ||||||
|             files.append((path, full_url)) |  | ||||||
|  |  | ||||||
|         return files |  | ||||||
|  |  | ||||||
|     def _download(self, orga, files, transform, unzip): |  | ||||||
|         for path, url in sorted(files): |  | ||||||
|             # Yes, we want the zip file contents |  | ||||||
|             if unzip(path): |  | ||||||
|                 logger.debug(f"Downloading and unzipping {path}") |  | ||||||
|                 zip_path = rename(path, path.stem) |  | ||||||
|  |  | ||||||
|                 # Download zip file |  | ||||||
|                 temp_file = orga.temp_file() |  | ||||||
|                 self._download_zip(url, temp_file) |  | ||||||
|  |  | ||||||
|                 # Search the zip file for files to extract |  | ||||||
|                 temp_dir = orga.temp_dir() |  | ||||||
|                 with zipfile.ZipFile(temp_file, "r") as zf: |  | ||||||
|                     for info in zf.infolist(): |  | ||||||
|                         # Only interested in the files themselves, the directory |  | ||||||
|                         # structure is created automatically by orga.add_file() |  | ||||||
|                         if info.is_dir(): |  | ||||||
|                             continue |  | ||||||
|  |  | ||||||
|                         file_path = zip_path / pathlib.PurePath(info.filename) |  | ||||||
|                         logger.debug(f"Found {info.filename} at path {file_path}") |  | ||||||
|  |  | ||||||
|                         new_path = transform(file_path) |  | ||||||
|                         if new_path is not None: |  | ||||||
|                             # Extract to temp file and add, the usual deal |  | ||||||
|                             temp_file = orga.temp_file() |  | ||||||
|                             extracted_path = zf.extract(info, temp_dir) |  | ||||||
|                             extracted_path = pathlib.Path(extracted_path) |  | ||||||
|                             orga.add_file(extracted_path, new_path) |  | ||||||
|  |  | ||||||
|             # No, we only want the zip file itself |  | ||||||
|             else: |  | ||||||
|                 logger.debug(f"Only downloading {path}") |  | ||||||
|  |  | ||||||
|                 new_path = transform(path) |  | ||||||
|                 if new_path is not None: |  | ||||||
|                     temp_file = orga.temp_file() |  | ||||||
|                     self._download_zip(url, temp_file) |  | ||||||
|                     orga.add_file(temp_file, new_path) |  | ||||||
|  |  | ||||||
|     def _download_zip(self, url, to_path): |  | ||||||
|         with self._session.get(url, stream=True) as r: |  | ||||||
|             stream_to_path(r, to_path) |  | ||||||
| @@ -1,150 +0,0 @@ | |||||||
| import filecmp |  | ||||||
| import logging |  | ||||||
| import pathlib |  | ||||||
| import shutil |  | ||||||
|  |  | ||||||
| from . import utils |  | ||||||
|  |  | ||||||
| __all__ = ["Organizer"] |  | ||||||
| logger = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
| class Organizer: |  | ||||||
|     def __init__(self, base_dir, sync_dir): |  | ||||||
|         """ |  | ||||||
|         base_dir - the .tmp directory will be created here |  | ||||||
|         sync_dir - synced files will be moved here |  | ||||||
|         Both are expected to be concrete pathlib paths. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self._base_dir = base_dir |  | ||||||
|         self._sync_dir = sync_dir |  | ||||||
|  |  | ||||||
|         self._temp_dir = pathlib.Path(self._base_dir, ".tmp") |  | ||||||
|         self._temp_nr = 0 |  | ||||||
|  |  | ||||||
|         # check if base/sync dir exist? |  | ||||||
|  |  | ||||||
|         self._added_files = set() |  | ||||||
|  |  | ||||||
|     def clean_temp_dir(self): |  | ||||||
|         if self._temp_dir.exists(): |  | ||||||
|             shutil.rmtree(self._temp_dir) |  | ||||||
|         self._temp_dir.mkdir(exist_ok=True) |  | ||||||
|         logger.debug(f"Cleaned temp dir: {self._temp_dir}") |  | ||||||
|  |  | ||||||
|     def temp_dir(self): |  | ||||||
|         nr = self._temp_nr |  | ||||||
|         self._temp_nr += 1 |  | ||||||
|         temp_dir = pathlib.Path(self._temp_dir, f"{nr:08}").resolve() |  | ||||||
|         logger.debug(f"Produced new temp dir: {temp_dir}") |  | ||||||
|         return temp_dir |  | ||||||
|  |  | ||||||
|     def temp_file(self): |  | ||||||
|         # generate the path to a new temp file in base_path/.tmp/ |  | ||||||
|         # make sure no two paths are the same |  | ||||||
|         nr = self._temp_nr |  | ||||||
|         self._temp_nr += 1 |  | ||||||
|         temp_file =  pathlib.Path(self._temp_dir, f"{nr:08}.tmp").resolve() |  | ||||||
|         logger.debug(f"Produced new temp file: {temp_file}") |  | ||||||
|         return temp_file |  | ||||||
|  |  | ||||||
|     def add_file(self, from_path, to_path): |  | ||||||
|         if not from_path.exists(): |  | ||||||
|             raise utils.FileNotFoundException(f"Could not add file at {from_path}") |  | ||||||
|  |  | ||||||
|         # check if sync_dir/to_path is inside sync_dir? |  | ||||||
|         to_path = pathlib.Path(self._sync_dir, to_path) |  | ||||||
|  |  | ||||||
|         if to_path.exists() and to_path.is_dir(): |  | ||||||
|             if self._prompt_yes_no(f"Overwrite folder {to_path} with file?", default=False): |  | ||||||
|                 shutil.rmtree(to_path) |  | ||||||
|             else: |  | ||||||
|                 logger.warn(f"Could not add file {to_path}") |  | ||||||
|                 return |  | ||||||
|  |  | ||||||
|         if to_path.exists(): |  | ||||||
|             if filecmp.cmp(from_path, to_path, shallow=False): |  | ||||||
|                 logger.info(f"Ignored {to_path}") |  | ||||||
|  |  | ||||||
|                 # remember path for later reference |  | ||||||
|                 self._added_files.add(to_path.resolve()) |  | ||||||
|                 logger.debug(f"Added file {to_path.resolve()}") |  | ||||||
|  |  | ||||||
|                 # No further action needed, especially not overwriting symlinks... |  | ||||||
|                 return |  | ||||||
|             else: |  | ||||||
|                 logger.info(f"Different file at {to_path}") |  | ||||||
|         else: |  | ||||||
|             logger.info(f"New file at {to_path}") |  | ||||||
|  |  | ||||||
|         # copy the file from from_path to sync_dir/to_path |  | ||||||
|         # If the file being replaced was a symlink, the link itself is overwritten, |  | ||||||
|         # not the file the link points to. |  | ||||||
|         to_path.parent.mkdir(parents=True, exist_ok=True) |  | ||||||
|         from_path.replace(to_path) |  | ||||||
|         logger.debug(f"Moved {from_path} to {to_path}") |  | ||||||
|  |  | ||||||
|         # remember path for later reference, after the new file was written |  | ||||||
|         # This is necessary here because otherwise, resolve() would resolve the symlink too. |  | ||||||
|         self._added_files.add(to_path.resolve()) |  | ||||||
|         logger.debug(f"Added file {to_path.resolve()}") |  | ||||||
|  |  | ||||||
|     def clean_sync_dir(self): |  | ||||||
|         self._clean_dir(self._sync_dir, remove_parent=False) |  | ||||||
|         logger.debug(f"Cleaned sync dir: {self._sync_dir}") |  | ||||||
|  |  | ||||||
|     def _clean_dir(self, path, remove_parent=True): |  | ||||||
|         for child in sorted(path.iterdir()): |  | ||||||
|             logger.debug(f"Looking at {child.resolve()}") |  | ||||||
|             if child.is_dir(): |  | ||||||
|                 self._clean_dir(child, remove_parent=True) |  | ||||||
|             elif child.resolve() not in self._added_files: |  | ||||||
|                 if self._prompt_yes_no(f"Delete {child}?", default=False): |  | ||||||
|                     child.unlink() |  | ||||||
|                     logger.debug(f"Deleted {child}") |  | ||||||
|  |  | ||||||
|         if remove_parent: |  | ||||||
|             try: |  | ||||||
|                 path.rmdir() |  | ||||||
|             except OSError: # directory not empty |  | ||||||
|                 pass |  | ||||||
|  |  | ||||||
|     def _prompt_yes_no(self, question, default=None): |  | ||||||
|         if default is True: |  | ||||||
|             prompt = "[Y/n]" |  | ||||||
|         elif default is False: |  | ||||||
|             prompt = "[y/N]" |  | ||||||
|         else: |  | ||||||
|             prompt = "[y/n]" |  | ||||||
|  |  | ||||||
|         text = f"{question} {prompt} " |  | ||||||
|         WRONG_REPLY = "Please reply with 'yes'/'y' or 'no'/'n'." |  | ||||||
|  |  | ||||||
|         while True: |  | ||||||
|             response = input(text).strip().lower() |  | ||||||
|             if response in {"yes", "ye", "y"}: |  | ||||||
|                 return True |  | ||||||
|             elif response in {"no", "n"}: |  | ||||||
|                 return False |  | ||||||
|             elif response == "": |  | ||||||
|                 if default is None: |  | ||||||
|                     print(WRONG_REPLY) |  | ||||||
|                 else: |  | ||||||
|                     return default |  | ||||||
|             else: |  | ||||||
|                 print(WRONG_REPLY) |  | ||||||
|  |  | ||||||
| # How to use: |  | ||||||
| # |  | ||||||
| # 1. Before downloading any files |  | ||||||
| # orga = Organizer("/home/user/sync/", "/home/user/sync/bookstore/") |  | ||||||
| # orga.clean_temp_dir() |  | ||||||
| # |  | ||||||
| # 2. Downloading a file |  | ||||||
| # tempfile = orga.temp_file() |  | ||||||
| # download_something_to(tempfile) |  | ||||||
| # orga.add_file(tempfile, "books/douglas_adams/hhgttg") |  | ||||||
| # |  | ||||||
| # 3. After downloading all files |  | ||||||
| # orga.clean_sync_dir() |  | ||||||
| # orga.clean_temp_dir() |  | ||||||
							
								
								
									
										545
									
								
								PFERD/output_dir.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										545
									
								
								PFERD/output_dir.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,545 @@ | |||||||
|  | import filecmp | ||||||
|  | import json | ||||||
|  | import os | ||||||
|  | import random | ||||||
|  | import shutil | ||||||
|  | import string | ||||||
|  | from contextlib import contextmanager | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from datetime import datetime | ||||||
|  | from enum import Enum | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import BinaryIO, Iterator, Optional, Tuple | ||||||
|  |  | ||||||
|  | from .logging import log | ||||||
|  | from .report import Report, ReportLoadError | ||||||
|  | from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no | ||||||
|  |  | ||||||
|  | # NOTE(review): these constants are not used in the visible part of this file. | ||||||
|  | # Presumably they describe the random suffix of temporary file names (alphabet | ||||||
|  | # and length) and the number of attempts at finding a free name - confirm | ||||||
|  | # against the code that uses them. | ||||||
|  | SUFFIX_CHARS = string.ascii_lowercase + string.digits | ||||||
|  | SUFFIX_LENGTH = 6 | ||||||
|  | TRIES = 5 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class OutputDirError(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class Redownload(Enum): | ||||||
|  |     NEVER = "never" | ||||||
|  |     NEVER_SMART = "never-smart" | ||||||
|  |     ALWAYS = "always" | ||||||
|  |     ALWAYS_SMART = "always-smart" | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     def from_string(string: str) -> "Redownload": | ||||||
|  |         try: | ||||||
|  |             return Redownload(string) | ||||||
|  |         except ValueError: | ||||||
|  |             raise ValueError("must be one of 'never', 'never-smart'," | ||||||
|  |                              " 'always', 'always-smart'") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class OnConflict(Enum): | ||||||
|  |     PROMPT = "prompt" | ||||||
|  |     LOCAL_FIRST = "local-first" | ||||||
|  |     REMOTE_FIRST = "remote-first" | ||||||
|  |     NO_DELETE = "no-delete" | ||||||
|  |     NO_DELETE_PROMPT_OVERWRITE = "no-delete-prompt-overwrite" | ||||||
|  |  | ||||||
|  |     @staticmethod | ||||||
|  |     def from_string(string: str) -> "OnConflict": | ||||||
|  |         try: | ||||||
|  |             return OnConflict(string) | ||||||
|  |         except ValueError: | ||||||
|  |             raise ValueError("must be one of 'prompt', 'local-first'," | ||||||
|  |                              " 'remote-first', 'no-delete', 'no-delete-prompt-overwrite'") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @dataclass | ||||||
|  | class Heuristics: | ||||||
|  |     # NOTE(review): presumably whether the remote file's ETag differs from a | ||||||
|  |     # previously seen one, with None meaning "unknown" - confirm against callers | ||||||
|  |     etag_differs: Optional[bool] | ||||||
|  |     # NOTE(review): presumably the remote file's modification time, with None | ||||||
|  |     # meaning "unknown" - confirm against callers | ||||||
|  |     mtime: Optional[datetime] | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class FileSink: | ||||||
|  |     def __init__(self, file: BinaryIO): | ||||||
|  |         self._file = file | ||||||
|  |         self._done = False | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def file(self) -> BinaryIO: | ||||||
|  |         return self._file | ||||||
|  |  | ||||||
|  |     def done(self) -> None: | ||||||
|  |         self._done = True | ||||||
|  |  | ||||||
|  |     def is_done(self) -> bool: | ||||||
|  |         return self._done | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @dataclass | ||||||
|  | class DownloadInfo: | ||||||
|  |     # Information about a finished download attempt. FileSinkToken constructs | ||||||
|  |     # this positionally, so the order of the fields matters. | ||||||
|  |     remote_path: PurePath | ||||||
|  |     path: PurePath | ||||||
|  |     local_path: Path | ||||||
|  |     # The temporary file the download was written to | ||||||
|  |     tmp_path: Path | ||||||
|  |     heuristics: Heuristics | ||||||
|  |     on_conflict: OnConflict | ||||||
|  |     # FileSinkToken passes the sink's is_done() value here | ||||||
|  |     success: bool = False | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class FileSinkToken(ReusableAsyncContextManager[FileSink]): | ||||||
|  |     # Whenever this class is entered, it creates a new temporary file and | ||||||
|  |     # returns a corresponding FileSink. | ||||||
|  |     # | ||||||
|  |     # When it is exited again, the file is closed and information about the | ||||||
|  |     # download handed back to the OutputDirectory. | ||||||
|  |  | ||||||
|  |     def __init__( | ||||||
|  |             self, | ||||||
|  |             output_dir: "OutputDirectory", | ||||||
|  |             remote_path: PurePath, | ||||||
|  |             path: PurePath, | ||||||
|  |             local_path: Path, | ||||||
|  |             heuristics: Heuristics, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |     ): | ||||||
|  |         super().__init__() | ||||||
|  |  | ||||||
|  |         # Everything below is only stored here; it is passed on to the | ||||||
|  |         # OutputDirectory as part of a DownloadInfo after the download. | ||||||
|  |         self._output_dir = output_dir | ||||||
|  |         self._remote_path = remote_path | ||||||
|  |         self._path = path | ||||||
|  |         self._local_path = local_path | ||||||
|  |         self._heuristics = heuristics | ||||||
|  |         self._on_conflict = on_conflict | ||||||
|  |  | ||||||
|  |     async def _on_aenter(self) -> FileSink: | ||||||
|  |         # Create the temporary file and the sink that wraps it | ||||||
|  |         tmp_path, file = await self._output_dir._create_tmp_file(self._local_path) | ||||||
|  |         sink = FileSink(file) | ||||||
|  |  | ||||||
|  |         async def after_download() -> None: | ||||||
|  |             # sink.is_done() is evaluated when this callback runs, not when it | ||||||
|  |             # is registered, so it reflects whether the user of the sink | ||||||
|  |             # called done() before exiting. | ||||||
|  |             await self._output_dir._after_download(DownloadInfo( | ||||||
|  |                 self._remote_path, | ||||||
|  |                 self._path, | ||||||
|  |                 self._local_path, | ||||||
|  |                 tmp_path, | ||||||
|  |                 self._heuristics, | ||||||
|  |                 self._on_conflict, | ||||||
|  |                 sink.is_done(), | ||||||
|  |             )) | ||||||
|  |  | ||||||
|  |         # NOTE(review): self._stack is not defined in this class; presumably | ||||||
|  |         # it is an AsyncExitStack provided by ReusableAsyncContextManager. If | ||||||
|  |         # so, it unwinds in reverse order: the file (entered last) is closed | ||||||
|  |         # first and after_download runs afterwards. Keep this order - confirm | ||||||
|  |         # against the base class. | ||||||
|  |         self._stack.push_async_callback(after_download) | ||||||
|  |         self._stack.enter_context(file) | ||||||
|  |  | ||||||
|  |         return sink | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class OutputDirectory: | ||||||
|  |     REPORT_FILE = PurePath(".report") | ||||||
|  |  | ||||||
|  |     def __init__( | ||||||
|  |             self, | ||||||
|  |             root: Path, | ||||||
|  |             redownload: Redownload, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |     ): | ||||||
|  |         if os.name == "nt": | ||||||
|  |             # Windows limits the path length to 260 for some historical reason. | ||||||
|  |             # If you want longer paths, you will have to add the "\\?\" prefix | ||||||
|  |             # in front of your path. See: | ||||||
|  |             # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation | ||||||
|  |             self._root = Path("\\\\?\\" + str(root.absolute())) | ||||||
|  |         else: | ||||||
|  |             self._root = root | ||||||
|  |  | ||||||
|  |         self._redownload = redownload | ||||||
|  |         self._on_conflict = on_conflict | ||||||
|  |  | ||||||
|  |         self._report_path = self.resolve(self.REPORT_FILE) | ||||||
|  |         self._report = Report() | ||||||
|  |         self._prev_report: Optional[Report] = None | ||||||
|  |  | ||||||
|  |         self.register_reserved(self.REPORT_FILE) | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def report(self) -> Report: | ||||||
|  |         return self._report | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def prev_report(self) -> Optional[Report]: | ||||||
|  |         return self._prev_report | ||||||
|  |  | ||||||
|  |     def prepare(self) -> None: | ||||||
|  |         log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}") | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             self._root.mkdir(parents=True, exist_ok=True) | ||||||
|  |         except OSError: | ||||||
|  |             raise OutputDirError("Failed to create base directory") | ||||||
|  |  | ||||||
|  |     def register_reserved(self, path: PurePath) -> None: | ||||||
|  |         self._report.mark_reserved(path) | ||||||
|  |  | ||||||
|  |     def resolve(self, path: PurePath) -> Path: | ||||||
|  |         """ | ||||||
|  |         May throw an OutputDirError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if ".." in path.parts: | ||||||
|  |             raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}") | ||||||
|  |         if "." in path.parts: | ||||||
|  |             raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}") | ||||||
|  |  | ||||||
|  |         return self._root / path | ||||||
|  |  | ||||||
|  |     def _should_download( | ||||||
|  |             self, | ||||||
|  |             local_path: Path, | ||||||
|  |             heuristics: Heuristics, | ||||||
|  |             redownload: Redownload, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |     ) -> bool: | ||||||
|  |         if not local_path.exists(): | ||||||
|  |             log.explain("No corresponding file present locally") | ||||||
|  |             return True | ||||||
|  |  | ||||||
|  |         if on_conflict == OnConflict.LOCAL_FIRST: | ||||||
|  |             # Whatever is here, it will never be overwritten, so we don't need | ||||||
|  |             # to download the file. | ||||||
|  |             log.explain("Conflict resolution is 'local-first' and path exists") | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |         if not local_path.is_file(): | ||||||
|  |             # We know that there is *something* here that's not a file. | ||||||
|  |             log.explain("Non-file (probably a directory) present locally") | ||||||
|  |  | ||||||
|  |             # If on_conflict is LOCAL_FIRST or NO_DELETE, we know that it would | ||||||
|  |             # never be overwritten. It also doesn't have any relevant stats to | ||||||
|  |             # update. This means that we don't have to download the file | ||||||
|  |             # because we'd just always throw it away again. | ||||||
|  |             if on_conflict in {OnConflict.LOCAL_FIRST, OnConflict.NO_DELETE}: | ||||||
|  |                 log.explain(f"Conflict resolution is {on_conflict.value!r}") | ||||||
|  |                 return False | ||||||
|  |  | ||||||
|  |             return True | ||||||
|  |  | ||||||
|  |         log.explain(f"Redownload policy is {redownload.value}") | ||||||
|  |  | ||||||
|  |         if redownload == Redownload.NEVER: | ||||||
|  |             return False | ||||||
|  |         elif redownload == Redownload.ALWAYS: | ||||||
|  |             return True | ||||||
|  |  | ||||||
|  |         stat = local_path.stat() | ||||||
|  |  | ||||||
|  |         remote_newer = None | ||||||
|  |  | ||||||
|  |         # ETag should be a more reliable indicator than mtime, so we check it first | ||||||
|  |         if heuristics.etag_differs is not None: | ||||||
|  |             remote_newer = heuristics.etag_differs | ||||||
|  |             if remote_newer: | ||||||
|  |                 log.explain("Remote file's entity tag differs") | ||||||
|  |             else: | ||||||
|  |                 log.explain("Remote file's entity tag is the same") | ||||||
|  |  | ||||||
|  |         # Python on Windows crashes when faced with timestamps around the unix epoch | ||||||
|  |         if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): | ||||||
|  |             mtime = heuristics.mtime | ||||||
|  |             remote_newer = mtime.timestamp() > stat.st_mtime | ||||||
|  |             if remote_newer: | ||||||
|  |                 log.explain("Remote file seems to be newer") | ||||||
|  |             else: | ||||||
|  |                 log.explain("Remote file doesn't seem to be newer") | ||||||
|  |  | ||||||
|  |         if redownload == Redownload.NEVER_SMART: | ||||||
|  |             if remote_newer is None: | ||||||
|  |                 return False | ||||||
|  |             else: | ||||||
|  |                 return remote_newer | ||||||
|  |         elif redownload == Redownload.ALWAYS_SMART: | ||||||
|  |             if remote_newer is None: | ||||||
|  |                 return True | ||||||
|  |             else: | ||||||
|  |                 return remote_newer | ||||||
|  |  | ||||||
|  |         # This should never be reached | ||||||
|  |         raise ValueError(f"{redownload!r} is not a valid redownload policy") | ||||||
|  |  | ||||||
|  |     # The following conflict resolution functions all return False if the local | ||||||
|  |     # file(s) should be kept and True if they should be replaced by the remote | ||||||
|  |     # files. | ||||||
|  |  | ||||||
|  |     async def _conflict_lfrf( | ||||||
|  |             self, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |             path: PurePath, | ||||||
|  |     ) -> bool: | ||||||
|  |         if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: | ||||||
|  |             async with log.exclusive_output(): | ||||||
|  |                 prompt = f"Replace {fmt_path(path)} with remote file?" | ||||||
|  |                 return await prompt_yes_no(prompt, default=False) | ||||||
|  |         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||||
|  |             return False | ||||||
|  |         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||||
|  |             return True | ||||||
|  |         elif on_conflict == OnConflict.NO_DELETE: | ||||||
|  |             return True | ||||||
|  |  | ||||||
|  |         # This should never be reached | ||||||
|  |         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||||
|  |  | ||||||
|  |     async def _conflict_ldrf( | ||||||
|  |             self, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |             path: PurePath, | ||||||
|  |     ) -> bool: | ||||||
|  |         if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: | ||||||
|  |             async with log.exclusive_output(): | ||||||
|  |                 prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" | ||||||
|  |                 return await prompt_yes_no(prompt, default=False) | ||||||
|  |         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||||
|  |             return False | ||||||
|  |         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||||
|  |             return True | ||||||
|  |         elif on_conflict == OnConflict.NO_DELETE: | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |         # This should never be reached | ||||||
|  |         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||||
|  |  | ||||||
|  |     async def _conflict_lfrd( | ||||||
|  |             self, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |             path: PurePath, | ||||||
|  |             parent: PurePath, | ||||||
|  |     ) -> bool: | ||||||
|  |         if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: | ||||||
|  |             async with log.exclusive_output(): | ||||||
|  |                 prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" | ||||||
|  |                 return await prompt_yes_no(prompt, default=False) | ||||||
|  |         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||||
|  |             return False | ||||||
|  |         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||||
|  |             return True | ||||||
|  |         elif on_conflict == OnConflict.NO_DELETE: | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |         # This should never be reached | ||||||
|  |         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||||
|  |  | ||||||
|  |     async def _conflict_delete_lf( | ||||||
|  |             self, | ||||||
|  |             on_conflict: OnConflict, | ||||||
|  |             path: PurePath, | ||||||
|  |     ) -> bool: | ||||||
|  |         if on_conflict == OnConflict.PROMPT: | ||||||
|  |             async with log.exclusive_output(): | ||||||
|  |                 prompt = f"Delete {fmt_path(path)}?" | ||||||
|  |                 return await prompt_yes_no(prompt, default=False) | ||||||
|  |         elif on_conflict == OnConflict.LOCAL_FIRST: | ||||||
|  |             return False | ||||||
|  |         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||||
|  |             return True | ||||||
|  |         elif on_conflict in {OnConflict.NO_DELETE, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |         # This should never be reached | ||||||
|  |         raise ValueError(f"{on_conflict!r} is not a valid conflict policy") | ||||||
|  |  | ||||||
|  |     def _tmp_path(self, base: Path, suffix_length: int) -> Path: | ||||||
|  |         prefix = "" if base.name.startswith(".") else "." | ||||||
|  |         suffix = "".join(random.choices(SUFFIX_CHARS, k=suffix_length)) | ||||||
|  |         name = f"{prefix}{base.name}.tmp.{suffix}" | ||||||
|  |         return base.parent / name | ||||||
|  |  | ||||||
|  |     async def _create_tmp_file( | ||||||
|  |             self, | ||||||
|  |             local_path: Path, | ||||||
|  |     ) -> Tuple[Path, BinaryIO]: | ||||||
|  |         """ | ||||||
|  |         May raise an OutputDirError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         # Create tmp file | ||||||
|  |         for attempt in range(TRIES): | ||||||
|  |             suffix_length = SUFFIX_LENGTH + 2 * attempt | ||||||
|  |             tmp_path = self._tmp_path(local_path, suffix_length) | ||||||
|  |             try: | ||||||
|  |                 return tmp_path, open(tmp_path, "xb") | ||||||
|  |             except FileExistsError: | ||||||
|  |                 pass  # Try again | ||||||
|  |  | ||||||
|  |         raise OutputDirError("Failed to create temporary file") | ||||||
|  |  | ||||||
|  |     def should_try_download( | ||||||
|  |         self, | ||||||
|  |         path: PurePath, | ||||||
|  |         *, | ||||||
|  |         etag_differs: Optional[bool] = None, | ||||||
|  |         mtime: Optional[datetime] = None, | ||||||
|  |         redownload: Optional[Redownload] = None, | ||||||
|  |         on_conflict: Optional[OnConflict] = None, | ||||||
|  |     ) -> bool: | ||||||
|  |         heuristics = Heuristics(etag_differs, mtime) | ||||||
|  |         redownload = self._redownload if redownload is None else redownload | ||||||
|  |         on_conflict = self._on_conflict if on_conflict is None else on_conflict | ||||||
|  |         local_path = self.resolve(path) | ||||||
|  |  | ||||||
|  |         return self._should_download(local_path, heuristics, redownload, on_conflict) | ||||||
|  |  | ||||||
|  |     async def download( | ||||||
|  |             self, | ||||||
|  |             remote_path: PurePath, | ||||||
|  |             path: PurePath, | ||||||
|  |             *, | ||||||
|  |             etag_differs: Optional[bool] = None, | ||||||
|  |             mtime: Optional[datetime] = None, | ||||||
|  |             redownload: Optional[Redownload] = None, | ||||||
|  |             on_conflict: Optional[OnConflict] = None, | ||||||
|  |     ) -> Optional[FileSinkToken]: | ||||||
|  |         """ | ||||||
|  |         May throw an OutputDirError, a MarkDuplicateError or a | ||||||
|  |         MarkConflictError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         heuristics = Heuristics(etag_differs, mtime) | ||||||
|  |         redownload = self._redownload if redownload is None else redownload | ||||||
|  |         on_conflict = self._on_conflict if on_conflict is None else on_conflict | ||||||
|  |         local_path = self.resolve(path) | ||||||
|  |  | ||||||
|  |         self._report.mark(path) | ||||||
|  |  | ||||||
|  |         if not self._should_download(local_path, heuristics, redownload, on_conflict): | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         # Detect and solve local-dir-remote-file conflict | ||||||
|  |         if local_path.is_dir(): | ||||||
|  |             log.explain("Conflict: There's a directory in place of the local file") | ||||||
|  |             if await self._conflict_ldrf(on_conflict, path): | ||||||
|  |                 log.explain("Result: Delete the obstructing directory") | ||||||
|  |                 shutil.rmtree(local_path) | ||||||
|  |             else: | ||||||
|  |                 log.explain("Result: Keep the obstructing directory") | ||||||
|  |                 return None | ||||||
|  |  | ||||||
|  |         # Detect and solve local-file-remote-dir conflict | ||||||
|  |         for parent in path.parents: | ||||||
|  |             local_parent = self.resolve(parent) | ||||||
|  |             if local_parent.exists() and not local_parent.is_dir(): | ||||||
|  |                 log.explain("Conflict: One of the local file's parents is a file") | ||||||
|  |                 if await self._conflict_lfrd(on_conflict, path, parent): | ||||||
|  |                     log.explain("Result: Delete the obstructing file") | ||||||
|  |                     local_parent.unlink() | ||||||
|  |                     break | ||||||
|  |                 else: | ||||||
|  |                     log.explain("Result: Keep the obstructing file") | ||||||
|  |                     return None | ||||||
|  |  | ||||||
|  |         # Ensure parent directory exists | ||||||
|  |         local_path.parent.mkdir(parents=True, exist_ok=True) | ||||||
|  |  | ||||||
|  |         return FileSinkToken(self, remote_path, path, local_path, heuristics, on_conflict) | ||||||
|  |  | ||||||
|  |     def _update_metadata(self, info: DownloadInfo) -> None: | ||||||
|  |         if mtime := info.heuristics.mtime: | ||||||
|  |             mtimestamp = mtime.timestamp() | ||||||
|  |             os.utime(info.local_path, times=(mtimestamp, mtimestamp)) | ||||||
|  |  | ||||||
|  |     @contextmanager | ||||||
|  |     def _ensure_deleted(self, path: Path) -> Iterator[None]: | ||||||
|  |         try: | ||||||
|  |             yield | ||||||
|  |         finally: | ||||||
|  |             path.unlink(missing_ok=True) | ||||||
|  |  | ||||||
|  |     async def _after_download(self, info: DownloadInfo) -> None: | ||||||
|  |         with self._ensure_deleted(info.tmp_path): | ||||||
|  |             log.status("[bold cyan]", "Downloaded", fmt_path(info.remote_path)) | ||||||
|  |             log.explain_topic(f"Processing downloaded file for {fmt_path(info.path)}") | ||||||
|  |  | ||||||
|  |             changed = False | ||||||
|  |  | ||||||
|  |             if not info.success: | ||||||
|  |                 log.explain("Download unsuccessful, aborting") | ||||||
|  |                 return | ||||||
|  |  | ||||||
|  |             # Solve conflicts arising from existing local file | ||||||
|  |             if info.local_path.exists(): | ||||||
|  |                 changed = True | ||||||
|  |  | ||||||
|  |                 if filecmp.cmp(info.local_path, info.tmp_path): | ||||||
|  |                     log.explain("Contents identical with existing file") | ||||||
|  |                     log.explain("Updating metadata of existing file") | ||||||
|  |                     self._update_metadata(info) | ||||||
|  |                     return | ||||||
|  |  | ||||||
|  |                 log.explain("Conflict: The local and remote versions differ") | ||||||
|  |                 if await self._conflict_lfrf(info.on_conflict, info.path): | ||||||
|  |                     log.explain("Result: Replacing local with remote version") | ||||||
|  |                 else: | ||||||
|  |                     log.explain("Result: Keeping local version") | ||||||
|  |                     return | ||||||
|  |  | ||||||
|  |             info.tmp_path.replace(info.local_path) | ||||||
|  |             log.explain("Updating file metadata") | ||||||
|  |             self._update_metadata(info) | ||||||
|  |  | ||||||
|  |             if changed: | ||||||
|  |                 log.status("[bold bright_yellow]", "Changed", fmt_path(info.path)) | ||||||
|  |                 self._report.change_file(info.path) | ||||||
|  |             else: | ||||||
|  |                 log.status("[bold bright_green]", "Added", fmt_path(info.path)) | ||||||
|  |                 self._report.add_file(info.path) | ||||||
|  |  | ||||||
|  |     async def cleanup(self) -> None: | ||||||
|  |         await self._cleanup_dir(self._root, PurePath(), delete_self=False) | ||||||
|  |  | ||||||
|  |     async def _cleanup(self, path: Path, pure: PurePath) -> None: | ||||||
|  |         if path.is_dir(): | ||||||
|  |             await self._cleanup_dir(path, pure) | ||||||
|  |         elif path.is_file(): | ||||||
|  |             await self._cleanup_file(path, pure) | ||||||
|  |  | ||||||
|  |     async def _cleanup_dir(self, path: Path, pure: PurePath, delete_self: bool = True) -> None: | ||||||
|  |         for child in sorted(path.iterdir()): | ||||||
|  |             pure_child = pure / child.name | ||||||
|  |             await self._cleanup(child, pure_child) | ||||||
|  |  | ||||||
|  |         if delete_self: | ||||||
|  |             try: | ||||||
|  |                 path.rmdir() | ||||||
|  |             except OSError: | ||||||
|  |                 pass | ||||||
|  |  | ||||||
|  |     async def _cleanup_file(self, path: Path, pure: PurePath) -> None: | ||||||
|  |         if self._report.is_marked(pure): | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         if await self._conflict_delete_lf(self._on_conflict, pure): | ||||||
|  |             try: | ||||||
|  |                 path.unlink() | ||||||
|  |                 log.status("[bold bright_magenta]", "Deleted", fmt_path(pure)) | ||||||
|  |                 self._report.delete_file(pure) | ||||||
|  |             except OSError: | ||||||
|  |                 pass | ||||||
|  |         else: | ||||||
|  |             log.not_deleted("[bold bright_magenta]", "Not deleted", fmt_path(pure)) | ||||||
|  |             self._report.not_delete_file(pure) | ||||||
|  |  | ||||||
|  |     def load_prev_report(self) -> None: | ||||||
|  |         log.explain_topic(f"Loading previous report from {fmt_real_path(self._report_path)}") | ||||||
|  |         try: | ||||||
|  |             self._prev_report = Report.load(self._report_path) | ||||||
|  |             log.explain("Loaded report successfully") | ||||||
|  |         except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e: | ||||||
|  |             log.explain("Failed to load report") | ||||||
|  |             log.explain(str(e)) | ||||||
|  |  | ||||||
|  |     def store_report(self) -> None: | ||||||
|  |         log.explain_topic(f"Storing report to {fmt_real_path(self._report_path)}") | ||||||
|  |         try: | ||||||
|  |             self._report.store(self._report_path) | ||||||
|  |             log.explain("Stored report successfully") | ||||||
|  |         except OSError as e: | ||||||
|  |             log.warn(f"Failed to save report to {fmt_real_path(self._report_path)}") | ||||||
|  |             log.warn_contd(str(e)) | ||||||
							
								
								
									
										199
									
								
								PFERD/pferd.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										199
									
								
								PFERD/pferd.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,199 @@ | |||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import Dict, List, Optional | ||||||
|  |  | ||||||
|  | from rich.markup import escape | ||||||
|  |  | ||||||
|  | from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection | ||||||
|  | from .config import Config, ConfigOptionError | ||||||
|  | from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler | ||||||
|  | from .logging import log | ||||||
|  | from .utils import fmt_path | ||||||
|  |  | ||||||
|  |  | ||||||
class PferdLoadError(Exception):
    """
    Raised when the selection of crawlers to run is invalid.
    """
|  |  | ||||||
|  |  | ||||||
|  | class Pferd: | ||||||
|  |     def __init__(self, config: Config, cli_crawlers: Optional[List[str]], cli_skips: Optional[List[str]]): | ||||||
|  |         """ | ||||||
|  |         May throw PferdLoadError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         self._config = config | ||||||
|  |         self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers, cli_skips) | ||||||
|  |  | ||||||
|  |         self._authenticators: Dict[str, Authenticator] = {} | ||||||
|  |         self._crawlers: Dict[str, Crawler] = {} | ||||||
|  |  | ||||||
|  |     def _find_config_crawlers(self, config: Config) -> List[str]: | ||||||
|  |         crawl_sections = [] | ||||||
|  |  | ||||||
|  |         for name, section in config.crawl_sections(): | ||||||
|  |             if CrawlerSection(section).skip(): | ||||||
|  |                 log.explain(f"Skipping {name!r}") | ||||||
|  |             else: | ||||||
|  |                 crawl_sections.append(name) | ||||||
|  |  | ||||||
|  |         return crawl_sections | ||||||
|  |  | ||||||
|  |     def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]: | ||||||
|  |         if len(cli_crawlers) != len(set(cli_crawlers)): | ||||||
|  |             raise PferdLoadError("Some crawlers were selected multiple times") | ||||||
|  |  | ||||||
|  |         crawl_sections = [name for name, _ in config.crawl_sections()] | ||||||
|  |  | ||||||
|  |         crawlers_to_run = []  # With crawl: prefix | ||||||
|  |         unknown_names = []  # Without crawl: prefix | ||||||
|  |  | ||||||
|  |         for name in cli_crawlers: | ||||||
|  |             section_name = f"crawl:{name}" | ||||||
|  |             if section_name in crawl_sections: | ||||||
|  |                 log.explain(f"Crawler section named {section_name!r} exists") | ||||||
|  |                 crawlers_to_run.append(section_name) | ||||||
|  |             else: | ||||||
|  |                 log.explain(f"There's no crawler section named {section_name!r}") | ||||||
|  |                 unknown_names.append(name) | ||||||
|  |  | ||||||
|  |         if unknown_names: | ||||||
|  |             if len(unknown_names) == 1: | ||||||
|  |                 [name] = unknown_names | ||||||
|  |                 raise PferdLoadError(f"There is no crawler named {name!r}") | ||||||
|  |             else: | ||||||
|  |                 names_str = ", ".join(repr(name) for name in unknown_names) | ||||||
|  |                 raise PferdLoadError(f"There are no crawlers named {names_str}") | ||||||
|  |  | ||||||
|  |         return crawlers_to_run | ||||||
|  |  | ||||||
|  |     def _find_crawlers_to_run( | ||||||
|  |             self, | ||||||
|  |             config: Config, | ||||||
|  |             cli_crawlers: Optional[List[str]], | ||||||
|  |             cli_skips: Optional[List[str]], | ||||||
|  |     ) -> List[str]: | ||||||
|  |         log.explain_topic("Deciding which crawlers to run") | ||||||
|  |  | ||||||
|  |         crawlers: List[str] | ||||||
|  |         if cli_crawlers is None: | ||||||
|  |             log.explain("No crawlers specified on CLI") | ||||||
|  |             log.explain("Running crawlers specified in config") | ||||||
|  |             crawlers = self._find_config_crawlers(config) | ||||||
|  |         else: | ||||||
|  |             log.explain("Crawlers specified on CLI") | ||||||
|  |             crawlers = self._find_cli_crawlers(config, cli_crawlers) | ||||||
|  |  | ||||||
|  |         skips = {f"crawl:{name}" for name in cli_skips} if cli_skips else set() | ||||||
|  |         for crawler in crawlers: | ||||||
|  |             if crawler in skips: | ||||||
|  |                 log.explain(f"Skipping crawler {crawler!r}") | ||||||
|  |         crawlers = [crawler for crawler in crawlers if crawler not in skips] | ||||||
|  |  | ||||||
|  |         return crawlers | ||||||
|  |  | ||||||
|  |     def _load_authenticators(self) -> None: | ||||||
|  |         for name, section in self._config.auth_sections(): | ||||||
|  |             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") | ||||||
|  |  | ||||||
|  |             auth_type = AuthSection(section).type() | ||||||
|  |             authenticator_constructor = AUTHENTICATORS.get(auth_type) | ||||||
|  |             if authenticator_constructor is None: | ||||||
|  |                 raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}") | ||||||
|  |  | ||||||
|  |             authenticator = authenticator_constructor(name, section, self._config) | ||||||
|  |             self._authenticators[name] = authenticator | ||||||
|  |  | ||||||
|  |     def _load_crawlers(self) -> None: | ||||||
|  |         # Cookie sharing | ||||||
|  |         kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {} | ||||||
|  |  | ||||||
|  |         for name, section in self._config.crawl_sections(): | ||||||
|  |             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") | ||||||
|  |  | ||||||
|  |             crawl_type = CrawlerSection(section).type() | ||||||
|  |             crawler_constructor = CRAWLERS.get(crawl_type) | ||||||
|  |             if crawler_constructor is None: | ||||||
|  |                 raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}") | ||||||
|  |  | ||||||
|  |             crawler = crawler_constructor(name, section, self._config, self._authenticators) | ||||||
|  |             self._crawlers[name] = crawler | ||||||
|  |  | ||||||
|  |             if self._config.default_section.share_cookies(): | ||||||
|  |                 if isinstance(crawler, KitIliasWebCrawler): | ||||||
|  |                     crawler.share_cookies(kit_ilias_web_paths) | ||||||
|  |  | ||||||
|  |     def debug_transforms(self) -> None: | ||||||
|  |         for name in self._crawlers_to_run: | ||||||
|  |             crawler = self._crawlers[name] | ||||||
|  |             log.print("") | ||||||
|  |             log.print(f"[bold bright_cyan]Debugging transforms[/] for {escape(name)}") | ||||||
|  |             crawler.debug_transforms() | ||||||
|  |  | ||||||
|  |     async def run(self, debug_transforms: bool) -> None: | ||||||
|  |         """ | ||||||
|  |         May throw ConfigOptionError. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         # These two functions must run inside the same event loop as the | ||||||
|  |         # crawlers, so that any new objects (like Conditions or Futures) can | ||||||
|  |         # obtain the correct event loop. | ||||||
|  |         self._load_authenticators() | ||||||
|  |         self._load_crawlers() | ||||||
|  |  | ||||||
|  |         if debug_transforms: | ||||||
|  |             log.output_explain = True | ||||||
|  |             log.output_report = False | ||||||
|  |             self.debug_transforms() | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         log.print("") | ||||||
|  |  | ||||||
|  |         for name in self._crawlers_to_run: | ||||||
|  |             crawler = self._crawlers[name] | ||||||
|  |  | ||||||
|  |             log.print(f"[bold bright_cyan]Running[/] {escape(name)}") | ||||||
|  |  | ||||||
|  |             try: | ||||||
|  |                 await crawler.run() | ||||||
|  |             except (CrawlError, AuthError) as e: | ||||||
|  |                 log.error(str(e)) | ||||||
|  |             except Exception: | ||||||
|  |                 log.unexpected_exception() | ||||||
|  |  | ||||||
|  |     def print_report(self) -> None: | ||||||
|  |         for name in self._crawlers_to_run: | ||||||
|  |             crawler = self._crawlers.get(name) | ||||||
|  |             if crawler is None: | ||||||
|  |                 continue  # Crawler failed to load | ||||||
|  |  | ||||||
|  |             log.report("") | ||||||
|  |             log.report(f"[bold bright_cyan]Report[/] for {escape(name)}") | ||||||
|  |  | ||||||
|  |             def fmt_path_link(relative_path: PurePath) -> str: | ||||||
|  |                 # We need to URL-encode the path because it might contain spaces or special characters | ||||||
|  |                 link = crawler.output_dir.resolve(relative_path).absolute().as_uri() | ||||||
|  |                 return f"[link={link}]{fmt_path(relative_path)}[/link]" | ||||||
|  |  | ||||||
|  |             something_changed = False | ||||||
|  |             for path in sorted(crawler.report.added_files): | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report(f"  [bold bright_green]Added[/] {fmt_path_link(path)}") | ||||||
|  |             for path in sorted(crawler.report.changed_files): | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report(f"  [bold bright_yellow]Changed[/] {fmt_path_link(path)}") | ||||||
|  |             for path in sorted(crawler.report.deleted_files): | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report(f"  [bold bright_magenta]Deleted[/] {fmt_path(path)}") | ||||||
|  |             for path in sorted(crawler.report.not_deleted_files): | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report_not_deleted(f"  [bold bright_magenta]Not deleted[/] {fmt_path_link(path)}") | ||||||
|  |  | ||||||
|  |             for warning in crawler.report.encountered_warnings: | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report(f"  [bold bright_red]Warning[/] {warning}") | ||||||
|  |  | ||||||
|  |             for error in crawler.report.encountered_errors: | ||||||
|  |                 something_changed = True | ||||||
|  |                 log.report(f"  [bold bright_red]Error[/] {error}") | ||||||
|  |  | ||||||
|  |             if not something_changed: | ||||||
|  |                 log.report("  Nothing changed") | ||||||
							
								
								
									
										229
									
								
								PFERD/report.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										229
									
								
								PFERD/report.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,229 @@ | |||||||
|  | import json | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from typing import Any, Dict, List, Optional, Set | ||||||
|  |  | ||||||
|  |  | ||||||
class ReportLoadError(Exception):
    """
    A stored report could not be converted back into a Report because its
    contents have an unexpected format.
    """


class MarkDuplicateError(Exception):
    """
    Tried to mark a file that was already marked.
    """

    def __init__(self, path: PurePath):
        super().__init__(f"A previous file already used path {path}")
        self.path = path


class MarkConflictError(Exception):
    """
    Marking the path would have caused a conflict.

    A conflict can have two reasons: Either the new file has the same path as
    the parent directory of a known file, or a parent directory of the new file
    has the same path as a known file. In either case, adding the new file
    would require a file and a directory to share the same path, which is
    usually not possible.
    """

    def __init__(self, path: PurePath, collides_with: PurePath):
        super().__init__(f"File at {path} collides with previous file at {collides_with}")
        self.path = path
        self.collides_with = collides_with


class Report:
    """
    A report of a synchronization. Includes all files found by the crawler, as
    well as the set of changes made to local files.
    """

    def __init__(self) -> None:
        # Paths found by the crawler, untransformed
        self.found_paths: Set[PurePath] = set()

        # Files reserved for metadata files (e. g. the report file or cookies)
        # that can't be overwritten by user transforms and won't be cleaned up
        # at the end.
        self.reserved_files: Set[PurePath] = set()

        # Files found by the crawler, transformed. Only includes files that
        # were downloaded (or a download was attempted)
        self.known_files: Set[PurePath] = set()

        self.added_files: Set[PurePath] = set()
        self.changed_files: Set[PurePath] = set()
        self.deleted_files: Set[PurePath] = set()
        # Files that should have been deleted by the cleanup but weren't
        self.not_deleted_files: Set[PurePath] = set()

        # Custom crawler-specific data
        self.custom: Dict[str, Any] = dict()

        # Encountered errors and warnings
        self.encountered_warnings: List[str] = []
        self.encountered_errors: List[str] = []

    @staticmethod
    def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]:
        """
        Return data[key] (or an empty list if the key is missing) after
        checking that it is a list containing only strings.

        May raise ReportLoadError.
        """

        result: Any = data.get(key, [])

        if not isinstance(result, list):
            raise ReportLoadError(f"Incorrect format: {key!r} is not a list")

        for elem in result:
            if not isinstance(elem, str):
                raise ReportLoadError(f"Incorrect format: {key!r} must contain only strings")

        return result

    @staticmethod
    def _get_str_dictionary(data: Dict[str, Any], key: str) -> Dict[str, Any]:
        """
        Return data[key] (or an empty dict if the key is missing) after
        checking that it is a dictionary.

        May raise ReportLoadError.
        """

        result: Dict[str, Any] = data.get(key, {})

        if not isinstance(result, dict):
            raise ReportLoadError(f"Incorrect format: {key!r} is not a dictionary")

        return result

    @staticmethod
    def _paths_to_strs(paths: Set[PurePath]) -> List[str]:
        """
        Convert a set of paths into a sorted list of strings for storing.
        """

        return [str(entry) for entry in sorted(paths)]

    @classmethod
    def load(cls, path: Path) -> "Report":
        """
        Load a report previously written by store().

        May raise OSError, UnicodeDecodeError, json.JSONDecodeError,
        ReportLoadError.
        """

        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, dict):
            raise ReportLoadError("Incorrect format: Root is not an object")

        self = cls()
        for elem in cls._get_list_of_strs(data, "found"):
            self.found(PurePath(elem))

        reserved = cls._get_list_of_strs(data, "reserved")
        known = cls._get_list_of_strs(data, "known")
        try:
            for elem in reserved:
                self.mark_reserved(PurePath(elem))
            for elem in known:
                self.mark(PurePath(elem))
        except (MarkDuplicateError, MarkConflictError, RuntimeError) as e:
            # A file with duplicate or conflicting entries is a malformed
            # report. Callers only expect the documented exception types, so
            # the marking errors must not escape as-is.
            raise ReportLoadError(f"Incorrect format: {e}") from e

        for elem in cls._get_list_of_strs(data, "added"):
            self.add_file(PurePath(elem))
        for elem in cls._get_list_of_strs(data, "changed"):
            self.change_file(PurePath(elem))
        for elem in cls._get_list_of_strs(data, "deleted"):
            self.delete_file(PurePath(elem))
        for elem in cls._get_list_of_strs(data, "not_deleted"):
            self.not_delete_file(PurePath(elem))
        self.custom = cls._get_str_dictionary(data, "custom")
        self.encountered_errors = cls._get_list_of_strs(data, "encountered_errors")
        self.encountered_warnings = cls._get_list_of_strs(data, "encountered_warnings")

        return self

    def store(self, path: Path) -> None:
        """
        Write the report to the given path as JSON.

        May raise OSError.
        """

        data = {
            "found": self._paths_to_strs(self.found_paths),
            "reserved": self._paths_to_strs(self.reserved_files),
            "known": self._paths_to_strs(self.known_files),
            "added": self._paths_to_strs(self.added_files),
            "changed": self._paths_to_strs(self.changed_files),
            "deleted": self._paths_to_strs(self.deleted_files),
            "not_deleted": self._paths_to_strs(self.not_deleted_files),
            "custom": self.custom,
            "encountered_warnings": self.encountered_warnings,
            "encountered_errors": self.encountered_errors,
        }

        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, sort_keys=True)
            f.write("\n")  # json.dump doesn't do this

    def found(self, path: PurePath) -> None:
        """
        Record an untransformed path found by the crawler.
        """

        self.found_paths.add(path)

    def mark_reserved(self, path: PurePath) -> None:
        """
        Reserve a path for a metadata file.

        Raises a RuntimeError if the path is already marked (known or
        reserved).
        """

        if self.is_marked(path):
            raise RuntimeError("Trying to reserve an already reserved file")

        self.reserved_files.add(path)

    def mark(self, path: PurePath) -> None:
        """
        Mark a previously unknown file as known.

        May throw a MarkDuplicateError or a MarkConflictError. For more detail,
        see the respective exception's docstring. If both apply, the
        MarkDuplicateError takes precedence.
        """

        # Check for duplicates first so the outcome does not depend on the
        # iteration order of the sets
        if self.is_marked(path):
            raise MarkDuplicateError(path)

        for group in (self.known_files, self.reserved_files):
            for other in group:
                if path.is_relative_to(other) or other.is_relative_to(path):
                    raise MarkConflictError(path, other)

        self.known_files.add(path)

    @property
    def marked(self) -> Set[PurePath]:
        """
        A new set containing all known and all reserved files.
        """

        return self.known_files | self.reserved_files

    def is_marked(self, path: PurePath) -> bool:
        """
        Whether the path is a known or a reserved file.
        """

        # Checked against both sets directly to avoid building their union
        return path in self.known_files or path in self.reserved_files

    def add_file(self, path: PurePath) -> None:
        """
        Unlike mark(), this function accepts any paths.
        """

        self.added_files.add(path)

    def change_file(self, path: PurePath) -> None:
        """
        Unlike mark(), this function accepts any paths.
        """

        self.changed_files.add(path)

    def delete_file(self, path: PurePath) -> None:
        """
        Unlike mark(), this function accepts any paths.
        """

        self.deleted_files.add(path)

    def not_delete_file(self, path: PurePath) -> None:
        """
        Unlike mark(), this function accepts any paths.
        """

        self.not_deleted_files.add(path)

    def add_custom_value(self, key: str, value: Any) -> None:
        """
        Adds a custom value under the passed key, overwriting any existing
        value stored under that key.
        """
        self.custom[key] = value

    def get_custom_value(self, key: str) -> Optional[Any]:
        """
        Retrieves a custom value for the given key, or None if the key has no
        value.
        """
        return self.custom.get(key)

    def add_error(self, error: str) -> None:
        """
        Adds an error to this report's error list.
        """
        self.encountered_errors.append(error)

    def add_warning(self, warning: str) -> None:
        """
        Adds a warning to this report's warning list.
        """
        self.encountered_warnings.append(warning)
							
								
								
									
										116
									
								
								PFERD/ti.py
									
									
									
									
									
								
							
							
						
						
									
										116
									
								
								PFERD/ti.py
									
									
									
									
									
								
							| @@ -1,116 +0,0 @@ | |||||||
| # Fakultät für Mathematik (FfM) |  | ||||||
|  |  | ||||||
| import getpass |  | ||||||
| import logging |  | ||||||
| import pathlib |  | ||||||
| import re |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from .organizer import Organizer |  | ||||||
| from .utils import stream_to_path |  | ||||||
|  |  | ||||||
| __all__ = ["Ti"] |  | ||||||
| logger = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
class Ti:
    """
    Synchronizer that downloads the PDF files linked on the "Vorlesung" and
    "Uebungen" pages below BASE_URL into a local directory.
    """

    BASE_URL = "http://ti.ira.uka.de/"
    # Only links whose href ends in ".pdf" are downloaded
    FILE_RE = re.compile(r"^.+\.pdf$")

    def __init__(self, base_path):
        self.base_path = base_path

        self._session = requests.Session()
        # (username, password) tuple, or None if the user was not asked yet
        self._credentials = None

    def synchronize(self, urlpart, to_dir, transform=lambda x: x,
            filter=lambda x: True):
        """
        Download all available sections of the page at urlpart into to_dir
        (relative to the base path).

        transform maps each file path to its target path, or to None to skip
        the file. filter is called with a section's path and decides whether
        the section is crawled at all.
        """
        logger.info(f"    Synchronizing {urlpart} to {to_dir} using the Ti synchronizer.")

        sync_path = pathlib.Path(self.base_path, to_dir)

        orga = Organizer(self.base_path, sync_path)
        orga.clean_temp_dir()

        self._reset_credentials()

        available = self._find_available(urlpart)

        for section in ("Folien", "Blätter"):
            if section not in available:
                continue

            path = pathlib.PurePath(section)
            if filter(path):
                self._crawl(urlpart + available[section], path, orga,
                        transform)
            else:
                logger.info(f"Skipping {section}/")

        orga.clean_sync_dir()
        orga.clean_temp_dir()

        # Don't keep the credentials around longer than necessary
        self._reset_credentials()

    def _find_available(self, urlpart):
        """
        Return a dict mapping the name of each section linked on the page to
        the url part of that section.
        """
        url = self.BASE_URL + urlpart
        r = self._session.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        available = {}

        if soup.find(href="./Vorlesung/Vorlesung.php"):
            logger.info("Found Folien/")
            available["Folien"] = "/Vorlesung/"
        if soup.find(href="./Uebungen/Uebungen.php"):
            logger.info("Found Blätter/")
            available["Blätter"] = "/Uebungen/"

        return available

    def _crawl(self, urlpart, path, orga, transform):
        """
        Download every PDF linked on the page at urlpart and hand it to orga
        under its transformed path. Files that transform maps to None are
        skipped.
        """
        url = self.BASE_URL + urlpart
        r = self._session.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        for filelink in soup.find_all("a", href=self.FILE_RE):
            filepath = path / filelink["href"]
            fileurl = url + "/" + filelink["href"]

            new_path = transform(filepath)
            if new_path is None:
                continue
            logger.debug(f"Transformed from {filepath} to {new_path}")

            temp_path = orga.temp_file()
            self._download(fileurl, temp_path)
            orga.add_file(temp_path, new_path)

    def _get_credentials(self):
        """
        Return the (username, password) tuple, asking the user for it if no
        credentials are stored.
        """
        if self._credentials is None:
            print("Please enter Ti credentials.")
            username = getpass.getpass(prompt="Username: ")
            password = getpass.getpass(prompt="Password: ")
            self._credentials = (username, password)
        return self._credentials

    def _reset_credentials(self):
        """
        Forget the stored credentials.
        """
        self._credentials = None

    def _download(self, url, to_path):
        """
        Download url to to_path, asking for new credentials for as long as
        the server rejects them.

        Responses that fail for any other reason raise the exception of
        raise_for_status() instead of asking for credentials again.
        """
        while True:
            username, password = self._get_credentials()
            with self._session.get(url, stream=True, auth=(username, password)) as r:
                if r.ok:
                    stream_to_path(r, to_path)
                    return

                if r.status_code not in (401, 403):
                    # Not an authentication problem (e. g. 404 or 500), so
                    # asking for other credentials would loop forever
                    r.raise_for_status()

                print("Incorrect credentials.")
                self._reset_credentials()
							
								
								
									
										443
									
								
								PFERD/transformer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										443
									
								
								PFERD/transformer.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,443 @@ | |||||||
|  | import ast | ||||||
|  | import re | ||||||
|  | from abc import ABC, abstractmethod | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from enum import Enum | ||||||
|  | from pathlib import PurePath | ||||||
|  | from typing import Callable, Dict, List, Optional, Sequence, TypeVar, Union | ||||||
|  |  | ||||||
|  | from .logging import log | ||||||
|  | from .utils import fmt_path, str_path | ||||||
|  |  | ||||||
|  |  | ||||||
class ArrowHead(Enum):
    """The head of a rule's arrow."""

    NORMAL = 0  # Written as '>'
    SEQUENCE = 1  # Written as '>>'


class Ignore:
    """Right side of a rule that was written as '!'."""


class Empty:
    """Right side of a rule that has no right side."""


RightSide = Union[str, Ignore, Empty]


@dataclass
class Transformed:
    """Result of a transformation that produced a path."""

    path: PurePath


class Ignored:
    """Result of a transformation whose rule has an Ignore right side."""


TransformResult = Optional[Union[Transformed, Ignored]]


@dataclass
class Rule:
    """A parsed rule: left side, arrow name, arrow head and right side."""

    left: str
    left_index: int  # Index in the line at which the left side starts
    name: str
    head: ArrowHead
    right: RightSide
    right_index: int  # Index in the line directly after the arrow head

    def right_result(self, path: PurePath) -> Union[str, Transformed, Ignored]:
        """
        Interpret the right side for a path matching the left side.

        A string right side is returned unchanged so the caller can process it
        further. Ignore becomes Ignored and Empty keeps the path as it is.
        """

        side = self.right
        if isinstance(side, str):
            return side
        if isinstance(side, Ignore):
            return Ignored()
        if isinstance(side, Empty):
            return Transformed(path)

        raise RuntimeError(f"Right side has invalid type {type(self.right)}")
|  |  | ||||||
|  |  | ||||||
class Transformation(ABC):
    """Base class of everything that can transform a path according to a rule."""

    def __init__(self, rule: Rule):
        self.rule = rule

    @abstractmethod
    def transform(self, path: PurePath) -> TransformResult:
        """
        Transform the path. The implementations in this file return None if
        the rule does not apply to the path.
        """
        return None
|  |  | ||||||
|  |  | ||||||
class ExactTf(Transformation):
    """Applies the rule only to paths that are equal to its left side."""

    def transform(self, path: PurePath) -> TransformResult:
        if path == PurePath(self.rule.left):
            outcome = self.rule.right_result(path)
            # A string right side is the new path, anything else is already
            # a finished result
            return Transformed(PurePath(outcome)) if isinstance(outcome, str) else outcome

        return None
|  |  | ||||||
|  |  | ||||||
class ExactReTf(Transformation):
    """
    Applies the rule to paths that are fully matched by its left side, which
    is used as a regex. A string right side is evaluated as an f-string that
    can refer to the regex's groups.
    """

    def transform(self, path: PurePath) -> TransformResult:
        match = re.fullmatch(self.rule.left, str_path(path))
        if match is None:
            return None

        template = self.rule.right_result(path)
        if not isinstance(template, str):
            return template

        # Group 0 is the entire match. Elements of "match.groups()" are None
        # for groups that did not take part in the match.
        numbered: Sequence[Optional[str]] = (match[0], *match.groups())

        # Variables available to the f-string: g<n> is the text of group n,
        # i<n> and f<n> only exist if the text can be converted to int or
        # float respectively.
        variables: Dict[str, Union[str, int, float]] = {}
        for number, text in enumerate(numbered):
            if text is None:
                continue

            variables[f"g{number}"] = text

            for prefix, convert in (("i", int), ("f", float)):
                try:
                    variables[f"{prefix}{number}"] = convert(text)
                except ValueError:
                    pass

        # Named groups are added last and are available under their own name
        named: Dict[str, str] = match.groupdict()
        variables.update(named)

        # NOTE(review): eval() executes arbitrary expressions contained in the
        # rule's right side, so rules must only come from a trusted source.
        new_path = eval(f"f{template!r}", {}, variables)
        return Transformed(PurePath(new_path))
|  |  | ||||||
|  |  | ||||||
class RenamingParentsTf(Transformation):
    """
    Applies a sub-transformation to the path and its parents, from the longest
    to the shortest, and re-attaches the remaining child parts to the result
    of the first one that applies.
    """

    def __init__(self, sub_tf: Transformation):
        super().__init__(sub_tf.rule)
        self.sub_tf = sub_tf

    def transform(self, path: PurePath) -> TransformResult:
        parts = path.parts

        # Start with the full path, end with the empty path
        for split in reversed(range(len(parts) + 1)):
            outcome = self.sub_tf.transform(PurePath(*parts[:split]))
            if not outcome:
                continue

            if isinstance(outcome, Transformed):
                return Transformed(outcome.path / PurePath(*parts[split:]))
            if isinstance(outcome, Ignored):
                return outcome

            raise RuntimeError(f"Invalid transform result of type {type(outcome)}: {outcome}")

        return None
|  |  | ||||||
|  |  | ||||||
class RenamingPartsTf(Transformation):
    """
    Applies a sub-transformation to every part of the path on its own. Parts
    the sub-transformation does not apply to are kept as they are.
    """

    def __init__(self, sub_tf: Transformation):
        super().__init__(sub_tf.rule)
        self.sub_tf = sub_tf

    def transform(self, path: PurePath) -> TransformResult:
        rebuilt = PurePath()
        matches = 0

        for part in path.parts:
            outcome = self.sub_tf.transform(PurePath(part))
            if not outcome:
                rebuilt = rebuilt / part
                continue

            if isinstance(outcome, Ignored):
                # A single ignored part ignores the entire path
                return outcome
            if not isinstance(outcome, Transformed):
                raise RuntimeError(f"Invalid transform result of type {type(outcome)}: {outcome}")

            rebuilt = rebuilt / outcome.path
            matches += 1

        # The rule only applies if it applied to at least one part
        return Transformed(rebuilt) if matches > 0 else None
|  |  | ||||||
|  |  | ||||||
class RuleParseError(Exception):
    """A rule could not be parsed. Remembers the line and the reason."""

    def __init__(self, line: "Line", reason: str):
        message = f"Error in rule on line {line.line_nr}, column {line.index}: {reason}"
        super().__init__(message)

        self.line = line
        self.reason = reason

    def pretty_print(self) -> None:
        """Log the line with a marker below the index the line is at."""

        log.error(f"Error parsing rule on line {self.line.line_nr}:")
        log.error_contd(self.line.line)
        marker = " " * self.line.index + f"^--- {self.reason}"
        log.error_contd(marker)
|  |  | ||||||
|  |  | ||||||
T = TypeVar("T")


class Line:
    """A line of text together with the index parsing has advanced to."""

    def __init__(self, line: str, line_nr: int):
        self._line = line
        self._line_nr = line_nr
        self._index = 0

    @property
    def line(self) -> str:
        return self._line

    @property
    def line_nr(self) -> int:
        return self._line_nr

    @property
    def index(self) -> int:
        return self._index

    @index.setter
    def index(self, index: int) -> None:
        self._index = index

    @property
    def rest(self) -> str:
        """Everything from the current index to the end of the line."""
        return self.line[self.index:]

    def peek(self, amount: int = 1) -> str:
        """Return up to amount characters without consuming them."""
        return self.rest[:amount]

    def take(self, amount: int = 1) -> str:
        """Consume and return up to amount characters."""
        taken = self.peek(amount)
        self.index = self.index + len(taken)
        return taken

    def expect(self, string: str) -> str:
        """Consume string, or raise a RuleParseError if it doesn't come next."""
        wanted = len(string)
        if self.peek(wanted) != string:
            raise RuleParseError(self, f"Expected {string!r}")

        return self.take(wanted)

    def expect_with(self, string: str, value: T) -> T:
        """Like expect(), but return value instead of the consumed string."""
        self.expect(string)
        return value

    def one_of(self, parsers: List[Callable[[], T]], description: str) -> T:
        """
        Return the result of the first parser that doesn't raise a
        RuleParseError. If all parsers fail, raise a RuleParseError with the
        description as reason.
        """
        # Every failed parser is rolled back to here, so all parsers start
        # at the same index
        checkpoint = self.index
        for parser in parsers:
            try:
                return parser()
            except RuleParseError:
                self.index = checkpoint

        raise RuleParseError(self, description)
|  |  | ||||||
|  |  | ||||||
|  | # RULE = LEFT SPACE '-' NAME '-' HEAD (SPACE RIGHT)? | ||||||
|  | # SPACE = ' '+ | ||||||
|  | # NAME = '' | 'exact' | 'name' | 're' | 'exact-re' | 'name-re' | ||||||
|  | # HEAD = '>' | '>>' | ||||||
|  | # LEFT = STR | QUOTED_STR | ||||||
|  | # RIGHT = STR | QUOTED_STR | '!' | ||||||
|  |  | ||||||
|  |  | ||||||
def parse_zero_or_more_spaces(line: Line) -> None:
    """Consume all spaces that come next. Never fails."""
    while True:
        if line.peek() != " ":
            return
        line.take()
|  |  | ||||||
|  |  | ||||||
def parse_one_or_more_spaces(line: Line) -> None:
    """Consume all spaces that come next. Fails if there isn't at least one."""
    line.expect(" ")
    while line.peek() == " ":
        line.take()
|  |  | ||||||
|  |  | ||||||
def parse_str(line: Line) -> str:
    """
    Consume and return everything up to the next space or the end of the
    line. Fails without consuming anything if that would be empty.
    """
    rest = line.rest
    end = rest.find(" ")
    word = rest if end < 0 else rest[:end]

    if not word:
        raise RuleParseError(line, "Expected non-space character")

    line.take(len(word))
    return word
|  |  | ||||||
|  |  | ||||||
QUOTATION_MARKS = {'"', "'"}


def parse_quoted_str(line: Line) -> str:
    """
    Consume a string literal delimited by one of the QUOTATION_MARKS and
    return the value ast.literal_eval() computes for it.
    """
    # Points to first character of string literal
    literal_start = line.index

    quote = line.peek()
    if quote not in QUOTATION_MARKS:
        raise RuleParseError(line, "Expected quotation mark")
    line.take()

    skip_next = False
    while True:
        char = line.peek()
        if not char:
            break
        line.take()

        if skip_next:
            # Character after a backslash, can't end the literal
            skip_next = False
            continue
        if char == "\\":
            skip_next = True
            continue
        if char != quote:
            continue

        # Found the closing quotation mark
        literal = line.line[literal_start:line.index]
        try:
            return ast.literal_eval(literal)
        except SyntaxError as e:
            # Report the error at the start of the literal
            line.index = literal_start
            raise RuleParseError(line, str(e)) from e

    raise RuleParseError(line, "Expected end of string literal")
|  |  | ||||||
|  |  | ||||||
def parse_left(line: Line) -> str:
    """Parse the left side of a rule, which is a quoted or an unquoted string."""
    parser = parse_quoted_str if line.peek() in QUOTATION_MARKS else parse_str
    return parser(line)
|  |  | ||||||
|  |  | ||||||
def parse_right(line: Line) -> Union[str, Ignore]:
    """
    Parse the right side of a rule, which is a quoted string, an unquoted
    string or '!'. Only an unquoted '!' results in Ignore.
    """
    if line.peek() in QUOTATION_MARKS:
        return parse_quoted_str(line)

    word = parse_str(line)
    return Ignore() if word == "!" else word
|  |  | ||||||
|  |  | ||||||
def parse_arrow_name(line: Line) -> str:
    """Consume and return the name of the arrow, which may be empty."""
    # A name must be tried before any name that is a prefix of it
    names = ("exact-re", "exact", "name-re", "name", "re", "")
    parsers = [lambda name=name: line.expect(name) for name in names]
    return line.one_of(parsers, "Expected arrow name")
|  |  | ||||||
|  |  | ||||||
def parse_arrow_head(line: Line) -> ArrowHead:
    """Consume the head of the arrow and return the matching ArrowHead."""
    # '>>' must be tried first since '>' is a prefix of it
    heads = ((">>", ArrowHead.SEQUENCE), (">", ArrowHead.NORMAL))
    parsers = [lambda text=text, head=head: line.expect_with(text, head) for text, head in heads]
    return line.one_of(parsers, "Expected arrow head")
|  |  | ||||||
|  |  | ||||||
def parse_eol(line: Line) -> None:
    """Fail unless the end of the line has been reached."""
    if not line.peek():
        return

    raise RuleParseError(line, "Expected end of line")
|  |  | ||||||
|  |  | ||||||
def parse_rule(line: Line) -> Rule:
    """
    Parses a complete rule: a left side, an arrow and an optional right side.

    The positions where the left side starts and where the arrow ends are
    stored in the returned Rule alongside the parsed parts.
    """

    parse_zero_or_more_spaces(line)
    left_start = line.index
    left = parse_left(line)

    parse_one_or_more_spaces(line)

    # The arrow consists of "-", an optional name, "-" and the head.
    line.expect("-")
    arrow_name = parse_arrow_name(line)
    line.expect("-")
    arrow_head = parse_arrow_head(line)

    arrow_end = line.index
    right: RightSide
    try:
        # The rule may end directly after the arrow (plus trailing spaces).
        parse_zero_or_more_spaces(line)
        parse_eol(line)
    except RuleParseError:
        # Otherwise rewind to the end of the arrow and require at least one
        # space followed by a right side.
        line.index = arrow_end
        parse_one_or_more_spaces(line)
        right = parse_right(line)
        parse_eol(line)
    else:
        right = Empty()

    return Rule(left, left_start, arrow_name, arrow_head, right, arrow_end)
|  |  | ||||||
|  |  | ||||||
def parse_transformation(line: Line) -> Transformation:
    """
    Parses a rule and wraps it in the transformation selected by its arrow name.

    The "-re" arrows use ExactReTf, all others ExactTf. The unnamed and "re"
    arrows are wrapped in RenamingParentsTf, the "name" and "name-re" arrows in
    RenamingPartsTf, and the "exact" and "exact-re" arrows are not wrapped.

    Raises a RuleParseError if the left side of a "name" arrow consists of more
    than one path segment.
    """

    rule = parse_rule(line)
    arrow = rule.name

    if arrow == "name" and len(PurePath(rule.left).parts) > 1:
        # Point the error at the start of the offending left side.
        line.index = rule.left_index
        raise RuleParseError(line, "Expected name, not multiple segments")

    base: Transformation
    if arrow in ("", "exact", "name"):
        base = ExactTf(rule)
    elif arrow in ("re", "exact-re", "name-re"):
        base = ExactReTf(rule)
    else:
        raise RuntimeError(f"Invalid arrow name {rule.name!r}")

    if arrow in ("", "re"):
        return RenamingParentsTf(base)
    if arrow in ("name", "name-re"):
        return RenamingPartsTf(base)
    return base
|  |  | ||||||
|  |  | ||||||
class Transformer:
    """
    Applies an ordered list of transformation rules to paths.
    """

    def __init__(self, rules: str):
        """
        Parses one rule per non-blank line of the given string.

        May throw a RuleParseError.
        """

        # Pairs of (stripped rule text, parsed transformation), in file order.
        self._tfs = []
        for i, line in enumerate(rules.split("\n")):
            line = line.strip()
            if line:
                tf = parse_transformation(Line(line, i))
                self._tfs.append((line, tf))

    def transform(self, path: PurePath) -> Optional[PurePath]:
        """
        Runs the path through the rules in order.

        Returns None if a matching rule ignores the path, otherwise the
        (possibly unchanged) resulting path. A match on a NORMAL arrow ends the
        search, a match on a SEQUENCE arrow lets later rules see the updated
        path.
        """

        for i, (line, tf) in enumerate(self._tfs):
            log.explain(f"Testing rule {i+1}: {line}")

            try:
                result = tf.transform(path)
            except Exception as e:
                # A failing rule is reported and skipped so that the remaining
                # rules still get a chance to match.
                log.warn(f"Error while testing rule {i+1}: {line}")
                log.warn_contd(str(e))
                continue

            if not result:
                continue

            if isinstance(result, Ignored):
                log.explain("Match found, path ignored")
                return None

            if tf.rule.head == ArrowHead.NORMAL:
                log.explain(f"Match found, transformed path to {fmt_path(result.path)}")
                path = result.path
                break
            elif tf.rule.head == ArrowHead.SEQUENCE:
                log.explain(f"Match found, updated path to {fmt_path(result.path)}")
                path = result.path
            else:
                # Only reachable if a rule carries an arrow head that is neither
                # NORMAL nor SEQUENCE.
                raise RuntimeError(f"Invalid arrow head {tf.rule.head!r}")

        log.explain(f"Final result: {fmt_path(path)}")
        return path
							
								
								
									
										161
									
								
								PFERD/utils.py
									
									
									
									
									
								
							
							
						
						
									
										161
									
								
								PFERD/utils.py
									
									
									
									
									
								
							| @@ -1,33 +1,144 @@ | |||||||
| import os | import asyncio | ||||||
| import pathlib | import getpass | ||||||
|  | import sys | ||||||
|  | import threading | ||||||
|  | from abc import ABC, abstractmethod | ||||||
|  | from contextlib import AsyncExitStack | ||||||
|  | from pathlib import Path, PurePath | ||||||
|  | from types import TracebackType | ||||||
|  | from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar | ||||||
|  | from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit | ||||||
|  |  | ||||||
| __all__ = [ | import bs4 | ||||||
|     "get_base_dir", |  | ||||||
|     "move", |  | ||||||
|     "rename", |  | ||||||
|     "stream_to_path", |  | ||||||
|     "ContentTypeException", |  | ||||||
|     "FileNotFoundException", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| def get_base_dir(script_file): | T = TypeVar("T") | ||||||
|     return pathlib.Path(os.path.dirname(os.path.abspath(script_file))) |  | ||||||
|  |  | ||||||
| def move(path, from_folders, to_folders): |  | ||||||
|     l = len(from_folders) |  | ||||||
|     if path.parts[:l] == from_folders: |  | ||||||
|         return pathlib.PurePath(*to_folders, *path.parts[l:]) |  | ||||||
|  |  | ||||||
async def in_daemon_thread(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
    """
    Runs func(*args, **kwargs) in a new daemon thread and returns its result.

    An exception raised by func is re-raised in the awaiting coroutine instead
    of leaving it waiting forever. Because the thread is a daemon thread, a
    call that never returns does not keep the interpreter from exiting.
    """

    loop = asyncio.get_running_loop()
    future: asyncio.Future[T] = loop.create_future()

    def resolve(setter: Callable[[Any], None], value: Any) -> None:
        # Runs on the event loop. The awaiting task may have been cancelled in
        # the meantime, in which case the future must not be completed again.
        if not future.done():
            setter(value)

    def thread_func() -> None:
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            loop.call_soon_threadsafe(resolve, future.set_exception, e)
        else:
            loop.call_soon_threadsafe(resolve, future.set_result, result)

    threading.Thread(target=thread_func, daemon=True).start()

    return await future
|  |  | ||||||
|  |  | ||||||
async def ainput(prompt: str) -> str:
    """
    Like input(), but runs in a daemon thread so the coroutine can be awaited.
    """

    def read_line() -> str:
        return input(prompt)

    return await in_daemon_thread(read_line)
|  |  | ||||||
|  |  | ||||||
async def agetpass(prompt: str) -> str:
    """
    Like getpass.getpass(), but runs in a daemon thread so the coroutine can be
    awaited.
    """

    def read_password() -> str:
        return getpass.getpass(prompt)

    return await in_daemon_thread(read_password)
|  |  | ||||||
|  |  | ||||||
async def prompt_yes_no(query: str, default: Optional[bool]) -> bool:
    """
    Asks the user a yes/no question until they give a usable answer.

    An empty answer selects the default if there is one. The capitalised letter
    in the hint appended to the query shows which answer that is.
    """

    if default is True:
        hint = " [Y/n] "
    elif default is False:
        hint = " [y/N] "
    else:
        hint = " [y/n] "
    full_query = query + hint

    while True:
        answer = (await ainput(full_query)).strip().lower()

        if answer in ("y", "n"):
            return answer == "y"
        if not answer and default is not None:
            return default

        print("Please answer with 'y' or 'n'.")
|  |  | ||||||
|  |  | ||||||
def soupify(data: bytes) -> bs4.BeautifulSoup:
    """
    Turns raw HTML into a beautifulsoup object using the "html.parser" parser.
    """

    soup = bs4.BeautifulSoup(data, "html.parser")
    return soup
|  |  | ||||||
|  |  | ||||||
def url_set_query_param(url: str, param: str, value: str) -> str:
    """
    Set a query parameter in an url, overwriting existing ones with the same name.

    All other parts of the url are kept, including query parameters that have
    an empty value.
    """
    scheme, netloc, path, query, fragment = urlsplit(url)
    # By default parse_qs drops parameters with a blank value ("a="), which
    # would silently remove them from the resulting url.
    query_parameters = parse_qs(query, keep_blank_values=True)
    query_parameters[param] = [value]
    # doseq=True writes every value of a repeated parameter as its own pair.
    new_query_string = urlencode(query_parameters, doseq=True)

    return urlunsplit((scheme, netloc, path, new_query_string, fragment))
|  |  | ||||||
|  |  | ||||||
def url_set_query_params(url: str, params: Dict[str, str]) -> str:
    """
    Sets multiple query parameters in an url, overwriting existing ones.

    The parameters are applied one after another in the order of the dict.
    """
    for name in params:
        url = url_set_query_param(url, name, params[name])

    return url
|  |  | ||||||
|  |  | ||||||
def str_path(path: PurePath) -> str:
    """
    Joins the parts of a path with "/", regardless of the platform's separator.

    A path without any parts is rendered as ".".
    """

    return "/".join(path.parts) if path.parts else "."
|  |  | ||||||
|  |  | ||||||
def fmt_path(path: PurePath) -> str:
    """
    Formats a path for display: the repr() of its str_path() rendering.
    """

    text = str_path(path)
    return repr(text)
|  |  | ||||||
|  |  | ||||||
def fmt_real_path(path: Path) -> str:
    """
    Formats a file system path for display: the repr() of its absolute form.
    """

    absolute = path.absolute()
    return repr(str(absolute))
|  |  | ||||||
|  |  | ||||||
class ReusableAsyncContextManager(ABC, Generic[T]):
    """
    Base class for async context managers that can be entered repeatedly, but
    never nested or concurrently.

    Subclasses implement _on_aenter(), whose return value becomes the value of
    the "async with" statement. The exit stack in self._stack is unwound when
    the context is left - presumably subclasses register their cleanup there.
    """

    def __init__(self) -> None:
        self._active = False
        self._stack = AsyncExitStack()

    @abstractmethod
    async def _on_aenter(self) -> T:
        pass

    async def __aenter__(self) -> T:
        if self._active:
            raise RuntimeError("Nested or otherwise concurrent usage is not allowed")

        self._active = True
        await self._stack.__aenter__()

        # See https://stackoverflow.com/a/13075071
        try:
            result: T = await self._on_aenter()
        except:  # noqa: E722 do not use bare 'except'
            # Entering failed, so there is no value to hand to the "async with"
            # body. Unwind whatever was set up so far and always re-raise, even
            # if something on the stack would suppress the exception.
            await self.__aexit__(*sys.exc_info())
            raise

        return result

    async def __aexit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc_value: Optional[BaseException],
            traceback: Optional[TracebackType],
    ) -> Optional[bool]:
        if not self._active:
            raise RuntimeError("__aexit__ called too many times")

        try:
            return await self._stack.__aexit__(exc_type, exc_value, traceback)
        finally:
            # Reset even if unwinding the stack raised, so that the context
            # manager stays usable afterwards.
            self._active = False
|   | |||||||
							
								
								
									
										2
									
								
								PFERD/version.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								PFERD/version.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | |||||||
|  | NAME = "PFERD" | ||||||
|  | VERSION = "3.8.2" | ||||||
							
								
								
									
										160
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										160
									
								
								README.md
									
									
									
									
									
								
							| @@ -2,39 +2,157 @@ | |||||||
|  |  | ||||||
| **P**rogramm zum **F**lotten, **E**infachen **R**unterladen von **D**ateien | **P**rogramm zum **F**lotten, **E**infachen **R**unterladen von **D**ateien | ||||||
|  |  | ||||||
|  | Other resources: | ||||||
|  |  | ||||||
|  | - [Config file format](CONFIG.md) | ||||||
|  | - [Changelog](CHANGELOG.md) | ||||||
|  | - [Development Guide](DEV.md) | ||||||
|  |  | ||||||
| ## Installation | ## Installation | ||||||
|  |  | ||||||
| Ensure that you have at least Python 3.7 installed (3.6 might also work, didn't | ### Direct download | ||||||
| test it though). |  | ||||||
|  | Binaries for Linux, Windows and Mac can be downloaded directly from the | ||||||
|  | [latest release](https://github.com/Garmelon/PFERD/releases/latest). | ||||||
|  |  | ||||||
|  | ### With pip | ||||||
|  |  | ||||||
|  | Ensure you have at least Python 3.11 installed. Run the following command to | ||||||
|  | install PFERD or upgrade it to the latest version: | ||||||
|  |  | ||||||
| To install PFERD or update your installation to the latest version, run this |  | ||||||
| wherever you want to install/have installed PFERD: |  | ||||||
| ``` | ``` | ||||||
| $ pip install git+https://github.com/Garmelon/PFERD@v1.1.1 | $ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. | The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. | ||||||
|  |  | ||||||
| ## Example setup | ### With package managers | ||||||
|  |  | ||||||
| In this example, `python3` refers to at least Python 3.7. | Unofficial packages are available for: | ||||||
|  | - [AUR](https://aur.archlinux.org/packages/pferd) | ||||||
|  | - [brew](https://formulae.brew.sh/formula/pferd) | ||||||
|  | - [conda-forge](https://github.com/conda-forge/pferd-feedstock) | ||||||
|  | - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) | ||||||
|  | - [PyPI](https://pypi.org/project/pferd) | ||||||
|  |  | ||||||
|  | See also PFERD's [repology page](https://repology.org/project/pferd/versions). | ||||||
|  |  | ||||||
|  | ## Basic usage | ||||||
|  |  | ||||||
|  | PFERD can be run directly from the command line with no config file. Run `pferd | ||||||
|  | -h` to get an overview of available commands and options. Run `pferd <command> | ||||||
|  | -h` to see which options a command has. | ||||||
|  |  | ||||||
|  | For example, you can download your personal desktop from the KIT ILIAS like | ||||||
|  | this: | ||||||
|  |  | ||||||
| A full example setup and initial use could look like: |  | ||||||
| ``` | ``` | ||||||
| $ mkdir Vorlesungen | $ pferd kit-ilias-web desktop <output_directory> | ||||||
| $ cd Vorlesungen |  | ||||||
| $ python3 -m venv . |  | ||||||
| $ . bin/activate |  | ||||||
| $ pip install git+https://github.com/Garmelon/PFERD@v1.1.1 |  | ||||||
| $ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/master/example_config.py |  | ||||||
| $ python3 example_config.py |  | ||||||
| $ deactivate |  | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| Subsequent runs of the program might look like: | Also, you can download most ILIAS pages directly like this: | ||||||
|  |  | ||||||
| ``` | ``` | ||||||
| $ cd Vorlesungen | $ pferd kit-ilias-web <url> <output_directory> | ||||||
| $ . bin/activate | ``` | ||||||
| $ python3 example_config.py |  | ||||||
| $ deactivate | PFERD supports other ILIAS instances as well, using the `ilias-web` crawler (see | ||||||
|  | the [config section on `ilias-web`](CONFIG.md#the-ilias-web-crawler) for more | ||||||
|  | detail on the `base-url` and `client-id` parameters): | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | $ pferd ilias-web \ | ||||||
|  |     --base-url https://ilias.my-university.example \ | ||||||
|  |     --client-id My_University desktop \ | ||||||
|  |     <output_directory> | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | However, the CLI only lets you download a single thing at a time, and the | ||||||
|  | resulting command can grow long quite quickly. Because of this, PFERD can also | ||||||
|  | be used with a config file. | ||||||
|  |  | ||||||
|  | To get started, just take a command you've been using and add `--dump-config` | ||||||
|  | directly after `pferd`, like this: | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | $ pferd --dump-config kit-ilias-web <url> <output_directory> | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | This will make PFERD write its current configuration to its default config file | ||||||
|  | path. You can then run `pferd` without a command and it will execute the config | ||||||
|  | file. Alternatively, you can use `--dump-config-to` and specify a path yourself. | ||||||
|  | Using `--dump-config-to -` will print the configuration to stdout instead of a | ||||||
|  | file, which is a good way to see what is actually going on when using a CLI | ||||||
|  | command. | ||||||
|  |  | ||||||
|  | Another good way to see what PFERD is doing is the `--explain` option. When | ||||||
|  | enabled, PFERD explains in detail what it is doing and why. This can help with | ||||||
|  | debugging your own config. | ||||||
|  |  | ||||||
|  | If you don't want to run all crawlers from your config file, you can specify the | ||||||
|  | crawlers you want to run with `--crawler` or `-C`, like this: | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | $ pferd -C crawler1 -C crawler2 | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | ## Advanced usage | ||||||
|  |  | ||||||
|  | PFERD supports lots of different options. For example, you can configure PFERD | ||||||
|  | to [use your system's keyring](CONFIG.md#the-keyring-authenticator) instead of | ||||||
|  | prompting you for your username and password. PFERD also supports | ||||||
|  | [transformation rules](CONFIG.md#transformation-rules) that let you rename or | ||||||
|  | exclude certain files. | ||||||
|  |  | ||||||
|  | For more details, see the comprehensive [config format documentation](CONFIG.md). | ||||||
|  |  | ||||||
|  | ## Example | ||||||
|  |  | ||||||
|  | This example downloads a few courses from the KIT ILIAS with a common keyring | ||||||
|  | authenticator. It reorganizes and ignores some files. | ||||||
|  |  | ||||||
|  | ```ini | ||||||
|  | [DEFAULT] | ||||||
|  | # All paths will be relative to this. | ||||||
|  | # The crawler output directories will be <working_dir>/Foo and <working_dir>/Bar. | ||||||
|  | working_dir = ~/stud | ||||||
|  | # If files vanish from ILIAS the local files are not deleted, allowing us to | ||||||
|  | # take a look at them before deleting them ourselves. | ||||||
|  | on_conflict = no-delete | ||||||
|  |  | ||||||
|  | [auth:ilias] | ||||||
|  | type = keyring | ||||||
|  | username = foo | ||||||
|  |  | ||||||
|  | [crawl:Foo] | ||||||
|  | type = kit-ilias-web | ||||||
|  | auth = auth:ilias | ||||||
|  | # Crawl a course by its ID (found as `ref_id=ID` in the URL) | ||||||
|  | target = 1234567 | ||||||
|  |  | ||||||
|  | # Plaintext files are easier to read by other tools | ||||||
|  | links = plaintext | ||||||
|  |  | ||||||
|  | transform = | ||||||
|  |   # Ignore unneeded folders | ||||||
|  |   Online-Tests --> ! | ||||||
|  |   Vorlesungswerbung --> ! | ||||||
|  |  | ||||||
|  |   # Rename folders | ||||||
|  |   Lehrbücher --> Vorlesung | ||||||
|  |   # Note the ">>" arrow head which lets us apply further rules to files moved to "Übung" | ||||||
|  |   Übungsunterlagen -->> Übung | ||||||
|  |  | ||||||
|  |   # Move exercises to own folder. Rename them to "Blatt-XX.pdf" to make them sort properly | ||||||
|  |   "Übung/(\d+). Übungsblatt.pdf" -re-> Blätter/Blatt-{i1:02}.pdf | ||||||
|  |   # Move solutions to own folder. Rename them to "Blatt-XX-Lösung.pdf" to make them sort properly | ||||||
|  |   "Übung/(\d+). Übungsblatt.*Musterlösung.pdf" -re-> Blätter/Blatt-{i1:02}-Lösung.pdf | ||||||
|  |  | ||||||
|  |   # The course has nested folders with the same name - flatten them | ||||||
|  |   "Übung/(.+?)/\\1" -re-> Übung/{g1} | ||||||
|  |  | ||||||
|  | [crawl:Bar] | ||||||
|  | type = kit-ilias-web | ||||||
|  | auth = auth:ilias | ||||||
|  | target = 1337420 | ||||||
| ``` | ``` | ||||||
|   | |||||||
| @@ -1,342 +0,0 @@ | |||||||
| #!/bin/env python3 |  | ||||||
|  |  | ||||||
| import re |  | ||||||
| import sys |  | ||||||
|  |  | ||||||
| import PFERD |  | ||||||
| from PFERD.utils import get_base_dir, move, rename |  | ||||||
|  |  | ||||||
| #PFERD.enable_logging(logging.DEBUG) |  | ||||||
| PFERD.enable_logging() |  | ||||||
|  |  | ||||||
| base_dir = get_base_dir(__file__) |  | ||||||
|  |  | ||||||
| # Semester 1 |  | ||||||
|  |  | ||||||
def gbi_filter(path):
    """
    Decide whether a path should be kept.

    Everything is kept except the contents of "Tutoriumsfolien", where only
    the folder itself and "Tutorium 15" are kept.
    """
    segments = path.parts
    if segments[:1] != ("Tutoriumsfolien",):
        return True

    # Filter out the tutorials, except for the folder itself and the chosen one
    return segments[1:] == () or segments[1:2] == ("Tutorium 15",)
|  |  | ||||||
def gbi_transform(path):
    """
    Map a remote path to its local path.

    The rules are tried in order and the first matching one decides the
    result. Paths that match no rule are returned unchanged.
    """
    # Exercise sheets go to Blätter/blatt_xx.pdf
    new_path = move(path, ("Übungsblätter",), ("Blätter",))
    if new_path is not None:

        # The number is zero-padded to two digits
        match = re.match(r"(\d+).aufgaben.pdf", new_path.name)
        if match:
            number = int(match.group(1))
            return rename(new_path, f"blatt_{number:02}.pdf")

        match = re.match(r"(\d+).loesungen.pdf", new_path.name)
        if match:
            number = int(match.group(1))
            return rename(new_path, f"loesung_{number:02}.pdf")

        return new_path

    # Slides go to Folien/*
    new_path = move(path, ("Vorlesung: Folien",), ("Folien",))
    if new_path is not None: return new_path

    # Lecture notes go to Skripte/*
    new_path = move(path, ("Vorlesung: Skript",), ("Skripte",))
    if new_path is not None:
        if new_path.name == "k-21-relationen-skript.pdf":
            return rename(new_path, "21-relationen-skript.pdf")

        return new_path

    # Exercise slides go to Übung/*
    new_path = move(path, ("große Übung: Folien",), ("Übung",))
    if new_path is not None: return new_path

    # Tutorial slides go to Tutorium/*
    new_path = move(path, ("Tutoriumsfolien","Tutorium 15"), ("Tutorium",))
    if new_path is not None:
        if new_path.name == "GBI_Tut_2 (1).pdf":
            return rename(new_path, "GBI_Tut_2.pdf")
        if new_path.name == "GBI_Tut_7 (1).pdf":
            return rename(new_path, "GBI_Tut_7.pdf")

        return new_path

    return path
|  |  | ||||||
def hm1_transform(path):
    """
    Move exercise sheets and their solutions into "Blätter".

    Sheets are renamed to blatt_xx.pdf and solutions to loesung_xx.pdf, with
    the number zero-padded to two digits. Other paths are returned unchanged.
    """
    def to_sheet_folder(new_name):
        return rename(move(path, (), ("Blätter",)), new_name)

    sheet = re.match(r"blatt(\d+).pdf", path.name)
    if sheet:
        number = int(sheet.group(1))
        return to_sheet_folder(f"blatt_{number:02}.pdf")

    solution = re.match(r"blatt(\d+).loesungen.pdf", path.name)
    if solution:
        number = int(solution.group(1))
        return to_sheet_folder(f"loesung_{number:02}.pdf")

    return path
|  |  | ||||||
def la1_filter(path):
    """
    Decide whether a path should be kept.

    Everything is kept except the contents of "Tutorien", where only the folder
    itself and two selected tutorials are kept.
    """
    segments = path.parts
    if segments[:1] != ("Tutorien",):
        return True

    # Filter out the tutorials, except for the folder itself and the chosen ones
    if segments[1:] == ():
        return True
    kept_tutorials = (
        ("Tutorium 03 - Philipp Faller",),
        ("Tutorium 23 - Sebastian Faller",),
    )
    return segments[1:2] in kept_tutorials
|  |  | ||||||
def la1_transform(path):
    """
    Map a remote path to its local path.

    The rules are tried in order and the first matching one decides the
    result, so more specific rules come before the general ones for the same
    folder. Paths that match no rule are returned unchanged.
    """
    # All exercise sheets go to Blätter/blatt_xx.pdf
    # All other exercise material goes to Blätter/*
    new_path = move(path, ("Übungen",), ("Blätter",))
    if new_path is not None:

        match = re.match(r"Blatt(\d+).pdf", new_path.name)
        if match:
            number = int(match.group(1))
            return rename(new_path, f"blatt_{number:02}.pdf")

        # Remove the doubled dot before the extension
        if new_path.name == "Lösungen zu Blatt 1, Aufgabe 1 und Blatt 2, Aufgabe 4..pdf":
            return rename(new_path, "Lösungen zu Blatt 1, Aufgabe 1 und Blatt 2, Aufgabe 4.pdf")

        return new_path

    # All tutorial material from Philipp goes to Tutorium/Philipp/*
    new_path = move(path, ("Tutorien","Tutorium 03 - Philipp Faller"), ("Tutorium","Philipp"))
    if new_path is not None:
        if new_path.name == "tut2.pdf":
            return rename(new_path, "Tut2.pdf")

        return new_path

    # All tutorial material from Sebastian goes to Tutorium/Sebastian/*
    # The per-week rules come first, the catch-all rule for the folder last.
    new_path = move(path, ("Tutorien","Tutorium 23 - Sebastian Faller", "Tutorium 1"), ("Tutorium","Sebastian", "tut01"))
    if new_path is not None: return new_path

    new_path = move(path, ("Tutorien","Tutorium 23 - Sebastian Faller", "Tutorium 2", "aufgaben.pdf"), ("Tutorium","Sebastian", "tut02.pdf"))
    if new_path is not None: return new_path

    new_path = move(path, ("Tutorien","Tutorium 23 - Sebastian Faller", "Tutorium 3", "aufgaben.pdf"), ("Tutorium","Sebastian", "tut03.pdf"))
    if new_path is not None: return new_path

    new_path = move(path, ("Tutorien","Tutorium 23 - Sebastian Faller", "Tutorium 4", "aufgaben.pdf"), ("Tutorium","Sebastian", "tut04.pdf"))
    if new_path is not None: return new_path

    new_path = move(path, ("Tutorien","Tutorium 23 - Sebastian Faller", "Tutorium 5", "aufgaben.pdf"), ("Tutorium","Sebastian", "tut05.pdf"))
    if new_path is not None: return new_path

    new_path = move(path, ("Tutorien","Tutorium 23 - Sebastian Faller", "Tutorium 6", "aufgaben.pdf"), ("Tutorium","Sebastian", "tut06.pdf"))
    if new_path is not None: return new_path

    new_path = move(path, ("Tutorien","Tutorium 23 - Sebastian Faller", "Tutorium 7", "tut7.pdf"), ("Tutorium","Sebastian", "tut07.pdf"))
    if new_path is not None: return new_path

    new_path = move(path, ("Tutorien","Tutorium 23 - Sebastian Faller", "Tutorium 8", "tut8.pdf"), ("Tutorium","Sebastian", "tut08.pdf"))
    if new_path is not None: return new_path

    new_path = move(path, ("Tutorien","Tutorium 23 - Sebastian Faller", "Tutorium 9", "tut9.pdf"), ("Tutorium","Sebastian", "tut09.pdf"))
    if new_path is not None: return new_path

    # NOTE(review): this is the only rule that returns None - presumably the
    # caller skips such files; confirm against the caller.
    if path.parts == ("Tutorien","Tutorium 23 - Sebastian Faller", "Tutorium 10", "tut10.pdf"): return None

    new_path = move(path, ("Tutorien","Tutorium 23 - Sebastian Faller"), ("Tutorium","Sebastian"))
    if new_path is not None:
        return new_path

    # Exercise material goes to Übung/*
    new_path = move(path, ("Informatikervorlesung", "Übungsfolien"), ("Übung",))
    if new_path is not None:
        if new_path.name == "Übung_06_ausgewählte Folien.pdf":
            return rename(new_path, "Übung_06_ausgewählte_Folien.pdf")

        return new_path

    # Lecture slide material goes to Folien/*
    new_path = move(path, ("Informatikervorlesung", "Folien.Notizen"), ("Folien",))
    if new_path is not None:
        return new_path

    # Everything else goes to the top-level directory
    new_path = move(path, ("Informatikervorlesung",), ())
    if new_path is not None:
        # Rename filenames that are invalid on FAT systems
        if new_path.name == "Evaluationsergebnisse: Übung.pdf":
            return rename(new_path, "Evaluationsergebnisse_Übung.pdf")
        if new_path.name == "Skript \"Lineare Algebra\" von Stefan Kühnlein.pdf":
            return rename(new_path, "Skript Lineare Algebra von Stefan kühnlein.pdf")

        return new_path

    return path
|  |  | ||||||
def prog_filter(path):
    """
    Decide whether a path should be kept: everything except "Tutorien" and its
    contents.
    """
    # Filter out the tutorials
    return path.parts[:1] != ("Tutorien",)
|  |  | ||||||
def prog_transform(path):
    """
    Map a remote path to its local path.

    Exercise sheets go to "Blätter" and lecture material to "Folien". A few
    badly named files are renamed on the way. Other paths are returned
    unchanged.
    """
    # Exercise sheets go to Blätter/*
    sheet = move(path, ("Übungen",), ("Blätter",))
    if sheet is not None:
        if sheet.name == "assignmen04.pdf":
            return rename(sheet, "assignment04.pdf")
        return sheet

    # Slides go to Folien/*
    slide = move(path, ("Vorlesungsmaterial",), ("Folien",))
    if slide is None:
        return path

    slide_renames = {
        "00.1_Begruessung.pdf": "00-01_Begruessung.pdf",
        "00.2_Organisatorisches.pdf": "00-02_Organisatorisches.pdf",
        "01-01_ Einfache-Programme.pdf": "01-01_Einfache_Programme.pdf",
        "13_Finden_und_ Beheben_von_Fehlern.pdf": "13_Finden_und_Beheben_von_Fehlern.pdf",
    }
    fixed_name = slide_renames.get(slide.name)
    if fixed_name is None:
        return slide
    return rename(slide, fixed_name)
|  |  | ||||||
| # Semester 2 |  | ||||||
|  |  | ||||||
def algo1_filter(path):
    """
    Decide whether a path should be kept.

    Everything is kept except the contents of "Tutorien". The folder itself is
    kept.
    """
    segments = path.parts

    # Filter out the tutorials. To keep a single tutorial as well, also accept
    # paths whose second segment is the name of that tutorial.
    return segments[:1] != ("Tutorien",) or segments[1:] == ()
|  |  | ||||||
def algo1_transform(path):
    """
    Map a remote path to its local path: "Vorlesungsfolien" becomes "Folien",
    everything else is returned unchanged.
    """
    # Slides go to Folien/*
    moved = move(path, ("Vorlesungsfolien",), ("Folien",))
    return path if moved is None else moved
|  |  | ||||||
def hm2_transform(path):
    """
    Move exercise sheets and their solutions into "Blätter".

    Sheets are renamed to blatt_xx.pdf and solutions to loesung_xx.pdf, with
    the number zero-padded to two digits. Other paths are returned unchanged.
    """
    def to_sheet_folder(new_name):
        return rename(move(path, (), ("Blätter",)), new_name)

    sheet = re.match(r"blatt(\d+).pdf", path.name)
    if sheet:
        number = int(sheet.group(1))
        return to_sheet_folder(f"blatt_{number:02}.pdf")

    solution = re.match(r"blatt(\d+).loesungen.pdf", path.name)
    if solution:
        number = int(solution.group(1))
        return to_sheet_folder(f"loesung_{number:02}.pdf")

    return path
|  |  | ||||||
def la2_filter(path):
    """Filter callback for the LA2 course: filter out the tutorials.

    Every path outside the top-level "Tutorien" folder passes. Inside
    "Tutorien" only the folder entry itself passes.
    """
    parts = path.parts
    if parts[:1] != ("Tutorien",):
        return True

    # To let a single tutorial through, additionally accept e.g.
    # parts[1:2] == ("Tutorium 15",) here.
    return parts[1:] == ()
|  |  | ||||||
def la2_transform(path):
    """Transform callback for the LA2 course.

    * Slides: paths accepted by move() for the prefix
      ("Vorlesungsmaterial",) are relocated to "Folien".
    * Exercises: paths accepted by move() for the prefix ("Übungen",) are
      relocated to "Blätter"; sheets named ``Blatt<N>.pdf`` are
      additionally renamed to ``blatt_<NN>.pdf`` (number zero-padded to
      two digits), all other exercise material keeps its name.
    * Anything else is returned unchanged.
    """
    # Slides go to Folien/*
    new_path = move(path, ("Vorlesungsmaterial",), ("Folien",))
    if new_path is not None:
        return new_path

    # All exercise sheets go to Blätter/blatt_xx.pdf,
    # all other exercise material to Blätter/*
    new_path = move(path, ("Übungen",), ("Blätter",))
    if new_path is not None:
        # The whole name has to match and the extension dot is literal, so
        # names such as "Blatt3xpdf" or "Blatt3.pdf.bak" keep their name
        # instead of being renamed like a sheet.
        match = re.fullmatch(r"Blatt(\d+)\.pdf", new_path.name)
        if match:
            number = int(match.group(1))
            return rename(new_path, f"blatt_{number:02}.pdf")

        return new_path

    return path
|  |  | ||||||
def swt1_filter(path):
    """Filter callback for the SWT1 course: filter out the tutorials.

    Returns True unless the path lies below the top-level "Tutorien"
    folder; the "Tutorien" folder entry itself still returns True.
    """
    in_tutorials = path.parts[:1] == ("Tutorien",)
    is_folder_itself = path.parts[1:] == ()
    # To let a single tutorial through, additionally accept e.g.
    # path.parts[1:2] == ("Tutorium 15",) here.
    return is_folder_itself if in_tutorials else True
|  |  | ||||||
def swt1_transform(path):
    """Transform callback for the SWT1 course.

    Tries the relocations below in order and returns the result of the
    first one for which move() does not return None. If none applies, the
    path is returned unchanged.
    """
    relocations = (
        # Slides go to Folien/*
        (("Vorlesungsmaterial",), ("Folien",)),
        # Exercise sheets go to Blätter/*
        (("Übungen",), ("Blätter",)),
    )

    for source, target in relocations:
        new_path = move(path, source, target)
        if new_path is not None:
            return new_path

    return path
|  |  | ||||||
| # Main part of the config |  | ||||||
|  |  | ||||||
def main(args):
    """Synchronize the configured courses.

    args is a list of course keys, compared case-insensitively (e.g.
    ["gbi", "hm2"]). With an empty list every course is synchronized;
    otherwise only the named ones, always in the order of the table below.
    """
    wanted = [arg.lower() for arg in args]

    ffm = PFERD.FfM(base_dir)
    ilias = PFERD.Ilias(base_dir, "cookie_jar")
    norbert = PFERD.Norbert(base_dir)

    # (key, synchronizer, positional arguments, keyword arguments)
    jobs = [
        # Semester 1
        ("gbi", ilias, ("855240", "GBI"),
            {"transform": gbi_transform, "filter": gbi_filter}),
        ("hm1", ffm, ("iana2/lehre/hm1info2018w", "HM1"),
            {"transform": hm1_transform}),
        ("la1", ilias, ("874938", "LA1"),
            {"transform": la1_transform, "filter": la1_filter}),
        ("prog", ilias, ("851237", "Prog"),
            {"transform": prog_transform, "filter": prog_filter}),
        ("norbert", norbert, ("Prog-Tut",), {}),

        # Semester 2
        ("algo1", ilias, ("959260", "Algo1"),
            {"transform": algo1_transform, "filter": algo1_filter}),
        ("hm2", ffm, ("iana2/lehre/hm2info2019s", "HM2"),
            {"transform": hm2_transform}),
        ("la2", ilias, ("950588", "LA2"),
            {"transform": la2_transform, "filter": la2_filter}),
        ("swt1", ilias, ("945596", "SWT1"),
            {"transform": swt1_transform, "filter": swt1_filter}),
    ]

    for key, synchronizer, positional, keywords in jobs:
        if not wanted or key in wanted:
            synchronizer.synchronize(*positional, **keywords)
|  |  | ||||||
| if __name__ == "__main__": |  | ||||||
|     args = sys.argv[1:] |  | ||||||
|     main(args) |  | ||||||
							
								
								
									
										27
									
								
								flake.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								flake.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | |||||||
|  | { | ||||||
|  |   "nodes": { | ||||||
|  |     "nixpkgs": { | ||||||
|  |       "locked": { | ||||||
|  |         "lastModified": 1744440957, | ||||||
|  |         "narHash": "sha256-FHlSkNqFmPxPJvy+6fNLaNeWnF1lZSgqVCl/eWaJRc4=", | ||||||
|  |         "owner": "NixOS", | ||||||
|  |         "repo": "nixpkgs", | ||||||
|  |         "rev": "26d499fc9f1d567283d5d56fcf367edd815dba1d", | ||||||
|  |         "type": "github" | ||||||
|  |       }, | ||||||
|  |       "original": { | ||||||
|  |         "owner": "NixOS", | ||||||
|  |         "ref": "nixos-24.11", | ||||||
|  |         "repo": "nixpkgs", | ||||||
|  |         "type": "github" | ||||||
|  |       } | ||||||
|  |     }, | ||||||
|  |     "root": { | ||||||
|  |       "inputs": { | ||||||
|  |         "nixpkgs": "nixpkgs" | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   }, | ||||||
|  |   "root": "root", | ||||||
|  |   "version": 7 | ||||||
|  | } | ||||||
							
								
								
									
										41
									
								
								flake.nix
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								flake.nix
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,41 @@ | |||||||
{
  description = "Tool for downloading course-related files from ILIAS";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11";
  };

  outputs = { self, nixpkgs }:
    let
      # Helper function to generate an attrset '{ x86_64-linux = f "x86_64-linux"; ... }'.
      forAllSystems = nixpkgs.lib.genAttrs nixpkgs.lib.systems.flakeExposed;
    in
    {
      # One "default" package per exposed system.
      # Neither attribute set below refers to its own attributes, so plain
      # (non-recursive) sets are used.
      packages = forAllSystems (system:
        let pkgs = import nixpkgs { inherit system; };
        in
        {
          default = pkgs.python3Packages.buildPythonApplication {
            pname = "pferd";
            # Performing black magic
            # Don't worry, I sacrificed enough goats for the next few years
            #
            # The Python file is parsed with the TOML parser and its VERSION
            # key is used as the package version. This only works as long as
            # PFERD/version.py remains valid TOML.
            version = (pkgs.lib.importTOML ./PFERD/version.py).VERSION;
            format = "pyproject";

            src = ./.;

            # Build backend
            nativeBuildInputs = with pkgs.python3Packages; [
              setuptools
            ];

            # Runtime dependencies
            propagatedBuildInputs = with pkgs.python3Packages; [
              aiohttp
              beautifulsoup4
              rich
              keyring
              certifi
            ];
          };
        });
    };
}
							
								
								
									
										6
									
								
								pferd.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								pferd.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,6 @@ | |||||||
|  | # File used by pyinstaller to create the executable | ||||||
|  |  | ||||||
|  | from PFERD.__main__ import main | ||||||
|  |  | ||||||
|  | if __name__ == "__main__": | ||||||
|  |     main() | ||||||
							
								
								
									
										42
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,42 @@ | |||||||
|  | [build-system] | ||||||
|  | requires = ["setuptools", "wheel"] | ||||||
|  | build-backend = "setuptools.build_meta" | ||||||
|  |  | ||||||
|  | [project] | ||||||
|  | name = "PFERD" | ||||||
|  | dependencies = [ | ||||||
|  |   "aiohttp>=3.8.1", | ||||||
|  |   "beautifulsoup4>=4.10.0", | ||||||
|  |   "rich>=11.0.0", | ||||||
|  |   "keyring>=23.5.0", | ||||||
|  |   "certifi>=2021.10.8" | ||||||
|  | ] | ||||||
|  | dynamic = ["version"] | ||||||
|  | requires-python = ">=3.11" | ||||||
|  |  | ||||||
|  | [project.scripts] | ||||||
|  | pferd = "PFERD.__main__:main" | ||||||
|  |  | ||||||
|  | [tool.setuptools.dynamic] | ||||||
|  | version = {attr = "PFERD.version.VERSION"} | ||||||
|  |  | ||||||
|  | [tool.flake8] | ||||||
|  | max-line-length = 110 | ||||||
|  |  | ||||||
|  | [tool.isort] | ||||||
|  | line_length = 110 | ||||||
|  |  | ||||||
|  | [tool.autopep8] | ||||||
|  | max_line_length = 110 | ||||||
|  | in-place = true | ||||||
|  | recursive = true | ||||||
|  |  | ||||||
|  | [tool.mypy] | ||||||
|  | disallow_any_generics = true | ||||||
|  | disallow_untyped_defs = true | ||||||
|  | disallow_incomplete_defs = true | ||||||
|  | no_implicit_optional = true | ||||||
|  | warn_unused_ignores = true | ||||||
|  | warn_unreachable = true | ||||||
|  | show_error_context = true | ||||||
|  | ignore_missing_imports = true | ||||||
							
								
								
									
										5
									
								
								scripts/build
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										5
									
								
								scripts/build
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,5 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  |  | ||||||
|  | set -e | ||||||
|  |  | ||||||
|  | pyinstaller --onefile pferd.py | ||||||
							
								
								
									
										111
									
								
								scripts/bump-version
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										111
									
								
								scripts/bump-version
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,111 @@ | |||||||
|  | #!/usr/bin/env python3 | ||||||
|  |  | ||||||
|  | import argparse | ||||||
|  | import re | ||||||
|  | import time | ||||||
|  | from subprocess import run | ||||||
|  |  | ||||||
|  |  | ||||||
def load_changelog():
    """Return the lines of ``CHANGELOG.md`` from the current directory.

    Each list element is one line including its trailing newline. The file
    is decoded as UTF-8 explicitly so the result does not depend on the
    platform's default encoding.
    """
    with open("CHANGELOG.md", encoding="utf-8") as f:
        return f.readlines()
|  |  | ||||||
|  |  | ||||||
def extract_changes(lines):
    """Return the body of the "## Unreleased" changelog section.

    lines is an iterable of changelog lines (with trailing newlines). The
    result is the list of lines between the "## Unreleased" heading and the
    next "## " heading (or the end of the input), with

    * the "### " prefix removed from sub-headings, so git doesn't interpret
      those lines as comments when the text is used as a tag annotation,
    * leading and trailing blank lines removed.

    Raises ValueError if there is no "## Unreleased" heading.
    """
    lines = iter(lines)

    # Find "Unreleased" section
    for line in lines:
        if line.strip() == "## Unreleased":
            break
    else:
        # Fail with a clear message instead of leaking a bare StopIteration.
        raise ValueError('changelog has no "## Unreleased" section')

    # Read all lines from that section
    changes = []
    for line in lines:
        if line.startswith("## "):
            # Found the beginning of the next section
            break
        if line.startswith("### "):
            # Found a heading in the current section: drop the "#" symbols
            line = line[4:]
        changes.append(line)

    # Remove blank lines around the section body. Only blank lines are
    # dropped, so content directly below the heading is kept.
    while changes and not changes[0].strip():
        changes.pop(0)
    while changes and not changes[-1].strip():
        changes.pop()

    return changes
|  |  | ||||||
|  |  | ||||||
def update_version(version):
    """Set the ``VERSION = "..."`` assignment in ``PFERD/version.py``.

    Every occurrence of ``VERSION = "<anything>"`` in the file is replaced
    by ``VERSION = "<version>"``. The file is read and written as UTF-8.

    Raises ValueError (leaving the file untouched) if the file contains no
    such assignment, so a release cannot silently keep the old version.
    """
    path = "PFERD/version.py"

    with open(path, encoding="utf-8") as f:
        text = f.read()

    # A callable replacement inserts the version verbatim; a plain string
    # would be parsed as a regex template (backslashes, group references).
    text, count = re.subn(r'VERSION = ".*"', lambda _: f'VERSION = "{version}"', text)
    if count == 0:
        raise ValueError(f'no VERSION = "..." assignment found in {path}')

    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
|  |  | ||||||
|  |  | ||||||
def update_changelog(lines, version, date):
    """Write ``CHANGELOG.md`` with a new release heading inserted.

    lines is an iterable of the current changelog lines (with trailing
    newlines). A blank line and the heading ``## <version> - <date>`` are
    inserted directly below the first "## Unreleased" line, so the previous
    content of that section now belongs to the new release. If there is no
    such line, the blank line and heading are appended at the end.

    The file is written as UTF-8 regardless of the platform default.
    """
    lines = list(lines)

    # Position right after the "Unreleased" heading (end of file if absent)
    insert_at = len(lines)
    for index, line in enumerate(lines):
        if line.strip() == "## Unreleased":
            insert_at = index + 1
            break

    # Add new heading below that, followed by the remaining lines
    new_lines = lines[:insert_at] + ["\n", f"## {version} - {date}\n"] + lines[insert_at:]

    with open("CHANGELOG.md", "w", encoding="utf-8") as f:
        f.writelines(new_lines)
|  |  | ||||||
|  |  | ||||||
def commit_changes(version):
    """Stage the changelog and version file and commit them.

    The commit message is ``Bump version to <version>``.

    Raises subprocess.CalledProcessError if either git command exits with a
    non-zero status, so a failed ``git add`` is not followed by a commit and
    a failed commit stops the caller.
    """
    run(["git", "add", "CHANGELOG.md", "PFERD/version.py"], check=True)
    run(["git", "commit", "-m", f"Bump version to {version}"], check=True)
|  |  | ||||||
|  |  | ||||||
def create_tag(version, annotation):
    """Create the annotated git tag ``v<version>`` with the given message.

    Raises subprocess.CalledProcessError if git exits with a non-zero
    status (for example when the tag already exists).
    """
    run(["git", "tag", "-am", annotation, f"v{version}"], check=True)
|  |  | ||||||
|  |  | ||||||
def fastforward_latest():
    """Point the ``latest`` branch at the current ``HEAD``.

    Uses ``git branch -f``, i.e. the branch is created or moved
    unconditionally. Raises subprocess.CalledProcessError if git exits with
    a non-zero status.
    """
    run(["git", "branch", "-f", "latest", "HEAD"], check=True)
|  |  | ||||||
|  |  | ||||||
def main():
    """Release a new version given as the single command line argument.

    Steps, in order: read the changelog, build the tag annotation from the
    extracted changes, update the version file and the changelog, commit,
    tag, move the ``latest`` branch, and finally print the push command.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("version")
    version = parser.parse_args().version

    today = time.strftime("%Y-%m-%d")
    changelog_lines = load_changelog()

    # Tag annotation: title line, blank line, then the extracted changes
    title = f"Version {version} - {today}\n\n"
    annotation = title + "".join(extract_changes(changelog_lines))

    update_version(version)
    update_changelog(changelog_lines, version, today)
    commit_changes(version)
    create_tag(version, annotation)
    fastforward_latest()

    print()
    print("Now the only thing left is to publish the changes:")
    print(f"  $ git push origin master latest v{version}")
|  |  | ||||||
|  |  | ||||||
|  | if __name__ == "__main__": | ||||||
|  |     main() | ||||||
							
								
								
									
										6
									
								
								scripts/check
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										6
									
								
								scripts/check
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,6 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  |  | ||||||
|  | set -e | ||||||
|  |  | ||||||
|  | mypy . | ||||||
|  | flake8 PFERD | ||||||
							
								
								
									
										6
									
								
								scripts/format
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										6
									
								
								scripts/format
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,6 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  |  | ||||||
|  | set -e | ||||||
|  |  | ||||||
|  | autopep8 . | ||||||
|  | isort . | ||||||
							
								
								
									
										17
									
								
								scripts/setup
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										17
									
								
								scripts/setup
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,17 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  |  | ||||||
|  | set -e | ||||||
|  |  | ||||||
|  | # Updating pip and setuptools because some older versions don't recognize the | ||||||
|  | # project setup correctly | ||||||
|  | if [[ $1 != '--no-pip' ]]; then | ||||||
|  |     pip install --upgrade pip | ||||||
|  | fi | ||||||
|  | pip install --upgrade setuptools | ||||||
|  |  | ||||||
|  | # Installing PFERD itself | ||||||
|  | pip install --editable . | ||||||
|  |  | ||||||
|  | # Installing tools and type hints | ||||||
|  | pip install --upgrade mypy flake8 flake8-pyproject autopep8 isort pyinstaller | ||||||
|  | pip install --upgrade types-chardet types-certifi | ||||||
							
								
								
									
										15
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								setup.py
									
									
									
									
									
								
							| @@ -1,15 +0,0 @@ | |||||||
| from setuptools import setup |  | ||||||
|  |  | ||||||
| setup( |  | ||||||
|         name="PFERD", |  | ||||||
|         version="1.1.1", |  | ||||||
|         packages=["PFERD"], |  | ||||||
|         install_requires=[ |  | ||||||
|             "requests>=2.21.0", |  | ||||||
|             "beautifulsoup4>=4.7.1", |  | ||||||
|         ], |  | ||||||
| ) |  | ||||||
|  |  | ||||||
| # When updating the version, also: |  | ||||||
| # - update the README.md installation instructions |  | ||||||
| # - set a tag on the update commit |  | ||||||
		Reference in New Issue
	
	Block a user