mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-10-22 09:42:31 +02:00 
			
		
		
		
	Compare commits
	
		
			74 Commits
		
	
	
		
			debug/mtim
			...
			fix/exerci
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | dd2fedf1a2 | ||
|   | 77a23265a9 | ||
|   | 4c230ef6dd | ||
|   | b305e1ce23 | ||
|   | bdf17f5c87 | ||
|   | 77fce7daf8 | ||
|   | 653bf139f0 | ||
|   | 3f60638d33 | ||
|   | b97b6fae6b | ||
|   | 477234ad0d | ||
|   | 63f25277b0 | ||
|   | c8eff04ae0 | ||
|   | edc482cdf4 | ||
|   | 72cd0f77e2 | ||
|   | be175f9347 | ||
|   | ba2833dba5 | ||
|   | 2f0e792670 | ||
|   | 5f88539f7e | ||
|   | bd9d7efe64 | ||
|   | 16a2dd5b15 | ||
|   | 678283d341 | ||
|   | 287173b0b1 | ||
|   | 712217e959 | ||
|   | 6dda4c55a8 | ||
|   | 596b6a7688 | ||
|   | 5983200247 | ||
|   | 26e802d88b | ||
|   | f5c4e82816 | ||
|   | f5273f7ca0 | ||
|   | fa71a9f44f | ||
|   | 81d6ff53c4 | ||
|   | d7a2b6e019 | ||
|   | 71c65e89d1 | ||
|   | c1046498e7 | ||
|   | 8fbd1978af | ||
|   | 739dd95850 | ||
|   | c54c3bcfa1 | ||
|   | d7f2229978 | ||
|   | 52fdeae752 | ||
|   | f9bb2e41cf | ||
|   | 4f9e2ab48d | ||
|   | 19beb8f07b | ||
|   | c897d9e2f5 | ||
|   | 21a266e302 | ||
|   | b29b6f93f8 | ||
|   | 318226d7cb | ||
|   | 422cf05f15 | ||
|   | 819c6673c7 | ||
|   | 89b44c69a7 | ||
|   | 4b4f72b2ca | ||
|   | 778517d8c6 | ||
|   | 428b0179fc | ||
|   | ade6309dd9 | ||
|   | fd6cb7b966 | ||
|   | 5c87517ceb | ||
|   | b01f093474 | ||
|   | 3a05b90525 | ||
|   | 7a00f73e0e | ||
|   | 5d0621420e | ||
|   | df98153169 | ||
|   | fc1f68ccd9 | ||
|   | 3e831c7e23 | ||
|   | bbcfe9c8dd | ||
|   | eb01aa86cb | ||
|   | 3db186a978 | ||
|   | 4a5959fd58 | ||
|   | 1cbc2b717a | ||
|   | da627ff929 | ||
|   | c1b592ac29 | ||
|   | eb0c956d32 | ||
|   | ab0cb2d956 | ||
|   | a117126389 | ||
|   | e9f8901520 | ||
|   | 266812f90e | 
							
								
								
									
										10
									
								
								.github/dependabot.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								.github/dependabot.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | ||||
| version: 2 | ||||
| updates: | ||||
|   - package-ecosystem: github-actions | ||||
|     directory: / | ||||
|     schedule: | ||||
|       interval: monthly | ||||
|     groups: | ||||
|       gh-actions: | ||||
|         patterns: | ||||
|           - "*" | ||||
							
								
								
									
										31
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										31
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
								
							| @@ -1,6 +1,6 @@ | ||||
| name: build-and-release | ||||
|  | ||||
| on: push | ||||
| on: [push, pull_request] | ||||
|  | ||||
| defaults: | ||||
|   run: | ||||
| @@ -13,13 +13,12 @@ jobs: | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         os: [ubuntu-latest, windows-latest, macos-latest] | ||||
|         python: ["3.9"] | ||||
|         os: [ubuntu-latest, windows-latest, macos-13, macos-latest] | ||||
|         python: ["3.11"] | ||||
|     steps: | ||||
|       - uses: actions/checkout@v4 | ||||
|  | ||||
|       - uses: actions/checkout@v3 | ||||
|  | ||||
|       - uses: actions/setup-python@v4 | ||||
|       - uses: actions/setup-python@v5 | ||||
|         with: | ||||
|           python-version: ${{ matrix.python }} | ||||
|  | ||||
| @@ -34,7 +33,12 @@ jobs: | ||||
|         run: ./scripts/setup --no-pip | ||||
|  | ||||
|       - name: Run checks | ||||
|         run: ./scripts/check | ||||
|         run: | | ||||
|           ./scripts/check | ||||
|           ./scripts/format | ||||
|  | ||||
|       - name: Assert no changes | ||||
|         run: git diff --exit-code | ||||
|  | ||||
|       - name: Build | ||||
|         run: ./scripts/build | ||||
| @@ -45,9 +49,9 @@ jobs: | ||||
|         run: mv dist/pferd* dist/pferd-${{ matrix.os }} | ||||
|  | ||||
|       - name: Upload binary | ||||
|         uses: actions/upload-artifact@v3 | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: Binaries | ||||
|           name: pferd-${{ matrix.os }} | ||||
|           path: dist/pferd-${{ matrix.os }} | ||||
|  | ||||
|   release: | ||||
| @@ -57,18 +61,20 @@ jobs: | ||||
|     steps: | ||||
|  | ||||
|       - name: Download binaries | ||||
|         uses: actions/download-artifact@v3 | ||||
|         uses: actions/download-artifact@v4 | ||||
|         with: | ||||
|           name: Binaries | ||||
|           pattern: pferd-* | ||||
|           merge-multiple: true | ||||
|  | ||||
|       - name: Rename binaries | ||||
|         run: | | ||||
|           mv pferd-ubuntu-latest pferd-linux | ||||
|           mv pferd-windows-latest pferd-windows.exe | ||||
|           mv pferd-macos-13 pferd-mac-x86_64 | ||||
|           mv pferd-macos-latest pferd-mac | ||||
|  | ||||
|       - name: Create release | ||||
|         uses: softprops/action-gh-release@v1 | ||||
|         uses: softprops/action-gh-release@v2 | ||||
|         env: | ||||
|           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||
|         with: | ||||
| @@ -76,3 +82,4 @@ jobs: | ||||
|             pferd-linux | ||||
|             pferd-windows.exe | ||||
|             pferd-mac | ||||
|             pferd-mac-x86_64 | ||||
|   | ||||
							
								
								
									
										82
									
								
								CHANGELOG.md
									
									
									
									
									
								
							
							
						
						
									
										82
									
								
								CHANGELOG.md
									
									
									
									
									
								
							| @@ -22,6 +22,88 @@ ambiguous situations. | ||||
|  | ||||
| ## Unreleased | ||||
|  | ||||
| ### Fixed | ||||
| - Crawling of exercises with instructions | ||||
|  | ||||
| ## 3.8.2 - 2025-04-29 | ||||
|  | ||||
| ### Changed | ||||
| - Explicitly mention that wikis are not supported at the moment and ignore them | ||||
|  | ||||
| ### Fixed | ||||
| - Ilias-native login | ||||
| - Exercise crawling | ||||
|  | ||||
| ## 3.8.1 - 2025-04-17 | ||||
|  | ||||
| ### Fixed | ||||
| - Description HTML files now specify a UTF-8 encoding | ||||
| - Images in descriptions now always have a white background | ||||
|  | ||||
| ## 3.8.0 - 2025-04-16 | ||||
|  | ||||
| ### Added | ||||
| - Support for ILIAS 9 | ||||
|  | ||||
| ### Changed | ||||
| - Added prettier CSS to forum threads | ||||
| - Downloaded forum threads now link to the forum instead of the ILIAS thread | ||||
| - Increase minimum supported Python version to 3.11 | ||||
| - Do not crawl nested courses (courses linked in other courses) | ||||
|  | ||||
| ### Fixed | ||||
| - File links in report on Windows | ||||
| - TOTP authentication in KIT Shibboleth | ||||
| - Forum crawling only considering the first 20 entries | ||||
|  | ||||
| ## 3.7.0 - 2024-11-13 | ||||
|  | ||||
| ### Added | ||||
| - Support for MOB videos in page descriptions | ||||
| - Clickable links in the report to directly open new/modified/not-deleted files | ||||
| - Support for non KIT shibboleth login | ||||
|  | ||||
| ### Changed | ||||
| - Remove videos from description pages | ||||
| - Perform ILIAS cycle detection after processing the transform to allow | ||||
|   ignoring duplicated elements | ||||
| - Parse headings (h1-h3) as folders in kit-ipd crawler | ||||
|  | ||||
| ### Fixed | ||||
| - Personal desktop/dashboard/favorites crawling | ||||
| - Crawling of nested courses | ||||
| - Downloading of links with no target URL | ||||
| - Handle row flex on description pages | ||||
| - Add `<!DOCTYPE html>` heading to forum threads to fix mime type detection | ||||
| - Handle groups in cards | ||||
|  | ||||
| ## 3.6.0 - 2024-10-23 | ||||
|  | ||||
| ### Added | ||||
| - Generic `ilias-web` crawler and `ilias-web` CLI command | ||||
| - Support for the course overview page. Using this URL as a target might cause | ||||
|   duplication warnings, as subgroups are listed separately. | ||||
| - Support for named capture groups in regex transforms | ||||
| - Crawl custom item groups as folders | ||||
|  | ||||
| ### Fixed | ||||
| - Normalization of meeting names in cards | ||||
| - Sanitization of slashes in exercise container names | ||||
|  | ||||
| ## 3.5.2 - 2024-04-14 | ||||
|  | ||||
| ### Fixed | ||||
| - Crawling of personal desktop with ILIAS 8 | ||||
| - Crawling of empty personal desktops | ||||
|  | ||||
| ## 3.5.1 - 2024-04-09 | ||||
|  | ||||
| ### Added | ||||
| - Support for ILIAS 8 | ||||
|  | ||||
| ### Fixed | ||||
| - Video name deduplication | ||||
|  | ||||
| ## 3.5.0 - 2023-09-13 | ||||
|  | ||||
| ### Added | ||||
|   | ||||
							
								
								
									
										84
									
								
								CONFIG.md
									
									
									
									
									
								
							
							
						
						
									
										84
									
								
								CONFIG.md
									
									
									
									
									
								
							| @@ -4,11 +4,11 @@ A config file consists of sections. A section begins with a `[section]` header, | ||||
| which is followed by a list of `key = value` pairs. Comments must be on their | ||||
| own line and start with `#`. Multiline values must be indented beyond their key. | ||||
| Boolean values can be `yes` or `no`. For more details and some examples on the | ||||
| format, see the [configparser documentation][1] ([interpolation][2] is | ||||
| disabled). | ||||
| format, see the [configparser documentation][cp-file] | ||||
| ([interpolation][cp-interp] is disabled). | ||||
|  | ||||
| [1]: <https://docs.python.org/3/library/configparser.html#supported-ini-file-structure> "Supported INI File Structure" | ||||
| [2]: <https://docs.python.org/3/library/configparser.html#interpolation-of-values> "Interpolation of values" | ||||
| [cp-file]: <https://docs.python.org/3/library/configparser.html#supported-ini-file-structure> "Supported INI File Structure" | ||||
| [cp-interp]: <https://docs.python.org/3/library/configparser.html#interpolation-of-values> "Interpolation of values" | ||||
|  | ||||
| ## The `DEFAULT` section | ||||
|  | ||||
| @@ -146,7 +146,7 @@ crawler simulate a slower, network-based crawler. | ||||
|  | ||||
| This crawler crawls a KIT-IPD page by url. The root page can be crawled from | ||||
| outside the KIT network so you will be informed about any new/deleted files, | ||||
| but downloading files requires you to be within. Adding a show delay between | ||||
| but downloading files requires you to be within. Adding a short delay between | ||||
| requests is likely a good idea. | ||||
|  | ||||
| - `target`: URL to a KIT-IPD page | ||||
| @@ -154,6 +154,63 @@ requests is likely a good idea. | ||||
|   matches, the given link is downloaded as a file. This is used to extract | ||||
|   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`) | ||||
|  | ||||
| ### The `ilias-web` crawler | ||||
|  | ||||
| This crawler crawls a generic ILIAS instance. | ||||
|  | ||||
| Inspired by [this ILIAS downloader][ilias-dl], the following configurations should work | ||||
| out of the box for the corresponding universities: | ||||
|  | ||||
| [ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs" | ||||
|  | ||||
| | University    | `base_url`                              | `login_type` | `client_id`   | | ||||
| |---------------|-----------------------------------------|--------------|---------------| | ||||
| | FH Aachen     | https://www.ili.fh-aachen.de            | local        | elearning     | | ||||
| | Uni Köln      | https://www.ilias.uni-koeln.de/ilias    | local        | uk            | | ||||
| | Uni Konstanz  | https://ilias.uni-konstanz.de           | local        | ILIASKONSTANZ | | ||||
| | Uni Stuttgart | https://ilias3.uni-stuttgart.de         | local        | Uni_Stuttgart | | ||||
| | Uni Tübingen  | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth   |               | | ||||
|  | ||||
| If your university isn't listed, try navigating to your instance's login page. | ||||
| Assuming no custom login service is used, the URL will look something like this: | ||||
|  | ||||
| ```jinja | ||||
| {{ base_url }}/login.php?client_id={{ client_id }}&cmd=force_login&lang= | ||||
| ``` | ||||
|  | ||||
| If the values work, feel free to submit a PR and add them to the table above. | ||||
|  | ||||
| - `base_url`: The URL where the ILIAS instance is located. (Required) | ||||
| - `login_type`: How you authenticate. (Required) | ||||
|     - `local`: Use `client_id` for authentication. | ||||
|     - `shibboleth`: Use shibboleth for authentication. | ||||
| - `client_id`: An ID used for authentication if `login_type` is `local`. Is | ||||
|   ignored if `login_type` is `shibboleth`. | ||||
| - `target`: The ILIAS element to crawl. (Required) | ||||
|     - `desktop`: Crawl your personal desktop / dashboard | ||||
|     - `<course id>`: Crawl the course with the given id | ||||
|     - `<url>`: Crawl a given element by URL (preferably the permanent URL linked | ||||
|       at the bottom of its ILIAS page).   | ||||
|       This also supports the "My Courses" overview page to download *all* | ||||
|       courses. Note that this might produce confusing local directory layouts | ||||
|       and duplication warnings if you are a member of an ILIAS group. The | ||||
|       `desktop` target is generally preferable. | ||||
| - `auth`: Name of auth section to use for login. (Required) | ||||
| - `tfa_auth`: Name of auth section to use for two-factor authentication. Only | ||||
|   uses the auth section's password. (Default: Anonymous `tfa` authenticator) | ||||
| - `links`: How to represent external links. (Default: `fancy`) | ||||
|     - `ignore`: Don't download links. | ||||
|     - `plaintext`: A text file containing only the URL. | ||||
|     - `fancy`: An HTML file looking like the ILIAS link element. | ||||
|     - `internet-shortcut`: An internet shortcut file (`.url` file). | ||||
| - `link_redirect_delay`: Time (in seconds) until `fancy` link files will | ||||
|   redirect to the actual URL. Set to a negative value to disable the automatic | ||||
|   redirect. (Default: `-1`) | ||||
| - `videos`: Whether to download videos. (Default: `no`) | ||||
| - `forums`: Whether to download forum threads. (Default: `no`) | ||||
| - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: | ||||
|   `20.0`) | ||||
|  | ||||
| ### The `kit-ilias-web` crawler | ||||
|  | ||||
| This crawler crawls the KIT ILIAS instance. | ||||
| @@ -232,10 +289,10 @@ is stored in the keyring. | ||||
|  | ||||
| ### The `pass` authenticator | ||||
|  | ||||
| This authenticator queries the [`pass` password manager][3] for a username and | ||||
| password. It tries to be mostly compatible with [browserpass][4] and | ||||
| [passff][5], so see those links for an overview of the format. If PFERD fails | ||||
| to load your password, you can use the `--explain` flag to see why. | ||||
| This authenticator queries the [`pass` password manager][pass] for a username | ||||
| and password. It tries to be mostly compatible with [browserpass][browserpass] | ||||
| and [passff][passff], so see those links for an overview of the format. If PFERD | ||||
| fails to load your password, you can use the `--explain` flag to see why. | ||||
|  | ||||
| - `passname`: The name of the password to use (Required) | ||||
| - `username_prefixes`: A comma-separated list of username line prefixes | ||||
| @@ -243,9 +300,9 @@ to load your password, you can use the `--explain` flag to see why. | ||||
| - `password_prefixes`: A comma-separated list of password line prefixes | ||||
|   (Default: `password,pass,secret`) | ||||
|  | ||||
| [3]: <https://www.passwordstore.org/> "Pass: The Standard Unix Password Manager" | ||||
| [4]: <https://github.com/browserpass/browserpass-extension#organizing-password-store> "Organizing password store" | ||||
| [5]: <https://github.com/passff/passff#multi-line-format> "Multi-line format" | ||||
| [pass]: <https://www.passwordstore.org/> "Pass: The Standard Unix Password Manager" | ||||
| [browserpass]: <https://github.com/browserpass/browserpass-extension#organizing-password-store> "Organizing password store" | ||||
| [passff]: <https://github.com/passff/passff#multi-line-format> "Multi-line format" | ||||
|  | ||||
| ### The `tfa` authenticator | ||||
|  | ||||
| @@ -344,7 +401,8 @@ matches `SOURCE`, the output path is created using `TARGET` as template. | ||||
| be referred to as `{g<n>}` (e.g. `{g3}`). `{g0}` refers to the original path. | ||||
| If capturing group *n*'s contents are a valid integer, the integer value is | ||||
| available as `{i<n>}` (e.g. `{i3}`). If capturing group *n*'s contents are a | ||||
| valid float, the float value is available as `{f<n>}` (e.g. `{f3}`). If a | ||||
| valid float, the float value is available as `{f<n>}` (e.g. `{f3}`). Named capture | ||||
| groups (e.g. `(?P<name>)`) are available by their name (e.g. `{name}`). If a | ||||
| capturing group is not present (e.g. when matching the string `cd` with the | ||||
| regex `(ab)?cd`), the corresponding variables are not defined. | ||||
|  | ||||
|   | ||||
							
								
								
									
										4
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								LICENSE
									
									
									
									
									
								
							| @@ -1,6 +1,6 @@ | ||||
| Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, | ||||
| Copyright 2019-2024 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, | ||||
|                     TheChristophe, Scriptim, thelukasprobst, Toorero, | ||||
|                     Mr-Pine | ||||
|                     Mr-Pine, p-fruck, PinieP | ||||
|  | ||||
| Permission is hereby granted, free of charge, to any person obtaining a copy of | ||||
| this software and associated documentation files (the "Software"), to deal in | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| from typing import Optional, Tuple | ||||
| from typing import Optional, Tuple, cast | ||||
|  | ||||
| import keyring | ||||
|  | ||||
| @@ -13,7 +13,7 @@ class KeyringAuthSection(AuthSection): | ||||
|         return self.s.get("username") | ||||
|  | ||||
|     def keyring_name(self) -> str: | ||||
|         return self.s.get("keyring_name", fallback=NAME) | ||||
|         return cast(str, self.s.get("keyring_name", fallback=NAME)) | ||||
|  | ||||
|  | ||||
| class KeyringAuthenticator(Authenticator): | ||||
|   | ||||
| @@ -8,6 +8,7 @@ | ||||
| # well. | ||||
|  | ||||
| from . import command_local  # noqa: F401 imported but unused | ||||
| from . import command_ilias_web  # noqa: F401 imported but unused | ||||
| from . import command_kit_ilias_web  # noqa: F401 imported but unused | ||||
| from . import command_kit_ipd  # noqa: F401 imported but unused | ||||
| from .parser import PARSER, ParserLoadError, load_default_section  # noqa: F401 imported but unused | ||||
|   | ||||
							
								
								
									
										56
									
								
								PFERD/cli/command_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								PFERD/cli/command_ilias_web.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,56 @@ | ||||
| import argparse | ||||
| import configparser | ||||
|  | ||||
| from ..logging import log | ||||
| from .common_ilias_args import configure_common_group_args, load_common | ||||
| from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||
|  | ||||
| # Name of the CLI subcommand; also written as the crawler "type" by load(). | ||||
| COMMAND_NAME = "ilias-web" | ||||
|  | ||||
| # Register the subcommand, using CRAWLER_PARSER as its parent parser. | ||||
| SUBPARSER = SUBPARSERS.add_parser( | ||||
|     COMMAND_NAME, | ||||
|     parents=[CRAWLER_PARSER], | ||||
| ) | ||||
|  | ||||
| # Argument group that collects all options of this crawler in the help output. | ||||
| GROUP = SUBPARSER.add_argument_group( | ||||
|     title=f"{COMMAND_NAME} crawler arguments", | ||||
|     description=f"arguments for the '{COMMAND_NAME}' crawler", | ||||
| ) | ||||
|  | ||||
| # Options specific to this command (argparse stores this one as args.base_url). | ||||
| GROUP.add_argument( | ||||
|     "--base-url", | ||||
|     type=str, | ||||
|     metavar="BASE_URL", | ||||
|     help="The base url of the ilias instance" | ||||
| ) | ||||
|  | ||||
| # Stored by argparse as args.client_id. | ||||
| GROUP.add_argument( | ||||
|     "--client-id", | ||||
|     type=str, | ||||
|     metavar="CLIENT_ID", | ||||
|     help="The client id of the ilias instance" | ||||
| ) | ||||
|  | ||||
| # Add the arguments defined by configure_common_group_args to the same group. | ||||
| configure_common_group_args(GROUP) | ||||
|  | ||||
|  | ||||
| def load( | ||||
|         args: argparse.Namespace, | ||||
|         parser: configparser.ConfigParser, | ||||
| ) -> None: | ||||
|     log.explain(f"Creating config for command '{COMMAND_NAME}'") | ||||
|  | ||||
|     parser["crawl:ilias"] = {} | ||||
|     section = parser["crawl:ilias"] | ||||
|     load_crawler(args, section) | ||||
|  | ||||
|     section["type"] = COMMAND_NAME | ||||
|     if args.ilias_url is not None: | ||||
|         section["base_url"] = args.ilias_url | ||||
|     if args.client_id is not None: | ||||
|         section["client_id"] = args.client_id | ||||
|  | ||||
|     load_common(section, args, parser) | ||||
|  | ||||
|  | ||||
| SUBPARSER.set_defaults(command=load) | ||||
| @@ -1,120 +1,37 @@ | ||||
| import argparse | ||||
| import configparser | ||||
| from pathlib import Path | ||||
|  | ||||
| from ..crawl.ilias.file_templates import Links | ||||
| from ..logging import log | ||||
| from .parser import (CRAWLER_PARSER, SUBPARSERS, BooleanOptionalAction, ParserLoadError, load_crawler, | ||||
|                      show_value_error) | ||||
| from .common_ilias_args import configure_common_group_args, load_common | ||||
| from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler | ||||
|  | ||||
| COMMAND_NAME = "kit-ilias-web" | ||||
|  | ||||
| SUBPARSER = SUBPARSERS.add_parser( | ||||
|     "kit-ilias-web", | ||||
|     COMMAND_NAME, | ||||
|     parents=[CRAWLER_PARSER], | ||||
| ) | ||||
|  | ||||
| GROUP = SUBPARSER.add_argument_group( | ||||
|     title="kit-ilias-web crawler arguments", | ||||
|     description="arguments for the 'kit-ilias-web' crawler", | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "target", | ||||
|     type=str, | ||||
|     metavar="TARGET", | ||||
|     help="course id, 'desktop', or ILIAS URL to crawl" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "output", | ||||
|     type=Path, | ||||
|     metavar="OUTPUT", | ||||
|     help="output directory" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--username", "-u", | ||||
|     type=str, | ||||
|     metavar="USERNAME", | ||||
|     help="user name for authentication" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--keyring", | ||||
|     action=BooleanOptionalAction, | ||||
|     help="use the system keyring to store and retrieve passwords" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--credential-file", | ||||
|     type=Path, | ||||
|     metavar="PATH", | ||||
|     help="read username and password from a credential file" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--links", | ||||
|     type=show_value_error(Links.from_string), | ||||
|     metavar="OPTION", | ||||
|     help="how to represent external links" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--link-redirect-delay", | ||||
|     type=int, | ||||
|     metavar="SECONDS", | ||||
|     help="time before 'fancy' links redirect to to their target (-1 to disable)" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--videos", | ||||
|     action=BooleanOptionalAction, | ||||
|     help="crawl and download videos" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--forums", | ||||
|     action=BooleanOptionalAction, | ||||
|     help="crawl and download forum posts" | ||||
| ) | ||||
| GROUP.add_argument( | ||||
|     "--http-timeout", "-t", | ||||
|     type=float, | ||||
|     metavar="SECONDS", | ||||
|     help="timeout for all HTTP requests" | ||||
|     title=f"{COMMAND_NAME} crawler arguments", | ||||
|     description=f"arguments for the '{COMMAND_NAME}' crawler", | ||||
| ) | ||||
|  | ||||
| configure_common_group_args(GROUP) | ||||
|  | ||||
|  | ||||
| def load( | ||||
|         args: argparse.Namespace, | ||||
|         parser: configparser.ConfigParser, | ||||
| ) -> None: | ||||
|     log.explain("Creating config for command 'kit-ilias-web'") | ||||
|     log.explain(f"Creating config for command '{COMMAND_NAME}'") | ||||
|  | ||||
|     parser["crawl:ilias"] = {} | ||||
|     section = parser["crawl:ilias"] | ||||
|     load_crawler(args, section) | ||||
|  | ||||
|     section["type"] = "kit-ilias-web" | ||||
|     section["target"] = str(args.target) | ||||
|     section["output_dir"] = str(args.output) | ||||
|     section["auth"] = "auth:ilias" | ||||
|     if args.links is not None: | ||||
|         section["links"] = str(args.links.value) | ||||
|     if args.link_redirect_delay is not None: | ||||
|         section["link_redirect_delay"] = str(args.link_redirect_delay) | ||||
|     if args.videos is not None: | ||||
|         section["videos"] = "yes" if args.videos else "no" | ||||
|     if args.forums is not None: | ||||
|         section["forums"] = "yes" if args.forums else "no" | ||||
|     if args.http_timeout is not None: | ||||
|         section["http_timeout"] = str(args.http_timeout) | ||||
|  | ||||
|     parser["auth:ilias"] = {} | ||||
|     auth_section = parser["auth:ilias"] | ||||
|     if args.credential_file is not None: | ||||
|         if args.username is not None: | ||||
|             raise ParserLoadError("--credential-file and --username can't be used together") | ||||
|         if args.keyring: | ||||
|             raise ParserLoadError("--credential-file and --keyring can't be used together") | ||||
|         auth_section["type"] = "credential-file" | ||||
|         auth_section["path"] = str(args.credential_file) | ||||
|     elif args.keyring: | ||||
|         auth_section["type"] = "keyring" | ||||
|     else: | ||||
|         auth_section["type"] = "simple" | ||||
|     if args.username is not None: | ||||
|         auth_section["username"] = args.username | ||||
|     section["type"] = COMMAND_NAME | ||||
|     load_common(section, args, parser) | ||||
|  | ||||
|  | ||||
| SUBPARSER.set_defaults(command=load) | ||||
|   | ||||
							
								
								
									
										104
									
								
								PFERD/cli/common_ilias_args.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										104
									
								
								PFERD/cli/common_ilias_args.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,104 @@ | ||||
| import argparse | ||||
| import configparser | ||||
| from pathlib import Path | ||||
|  | ||||
| from ..crawl.ilias.file_templates import Links | ||||
| from .parser import BooleanOptionalAction, ParserLoadError, show_value_error | ||||
|  | ||||
|  | ||||
| def configure_common_group_args(group: argparse._ArgumentGroup) -> None: | ||||
|     """These arguments are shared between the KIT and generic Ilias web command.""" | ||||
|     group.add_argument( | ||||
|         "target", | ||||
|         type=str, | ||||
|         metavar="TARGET", | ||||
|         help="course id, 'desktop', or ILIAS URL to crawl" | ||||
|     ) | ||||
|     group.add_argument( | ||||
|         "output", | ||||
|         type=Path, | ||||
|         metavar="OUTPUT", | ||||
|         help="output directory" | ||||
|     ) | ||||
|     group.add_argument( | ||||
|         "--username", "-u", | ||||
|         type=str, | ||||
|         metavar="USERNAME", | ||||
|         help="user name for authentication" | ||||
|     ) | ||||
|     group.add_argument( | ||||
|         "--keyring", | ||||
|         action=BooleanOptionalAction, | ||||
|         help="use the system keyring to store and retrieve passwords" | ||||
|     ) | ||||
|     group.add_argument( | ||||
|         "--credential-file", | ||||
|         type=Path, | ||||
|         metavar="PATH", | ||||
|         help="read username and password from a credential file" | ||||
|     ) | ||||
|     group.add_argument( | ||||
|         "--links", | ||||
|         type=show_value_error(Links.from_string), | ||||
|         metavar="OPTION", | ||||
|         help="how to represent external links" | ||||
|     ) | ||||
|     group.add_argument( | ||||
|         "--link-redirect-delay", | ||||
|         type=int, | ||||
|         metavar="SECONDS", | ||||
|         help="time before 'fancy' links redirect to to their target (-1 to disable)" | ||||
|     ) | ||||
|     group.add_argument( | ||||
|         "--videos", | ||||
|         action=BooleanOptionalAction, | ||||
|         help="crawl and download videos" | ||||
|     ) | ||||
|     group.add_argument( | ||||
|         "--forums", | ||||
|         action=BooleanOptionalAction, | ||||
|         help="crawl and download forum posts" | ||||
|     ) | ||||
|     group.add_argument( | ||||
|         "--http-timeout", "-t", | ||||
|         type=float, | ||||
|         metavar="SECONDS", | ||||
|         help="timeout for all HTTP requests" | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def load_common( | ||||
|     section: configparser.SectionProxy, | ||||
|     args: argparse.Namespace, | ||||
|     parser: configparser.ConfigParser, | ||||
| ) -> None: | ||||
|     """Load common config between generic and KIT ilias web command""" | ||||
|     section["target"] = str(args.target) | ||||
|     section["output_dir"] = str(args.output) | ||||
|     section["auth"] = "auth:ilias" | ||||
|     if args.links is not None: | ||||
|         section["links"] = str(args.links.value) | ||||
|     if args.link_redirect_delay is not None: | ||||
|         section["link_redirect_delay"] = str(args.link_redirect_delay) | ||||
|     if args.videos is not None: | ||||
|         section["videos"] = "yes" if args.videos else "no" | ||||
|     if args.forums is not None: | ||||
|         section["forums"] = "yes" if args.forums else "no" | ||||
|     if args.http_timeout is not None: | ||||
|         section["http_timeout"] = str(args.http_timeout) | ||||
|  | ||||
|     parser["auth:ilias"] = {} | ||||
|     auth_section = parser["auth:ilias"] | ||||
|     if args.credential_file is not None: | ||||
|         if args.username is not None: | ||||
|             raise ParserLoadError("--credential-file and --username can't be used together") | ||||
|         if args.keyring: | ||||
|             raise ParserLoadError("--credential-file and --keyring can't be used together") | ||||
|         auth_section["type"] = "credential-file" | ||||
|         auth_section["path"] = str(args.credential_file) | ||||
|     elif args.keyring: | ||||
|         auth_section["type"] = "keyring" | ||||
|     else: | ||||
|         auth_section["type"] = "simple" | ||||
|     if args.username is not None: | ||||
|         auth_section["username"] = args.username | ||||
| @@ -4,7 +4,7 @@ from typing import Callable, Dict | ||||
| from ..auth import Authenticator | ||||
| from ..config import Config | ||||
| from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401 | ||||
| from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection | ||||
| from .ilias import IliasWebCrawler, IliasWebCrawlerSection, KitIliasWebCrawler, KitIliasWebCrawlerSection | ||||
| from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection | ||||
| from .local_crawler import LocalCrawler, LocalCrawlerSection | ||||
|  | ||||
| @@ -18,6 +18,8 @@ CrawlerConstructor = Callable[[ | ||||
| CRAWLERS: Dict[str, CrawlerConstructor] = { | ||||
|     "local": lambda n, s, c, a: | ||||
|         LocalCrawler(n, LocalCrawlerSection(s), c), | ||||
|     "ilias-web": lambda n, s, c, a: | ||||
|         IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a), | ||||
|     "kit-ilias-web": lambda n, s, c, a: | ||||
|         KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a), | ||||
|     "kit-ipd": lambda n, s, c, a: | ||||
|   | ||||
| @@ -149,9 +149,7 @@ class CrawlerSection(Section): | ||||
|         return self.s.getboolean("skip", fallback=False) | ||||
|  | ||||
|     def output_dir(self, name: str) -> Path: | ||||
|         # TODO Use removeprefix() after switching to 3.9 | ||||
|         if name.startswith("crawl:"): | ||||
|             name = name[len("crawl:"):] | ||||
|         name = name.removeprefix("crawl:") | ||||
|         return Path(self.s.get("output_dir", name)).expanduser() | ||||
|  | ||||
|     def redownload(self) -> Redownload: | ||||
| @@ -258,6 +256,10 @@ class Crawler(ABC): | ||||
|     def prev_report(self) -> Optional[Report]: | ||||
|         return self._output_dir.prev_report | ||||
|  | ||||
|     @property | ||||
|     def output_dir(self) -> OutputDirectory: | ||||
|         return self._output_dir | ||||
|  | ||||
|     @staticmethod | ||||
|     async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]: | ||||
|         """ | ||||
| @@ -290,9 +292,40 @@ class Crawler(ABC): | ||||
|         log.explain("Answer: Yes") | ||||
|         return CrawlToken(self._limiter, path) | ||||
|  | ||||
|     def should_try_download( | ||||
|             self, | ||||
|             path: PurePath, | ||||
|             *, | ||||
|             etag_differs: Optional[bool] = None, | ||||
|             mtime: Optional[datetime] = None, | ||||
|             redownload: Optional[Redownload] = None, | ||||
|             on_conflict: Optional[OnConflict] = None, | ||||
|     ) -> bool: | ||||
|         log.explain_topic(f"Decision: Should Download {fmt_path(path)}") | ||||
|  | ||||
|         if self._transformer.transform(path) is None: | ||||
|             log.explain("Answer: No (ignored)") | ||||
|             return False | ||||
|  | ||||
|         should_download = self._output_dir.should_try_download( | ||||
|             path, | ||||
|             etag_differs=etag_differs, | ||||
|             mtime=mtime, | ||||
|             redownload=redownload, | ||||
|             on_conflict=on_conflict | ||||
|         ) | ||||
|         if should_download: | ||||
|             log.explain("Answer: Yes") | ||||
|             return True | ||||
|         else: | ||||
|             log.explain("Answer: No") | ||||
|             return False | ||||
|  | ||||
|     async def download( | ||||
|             self, | ||||
|             path: PurePath, | ||||
|             *, | ||||
|             etag_differs: Optional[bool] = None, | ||||
|             mtime: Optional[datetime] = None, | ||||
|             redownload: Optional[Redownload] = None, | ||||
|             on_conflict: Optional[OnConflict] = None, | ||||
| @@ -307,7 +340,14 @@ class Crawler(ABC): | ||||
|             log.status("[bold bright_black]", "Ignored", fmt_path(path)) | ||||
|             return None | ||||
|  | ||||
|         fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict) | ||||
|         fs_token = await self._output_dir.download( | ||||
|             path, | ||||
|             transformed_path, | ||||
|             etag_differs=etag_differs, | ||||
|             mtime=mtime, | ||||
|             redownload=redownload, | ||||
|             on_conflict=on_conflict | ||||
|         ) | ||||
|         if fs_token is None: | ||||
|             log.explain("Answer: No") | ||||
|             return None | ||||
|   | ||||
| @@ -1,12 +1,14 @@ | ||||
| import asyncio | ||||
| import http.cookies | ||||
| import ssl | ||||
| from datetime import datetime | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Any, Dict, List, Optional | ||||
| from typing import Any, Dict, List, Optional, Tuple, cast | ||||
|  | ||||
| import aiohttp | ||||
| import certifi | ||||
| from aiohttp.client import ClientTimeout | ||||
| from bs4 import Tag | ||||
|  | ||||
| from ..auth import Authenticator | ||||
| from ..config import Config | ||||
| @@ -15,10 +17,12 @@ from ..utils import fmt_real_path | ||||
| from ..version import NAME, VERSION | ||||
| from .crawler import Crawler, CrawlerSection | ||||
|  | ||||
| ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags" | ||||
|  | ||||
|  | ||||
| class HttpCrawlerSection(CrawlerSection): | ||||
|     def http_timeout(self) -> float: | ||||
|         return self.s.getfloat("http_timeout", fallback=20) | ||||
|         return self.s.getfloat("http_timeout", fallback=30) | ||||
|  | ||||
|  | ||||
| class HttpCrawler(Crawler): | ||||
| @@ -169,6 +173,79 @@ class HttpCrawler(Crawler): | ||||
|             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}") | ||||
|             log.warn(str(e)) | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_folder_structure_from_heading_hierarchy(file_link: Tag, drop_h1: bool = False) -> PurePath: | ||||
|         """ | ||||
|         Retrieves the hierarchy of headings associated with the given file link and constructs a folder | ||||
|         structure from them. | ||||
|  | ||||
|         <h1> level headings usually only appear once and serve as the page title, so they would introduce | ||||
|         redundant nesting. To avoid this, <h1> headings are ignored via the drop_h1 parameter. | ||||
|         """ | ||||
|  | ||||
|         def find_associated_headings(tag: Tag, level: int) -> PurePath: | ||||
|             if level == 0 or (level == 1 and drop_h1): | ||||
|                 return PurePath() | ||||
|  | ||||
|             level_heading = cast(Optional[Tag], tag.find_previous(name=f"h{level}")) | ||||
|  | ||||
|             if level_heading is None: | ||||
|                 return find_associated_headings(tag, level - 1) | ||||
|  | ||||
|             folder_name = level_heading.get_text().strip() | ||||
|             return find_associated_headings(level_heading, level - 1) / folder_name | ||||
|  | ||||
|         # start at level <h3> because paragraph-level headings are usually too granular for folder names | ||||
|         return find_associated_headings(file_link, 3) | ||||
|  | ||||
|     def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]: | ||||
|         """ | ||||
|         If available, retrieves the entity tag for a given path which was stored in the previous report. | ||||
|         """ | ||||
|         if not self._output_dir.prev_report: | ||||
|             return None | ||||
|  | ||||
|         etags = self._output_dir.prev_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {} | ||||
|         return etags.get(str(path)) | ||||
|  | ||||
|     def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None: | ||||
|         """ | ||||
|         Adds an entity tag for a given path to the report's custom values. | ||||
|         """ | ||||
|         if not etag: | ||||
|             return | ||||
|  | ||||
|         etags = self._output_dir.report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {} | ||||
|         etags[str(path)] = etag | ||||
|         self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags) | ||||
|  | ||||
|     async def _request_resource_version(self, resource_url: str) -> Tuple[Optional[str], Optional[datetime]]: | ||||
|         """ | ||||
|         Requests the ETag and Last-Modified headers of a resource via a HEAD request. | ||||
|         If no entity tag / modification date can be obtained, the corresponding value will be None. | ||||
|         """ | ||||
|         try: | ||||
|             async with self.session.head(resource_url) as resp: | ||||
|                 if resp.status != 200: | ||||
|                     return None, None | ||||
|  | ||||
|                 etag_header = resp.headers.get("ETag") | ||||
|                 last_modified_header = resp.headers.get("Last-Modified") | ||||
|                 last_modified = None | ||||
|  | ||||
|                 if last_modified_header: | ||||
|                     try: | ||||
|                         # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives | ||||
|                         datetime_format = "%a, %d %b %Y %H:%M:%S GMT" | ||||
|                         last_modified = datetime.strptime(last_modified_header, datetime_format) | ||||
|                     except ValueError: | ||||
|                         # last_modified remains None | ||||
|                         pass | ||||
|  | ||||
|                 return etag_header, last_modified | ||||
|         except aiohttp.ClientError: | ||||
|             return None, None | ||||
|  | ||||
|     async def run(self) -> None: | ||||
|         self._request_count = 0 | ||||
|         self._cookie_jar = aiohttp.CookieJar() | ||||
| @@ -186,7 +263,12 @@ class HttpCrawler(Crawler): | ||||
|                     connect=self._http_timeout, | ||||
|                     sock_connect=self._http_timeout, | ||||
|                     sock_read=self._http_timeout, | ||||
|                 ) | ||||
|                 ), | ||||
|                 # See https://github.com/aio-libs/aiohttp/issues/6626 | ||||
|                 # Without this aiohttp will mangle the redirect header from Shibboleth, invalidating the | ||||
|                 # passed signature. Shibboleth will not accept the broken signature and authentication will | ||||
|                 # fail. | ||||
|                 requote_redirect_url=False | ||||
|         ) as session: | ||||
|             self.session = session | ||||
|             try: | ||||
|   | ||||
| @@ -1,3 +1,9 @@ | ||||
| from .kit_ilias_web_crawler import KitIliasWebCrawler, KitIliasWebCrawlerSection | ||||
| from .kit_ilias_web_crawler import (IliasWebCrawler, IliasWebCrawlerSection, KitIliasWebCrawler, | ||||
|                                     KitIliasWebCrawlerSection) | ||||
|  | ||||
| __all__ = ["KitIliasWebCrawler", "KitIliasWebCrawlerSection"] | ||||
| __all__ = [ | ||||
|     "IliasWebCrawler", | ||||
|     "IliasWebCrawlerSection", | ||||
|     "KitIliasWebCrawler", | ||||
|     "KitIliasWebCrawlerSection", | ||||
| ] | ||||
|   | ||||
							
								
								
									
										40
									
								
								PFERD/crawl/ilias/async_helper.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								PFERD/crawl/ilias/async_helper.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,40 @@ | ||||
| import asyncio | ||||
| from typing import Any, Callable, Optional | ||||
|  | ||||
| import aiohttp | ||||
|  | ||||
| from ...logging import log | ||||
| from ..crawler import AWrapped, CrawlError, CrawlWarning | ||||
|  | ||||
|  | ||||
| def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: | ||||
|     def decorator(f: AWrapped) -> AWrapped: | ||||
|         async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: | ||||
|             last_exception: Optional[BaseException] = None | ||||
|             for round in range(attempts): | ||||
|                 try: | ||||
|                     return await f(*args, **kwargs) | ||||
|                 except aiohttp.ContentTypeError:  # invalid content type | ||||
|                     raise CrawlWarning("ILIAS returned an invalid content type") | ||||
|                 except aiohttp.TooManyRedirects: | ||||
|                     raise CrawlWarning("Got stuck in a redirect loop") | ||||
|                 except aiohttp.ClientPayloadError as e:  # encoding or not enough bytes | ||||
|                     last_exception = e | ||||
|                 except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc. | ||||
|                     last_exception = e | ||||
|                 except asyncio.exceptions.TimeoutError as e:  # explicit http timeouts in HttpCrawler | ||||
|                     last_exception = e | ||||
|                 log.explain_topic(f"Retrying operation {name}. Retries left: {attempts - 1 - round}") | ||||
|                 log.explain(f"Last exception: {last_exception!r}") | ||||
|  | ||||
|             if last_exception: | ||||
|                 message = f"Error in I/O Operation: {last_exception!r}" | ||||
|                 if failure_is_error: | ||||
|                     raise CrawlError(message) from last_exception | ||||
|                 else: | ||||
|                     raise CrawlWarning(message) from last_exception | ||||
|             raise CrawlError("Impossible return in ilias _iorepeat") | ||||
|  | ||||
|         return wrapper  # type: ignore | ||||
|  | ||||
|     return decorator | ||||
| @@ -1,5 +1,5 @@ | ||||
| from enum import Enum | ||||
| from typing import Optional | ||||
| from typing import Optional, cast | ||||
|  | ||||
| import bs4 | ||||
|  | ||||
| @@ -126,6 +126,88 @@ _learning_module_template = """ | ||||
| </html> | ||||
| """ | ||||
|  | ||||
| _forum_thread_template = """ | ||||
| <!DOCTYPE html> | ||||
| <html lang="en"> | ||||
|     <head> | ||||
|         <meta charset="UTF-8"> | ||||
|         <title>ILIAS - Forum: {{name}}</title> | ||||
|         <style> | ||||
|             * { | ||||
|                 box-sizing: border-box; | ||||
|             } | ||||
|             body { | ||||
|                 font-family: 'Open Sans', Verdana, Arial, Helvetica, sans-serif; | ||||
|                 padding: 8px; | ||||
|             } | ||||
|             ul, ol, p { | ||||
|                 margin: 1.2em 0; | ||||
|             } | ||||
|             p { | ||||
|                 margin-top: 8px; | ||||
|                 margin-bottom: 8px; | ||||
|             } | ||||
|             a { | ||||
|                 color: #00876c; | ||||
|                 text-decoration: none; | ||||
|                 cursor: pointer; | ||||
|             } | ||||
|             a:hover { | ||||
|                 text-decoration: underline; | ||||
|             } | ||||
|             body > p:first-child > span:first-child { | ||||
|                 font-size: 1.6em; | ||||
|             } | ||||
|             body > p:first-child > span:first-child ~ span.default { | ||||
|                 display: inline-block; | ||||
|                 font-size: 1.2em; | ||||
|                 padding-bottom: 8px; | ||||
|             } | ||||
|             .ilFrmPostContent { | ||||
|                 margin-top: 8px; | ||||
|                 max-width: 64em; | ||||
|             } | ||||
|             .ilFrmPostContent > *:first-child { | ||||
|                 margin-top: 0px; | ||||
|             } | ||||
|             .ilFrmPostTitle { | ||||
|                 margin-top: 24px; | ||||
|                 color: #00876c; | ||||
|                 font-weight: bold; | ||||
|             } | ||||
|             #ilFrmPostList { | ||||
|                 list-style: none; | ||||
|                 padding-left: 0; | ||||
|             } | ||||
|             li.ilFrmPostRow { | ||||
|                 padding: 3px 0 3px 3px; | ||||
|                 margin-bottom: 24px; | ||||
|                 border-left: 6px solid #dddddd; | ||||
|             } | ||||
|             .ilFrmPostRow > div { | ||||
|                 display: flex; | ||||
|             } | ||||
|             .ilFrmPostImage img { | ||||
|                 margin: 0 !important; | ||||
|                 padding: 6px 9px 9px 6px; | ||||
|             } | ||||
|             .ilUserIcon { | ||||
|                 width: 115px; | ||||
|             } | ||||
|             .small { | ||||
|                 text-decoration: none; | ||||
|                 font-size: 0.75rem; | ||||
|                 color: #6f6f6f; | ||||
|             } | ||||
|         </style> | ||||
|     </head> | ||||
|     <body> | ||||
|     {{heading}} | ||||
|     {{content}} | ||||
|     </body> | ||||
| </html> | ||||
| """.strip()  # noqa: E501 line too long | ||||
|  | ||||
|  | ||||
| def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str: | ||||
|     # Seems to be comments, ignore those. | ||||
| @@ -139,13 +221,13 @@ def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next | ||||
|         </div> | ||||
|     """ | ||||
|     if prev and body.select_one(".ilc_page_lnav_LeftNavigation"): | ||||
|         text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip() | ||||
|         text = cast(bs4.Tag, body.select_one(".ilc_page_lnav_LeftNavigation")).get_text().strip() | ||||
|         left = f'<a href="{prev}">{text}</a>' | ||||
|     else: | ||||
|         left = "<span></span>" | ||||
|  | ||||
|     if next and body.select_one(".ilc_page_rnav_RightNavigation"): | ||||
|         text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip() | ||||
|         text = cast(bs4.Tag, body.select_one(".ilc_page_rnav_RightNavigation")).get_text().strip() | ||||
|         right = f'<a href="{next}">{text}</a>' | ||||
|     else: | ||||
|         right = "<span></span>" | ||||
| @@ -160,8 +242,17 @@ def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next | ||||
|             "{{left}}", left).replace("{{right}}", right).encode()) | ||||
|         ) | ||||
|  | ||||
|     body = body.prettify() | ||||
|     return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name) | ||||
|     body_str = cast(str, body.prettify()) | ||||
|     return _learning_module_template.replace("{{body}}", body_str).replace("{{name}}", name) | ||||
|  | ||||
|  | ||||
| def forum_thread_template(name: str, url: str, heading: bs4.Tag, content: bs4.Tag) -> str: | ||||
|     if title := cast(Optional[bs4.Tag], heading.find(name="b")): | ||||
|         title.wrap(bs4.Tag(name="a", attrs={"href": url})) | ||||
|     return _forum_thread_template \ | ||||
|         .replace("{{name}}", name) \ | ||||
|         .replace("{{heading}}", cast(str, heading.prettify())) \ | ||||
|         .replace("{{content}}", cast(str, content.prettify())) | ||||
|  | ||||
|  | ||||
| class Links(Enum): | ||||
|   | ||||
| @@ -1,3 +1,5 @@ | ||||
| from typing import cast | ||||
|  | ||||
| from bs4 import BeautifulSoup, Comment, Tag | ||||
|  | ||||
| _STYLE_TAG_CONTENT = """ | ||||
| @@ -12,6 +14,13 @@ _STYLE_TAG_CONTENT = """ | ||||
|       font-weight: bold; | ||||
|     } | ||||
|  | ||||
|     .row-flex { | ||||
|       display: flex; | ||||
|     } | ||||
|     .row-flex-wrap { | ||||
|       flex-wrap: wrap; | ||||
|     } | ||||
|  | ||||
|     .accordion-head { | ||||
|       background-color: #f5f7fa; | ||||
|       padding: 0.5rem 0; | ||||
| @@ -30,6 +39,10 @@ _STYLE_TAG_CONTENT = """ | ||||
|       margin: 0.5rem 0; | ||||
|     } | ||||
|  | ||||
|     img { | ||||
|         background-color: white; | ||||
|     } | ||||
|  | ||||
|     body { | ||||
|       padding: 1em; | ||||
|       grid-template-columns: 1fr min(60rem, 90%) 1fr; | ||||
| @@ -47,12 +60,11 @@ _ARTICLE_WORTHY_CLASSES = [ | ||||
| def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup: | ||||
|     head = soup.new_tag("head") | ||||
|     soup.insert(0, head) | ||||
|     # Force UTF-8 encoding | ||||
|     head.append(soup.new_tag("meta", charset="utf-8")) | ||||
|  | ||||
|     simplecss_link: Tag = soup.new_tag("link") | ||||
|     # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css"> | ||||
|     simplecss_link["rel"] = "stylesheet" | ||||
|     simplecss_link["href"] = "https://cdn.simplecss.org/simple.css" | ||||
|     head.append(simplecss_link) | ||||
|     head.append(soup.new_tag("link", rel="stylesheet", href="https://cdn.simplecss.org/simple.css")) | ||||
|  | ||||
|     # Basic style tags for compat | ||||
|     style: Tag = soup.new_tag("style") | ||||
| @@ -63,18 +75,18 @@ def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup: | ||||
|  | ||||
|  | ||||
| def clean(soup: BeautifulSoup) -> BeautifulSoup: | ||||
|     for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES): | ||||
|     for block in cast(list[Tag], soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES)): | ||||
|         block.name = "article" | ||||
|  | ||||
|     for block in soup.find_all("h3"): | ||||
|     for block in cast(list[Tag], soup.find_all("h3")): | ||||
|         block.name = "div" | ||||
|  | ||||
|     for block in soup.find_all("h1"): | ||||
|     for block in cast(list[Tag], soup.find_all("h1")): | ||||
|         block.name = "h3" | ||||
|  | ||||
|     for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"): | ||||
|     for block in cast(list[Tag], soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap")): | ||||
|         block.name = "h3" | ||||
|         block["class"] += ["accordion-head"] | ||||
|         block["class"] += ["accordion-head"]  # type: ignore | ||||
|  | ||||
|     for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"): | ||||
|         children = list(dummy.children) | ||||
| @@ -85,7 +97,12 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: | ||||
|         if isinstance(type(children[0]), Comment): | ||||
|             dummy.decompose() | ||||
|  | ||||
|     for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): | ||||
|     # Delete video figures, as they cannot be internalized anyway | ||||
|     for video in soup.select(".ilc_media_cont_MediaContainerHighlighted .ilPageVideo"): | ||||
|         if figure := video.find_parent("figure"): | ||||
|             figure.decompose() | ||||
|  | ||||
|     for hrule_imposter in cast(list[Tag], soup.find_all(class_="ilc_section_Separator")): | ||||
|         hrule_imposter.insert(0, soup.new_tag("hr")) | ||||
|  | ||||
|     return soup | ||||
|   | ||||
							
								
								
									
										1061
									
								
								PFERD/crawl/ilias/ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1061
									
								
								PFERD/crawl/ilias/ilias_web_crawler.py
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										129
									
								
								PFERD/crawl/ilias/shibboleth_login.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										129
									
								
								PFERD/crawl/ilias/shibboleth_login.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,129 @@ | ||||
| from typing import Any, Optional, cast | ||||
|  | ||||
| import aiohttp | ||||
| import yarl | ||||
| from bs4 import BeautifulSoup, Tag | ||||
|  | ||||
| from ...auth import Authenticator, TfaAuthenticator | ||||
| from ...logging import log | ||||
| from ...utils import soupify | ||||
| from ..crawler import CrawlError | ||||
|  | ||||
|  | ||||
| class ShibbolethLogin: | ||||
|     """ | ||||
|     Login via shibboleth system. | ||||
|     """ | ||||
|  | ||||
|     def __init__( | ||||
|         self, ilias_url: str, authenticator: Authenticator, tfa_authenticator: Optional[Authenticator] | ||||
|     ) -> None: | ||||
|         self._ilias_url = ilias_url | ||||
|         self._auth = authenticator | ||||
|         self._tfa_auth = tfa_authenticator | ||||
|  | ||||
|     async def login(self, sess: aiohttp.ClientSession) -> None: | ||||
|         """ | ||||
|         Performs the ILIAS Shibboleth authentication dance and saves the login | ||||
|         cookies it receives. | ||||
|  | ||||
|         This function should only be called whenever it is detected that you're | ||||
|         not logged in. The cookies obtained should be good for a few minutes, | ||||
|         maybe even an hour or two. | ||||
|         """ | ||||
|  | ||||
|         # Equivalent: Click on "Mit KIT-Account anmelden" button in | ||||
|         # https://ilias.studium.kit.edu/login.php | ||||
|         url = f"{self._ilias_url}/shib_login.php" | ||||
|         async with sess.get(url) as response: | ||||
|             shib_url = response.url | ||||
|             if str(shib_url).startswith(self._ilias_url): | ||||
|                 log.explain( | ||||
|                     "ILIAS recognized our shib token and logged us in in the background, returning" | ||||
|                 ) | ||||
|                 return | ||||
|             soup: BeautifulSoup = soupify(await response.read()) | ||||
|  | ||||
|         # Attempt to login using credentials, if necessary | ||||
|         while not self._login_successful(soup): | ||||
|             # Searching the form here so that this fails before asking for | ||||
|             # credentials rather than after asking. | ||||
|             form = cast(Tag, soup.find("form", {"method": "post"})) | ||||
|             action = cast(str, form["action"]) | ||||
|  | ||||
|             # Equivalent: Enter credentials in | ||||
|             # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO | ||||
|             url = str(shib_url.origin()) + action | ||||
|             username, password = await self._auth.credentials() | ||||
|             data = { | ||||
|                 "_eventId_proceed": "", | ||||
|                 "j_username": username, | ||||
|                 "j_password": password, | ||||
|                 "fudis_web_authn_assertion_input": "", | ||||
|             } | ||||
|             if csrf_token_input := form.find("input", {"name": "csrf_token"}): | ||||
|                 data["csrf_token"] = csrf_token_input["value"]  # type: ignore | ||||
|             soup = await _post(sess, url, data) | ||||
|  | ||||
|             if soup.find(id="attributeRelease"): | ||||
|                 raise CrawlError( | ||||
|                     "ILIAS Shibboleth entitlements changed! " | ||||
|                     "Please log in once in your browser and review them" | ||||
|                 ) | ||||
|  | ||||
|             if self._tfa_required(soup): | ||||
|                 soup = await self._authenticate_tfa(sess, soup, shib_url) | ||||
|  | ||||
|             if not self._login_successful(soup): | ||||
|                 self._auth.invalidate_credentials() | ||||
|  | ||||
|         # Equivalent: Being redirected via JS automatically | ||||
|         # (or clicking "Continue" if you have JS disabled) | ||||
|         relay_state = cast(Tag, soup.find("input", {"name": "RelayState"})) | ||||
|         saml_response = cast(Tag, soup.find("input", {"name": "SAMLResponse"})) | ||||
|         url = form = soup.find("form", {"method": "post"})["action"]  # type: ignore | ||||
|         data = {  # using the info obtained in the while loop above | ||||
|             "RelayState": cast(str, relay_state["value"]), | ||||
|             "SAMLResponse": cast(str, saml_response["value"]), | ||||
|         } | ||||
|         await sess.post(cast(str, url), data=data) | ||||
|  | ||||
|     async def _authenticate_tfa( | ||||
|         self, session: aiohttp.ClientSession, soup: BeautifulSoup, shib_url: yarl.URL | ||||
|     ) -> BeautifulSoup: | ||||
|         if not self._tfa_auth: | ||||
|             self._tfa_auth = TfaAuthenticator("ilias-anon-tfa") | ||||
|  | ||||
|         tfa_token = await self._tfa_auth.password() | ||||
|  | ||||
|         # Find the form the TFA token has to be posted to. The token has already been | ||||
|         # requested above, so a missing form only surfaces after the user was asked. | ||||
|         form = cast(Tag, soup.find("form", {"method": "post"})) | ||||
|         action = cast(str, form["action"]) | ||||
|  | ||||
|         # Equivalent: Enter token in | ||||
|         # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO | ||||
|         url = str(shib_url.origin()) + action | ||||
|         username, password = await self._auth.credentials() | ||||
|         data = { | ||||
|             "_eventId_proceed": "", | ||||
|             "fudis_otp_input": tfa_token, | ||||
|         } | ||||
|         if csrf_token_input := form.find("input", {"name": "csrf_token"}): | ||||
|             data["csrf_token"] = csrf_token_input["value"]  # type: ignore | ||||
|         return await _post(session, url, data) | ||||
|  | ||||
|     @staticmethod | ||||
|     def _login_successful(soup: BeautifulSoup) -> bool: | ||||
|         relay_state = soup.find("input", {"name": "RelayState"}) | ||||
|         saml_response = soup.find("input", {"name": "SAMLResponse"}) | ||||
|         return relay_state is not None and saml_response is not None | ||||
|  | ||||
|     @staticmethod | ||||
|     def _tfa_required(soup: BeautifulSoup) -> bool: | ||||
|         return soup.find(id="fudiscr-form") is not None | ||||
|  | ||||
|  | ||||
| async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: | ||||
|     async with session.post(url, data=data) as response: | ||||
|         return soupify(await response.read()) | ||||
| @@ -1,8 +1,9 @@ | ||||
| import os | ||||
| import re | ||||
| from dataclasses import dataclass | ||||
| from datetime import datetime | ||||
| from pathlib import PurePath | ||||
| from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union | ||||
| from typing import Any, Awaitable, Generator, Iterable, List, Optional, Pattern, Tuple, Union, cast | ||||
| from urllib.parse import urljoin | ||||
|  | ||||
| from bs4 import BeautifulSoup, Tag | ||||
| @@ -31,24 +32,24 @@ class KitIpdCrawlerSection(HttpCrawlerSection): | ||||
|         return re.compile(regex) | ||||
|  | ||||
|  | ||||
| @dataclass(unsafe_hash=True) | ||||
| @dataclass | ||||
| class KitIpdFile: | ||||
|     name: str | ||||
|     url: str | ||||
|  | ||||
|     def explain(self) -> None: | ||||
|         log.explain(f"File {self.name!r} (href={self.url!r})") | ||||
|  | ||||
|  | ||||
| @dataclass | ||||
| class KitIpdFolder: | ||||
|     name: str | ||||
|     files: List[KitIpdFile] | ||||
|     entries: List[Union[KitIpdFile, "KitIpdFolder"]] | ||||
|  | ||||
|     def explain(self) -> None: | ||||
|         log.explain_topic(f"Folder {self.name!r}") | ||||
|         for file in self.files: | ||||
|             log.explain(f"File {file.name!r} (href={file.url!r})") | ||||
|  | ||||
|     def __hash__(self) -> int: | ||||
|         return self.name.__hash__() | ||||
|         for entry in self.entries: | ||||
|             entry.explain() | ||||
|  | ||||
|  | ||||
| class KitIpdCrawler(HttpCrawler): | ||||
| @@ -72,81 +73,96 @@ class KitIpdCrawler(HttpCrawler): | ||||
|  | ||||
|         async with maybe_cl: | ||||
|             for item in await self._fetch_items(): | ||||
|                 item.explain() | ||||
|                 if isinstance(item, KitIpdFolder): | ||||
|                     tasks.append(self._crawl_folder(item)) | ||||
|                     tasks.append(self._crawl_folder(PurePath("."), item)) | ||||
|                 else: | ||||
|                     # Orphan files are placed in the root folder | ||||
|                     tasks.append(self._download_file(PurePath("."), item)) | ||||
|                     log.explain_topic(f"Orphan file {item.name!r} (href={item.url!r})") | ||||
|                     log.explain("Attributing it to root folder") | ||||
|                     # do this here to at least be sequential and not parallel (rate limiting is hard, as the | ||||
|                     # crawl abstraction does not hold for these requests) | ||||
|                     etag, mtime = await self._request_resource_version(item.url) | ||||
|                     tasks.append(self._download_file(PurePath("."), item, etag, mtime)) | ||||
|  | ||||
|         await self.gather(tasks) | ||||
|  | ||||
|     async def _crawl_folder(self, folder: KitIpdFolder) -> None: | ||||
|         path = PurePath(folder.name) | ||||
|     async def _crawl_folder(self, parent: PurePath, folder: KitIpdFolder) -> None: | ||||
|         path = parent / folder.name | ||||
|         if not await self.crawl(path): | ||||
|             return | ||||
|  | ||||
|         tasks = [self._download_file(path, file) for file in folder.files] | ||||
|         tasks = [] | ||||
|         for entry in folder.entries: | ||||
|             if isinstance(entry, KitIpdFolder): | ||||
|                 tasks.append(self._crawl_folder(path, entry)) | ||||
|             else: | ||||
|                 # do this here to at least be sequential and not parallel (rate limiting is hard, as the crawl | ||||
|                 # abstraction does not hold for these requests) | ||||
|                 etag, mtime = await self._request_resource_version(entry.url) | ||||
|                 tasks.append(self._download_file(path, entry, etag, mtime)) | ||||
|  | ||||
|         await self.gather(tasks) | ||||
|  | ||||
|     async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None: | ||||
|     async def _download_file( | ||||
|         self, | ||||
|         parent: PurePath, | ||||
|         file: KitIpdFile, | ||||
|         etag: Optional[str], | ||||
|         mtime: Optional[datetime] | ||||
|     ) -> None: | ||||
|         element_path = parent / file.name | ||||
|         maybe_dl = await self.download(element_path) | ||||
|  | ||||
|         prev_etag = self._get_previous_etag_from_report(element_path) | ||||
|         etag_differs = None if prev_etag is None else prev_etag != etag | ||||
|  | ||||
|         maybe_dl = await self.download(element_path, etag_differs=etag_differs, mtime=mtime) | ||||
|         if not maybe_dl: | ||||
|             # keep storing the known file's etag | ||||
|             if prev_etag: | ||||
|                 self._add_etag_to_report(element_path, prev_etag) | ||||
|             return | ||||
|  | ||||
|         async with maybe_dl as (bar, sink): | ||||
|             await self._stream_from_url(file.url, sink, bar) | ||||
|             await self._stream_from_url(file.url, element_path, sink, bar) | ||||
|  | ||||
|     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: | ||||
|     async def _fetch_items(self) -> Iterable[Union[KitIpdFile, KitIpdFolder]]: | ||||
|         page, url = await self.get_page() | ||||
|         elements: List[Tag] = self._find_file_links(page) | ||||
|         items: Set[Union[KitIpdFile, KitIpdFolder]] = set() | ||||
|  | ||||
|         # do not add unnecessary nesting for a single <h1> heading | ||||
|         drop_h1: bool = len(page.find_all(name="h1")) <= 1 | ||||
|  | ||||
|         folder_tree: KitIpdFolder = KitIpdFolder(".", []) | ||||
|         for element in elements: | ||||
|             folder_label = self._find_folder_label(element) | ||||
|             if folder_label: | ||||
|                 folder = self._extract_folder(folder_label, url) | ||||
|                 if folder not in items: | ||||
|                     items.add(folder) | ||||
|                     folder.explain() | ||||
|             else: | ||||
|                 file = self._extract_file(element, url) | ||||
|                 items.add(file) | ||||
|                 log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") | ||||
|                 log.explain("Attributing it to root folder") | ||||
|             parent = HttpCrawler.get_folder_structure_from_heading_hierarchy(element, drop_h1) | ||||
|             file = self._extract_file(element, url) | ||||
|  | ||||
|         return items | ||||
|             current_folder: KitIpdFolder = folder_tree | ||||
|             for folder_name in parent.parts: | ||||
|                 # helps the type checker to verify that current_folder is indeed a folder | ||||
|                 def subfolders() -> Generator[KitIpdFolder, Any, None]: | ||||
|                     return (entry for entry in current_folder.entries if isinstance(entry, KitIpdFolder)) | ||||
|  | ||||
|     def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder: | ||||
|         files: List[KitIpdFile] = [] | ||||
|         name = folder_tag.getText().strip() | ||||
|                 if not any(entry.name == folder_name for entry in subfolders()): | ||||
|                     current_folder.entries.append(KitIpdFolder(folder_name, [])) | ||||
|                 current_folder = next(entry for entry in subfolders() if entry.name == folder_name) | ||||
|  | ||||
|         container: Tag = folder_tag.findNextSibling(name="table") | ||||
|         for link in self._find_file_links(container): | ||||
|             files.append(self._extract_file(link, url)) | ||||
|             current_folder.entries.append(file) | ||||
|  | ||||
|         return KitIpdFolder(name, files) | ||||
|  | ||||
|     @staticmethod | ||||
|     def _find_folder_label(file_link: Tag) -> Optional[Tag]: | ||||
|         enclosing_table: Tag = file_link.findParent(name="table") | ||||
|         if enclosing_table is None: | ||||
|             return None | ||||
|         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) | ||||
|         return folder_tree.entries | ||||
|  | ||||
|     def _extract_file(self, link: Tag, url: str) -> KitIpdFile: | ||||
|         url = self._abs_url_from_link(url, link) | ||||
|         name = os.path.basename(url) | ||||
|         return KitIpdFile(name, url) | ||||
|  | ||||
|     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: | ||||
|         return tag.findAll(name="a", attrs={"href": self._file_regex}) | ||||
|     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> list[Tag]: | ||||
|         return cast(list[Tag], tag.find_all(name="a", attrs={"href": self._file_regex})) | ||||
|  | ||||
|     def _abs_url_from_link(self, url: str, link_tag: Tag) -> str: | ||||
|         return urljoin(url, link_tag.get("href")) | ||||
|         return urljoin(url, cast(str, link_tag.get("href"))) | ||||
|  | ||||
|     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: | ||||
|     async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None: | ||||
|         async with self.session.get(url, allow_redirects=False) as resp: | ||||
|             if resp.status == 403: | ||||
|                 raise CrawlError("Received a 403. Are you within the KIT network/VPN?") | ||||
| @@ -159,6 +175,8 @@ class KitIpdCrawler(HttpCrawler): | ||||
|  | ||||
|             sink.done() | ||||
|  | ||||
|             self._add_etag_to_report(path, resp.headers.get("ETag")) | ||||
|  | ||||
|     async def get_page(self) -> Tuple[BeautifulSoup, str]: | ||||
|         async with self.session.get(self._url) as request: | ||||
|             # The web page for Algorithmen für Routenplanung contains some | ||||
|   | ||||
| @@ -1,9 +1,8 @@ | ||||
| import asyncio | ||||
| import sys | ||||
| import traceback | ||||
| from contextlib import asynccontextmanager, contextmanager | ||||
| # TODO In Python 3.9 and above, ContextManager is deprecated | ||||
| from typing import AsyncIterator, ContextManager, Iterator, List, Optional | ||||
| from contextlib import AbstractContextManager, asynccontextmanager, contextmanager | ||||
| from typing import AsyncIterator, Iterator, List, Optional | ||||
|  | ||||
| from rich.console import Console, Group | ||||
| from rich.live import Live | ||||
| @@ -261,7 +260,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new | ||||
|             action: str, | ||||
|             text: str, | ||||
|             total: Optional[float] = None, | ||||
|     ) -> ContextManager[ProgressBar]: | ||||
|     ) -> AbstractContextManager[ProgressBar]: | ||||
|         """ | ||||
|         Allows markup in the "style" argument which will be applied to the | ||||
|         "action" string. | ||||
| @@ -277,7 +276,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new | ||||
|             action: str, | ||||
|             text: str, | ||||
|             total: Optional[float] = None, | ||||
|     ) -> ContextManager[ProgressBar]: | ||||
|     ) -> AbstractContextManager[ProgressBar]: | ||||
|         """ | ||||
|         Allows markup in the "style" argument which will be applied to the | ||||
|         "action" string. | ||||
|   | ||||
| @@ -57,6 +57,7 @@ class OnConflict(Enum): | ||||
|  | ||||
| @dataclass | ||||
| class Heuristics: | ||||
|     etag_differs: Optional[bool] | ||||
|     mtime: Optional[datetime] | ||||
|  | ||||
|  | ||||
| @@ -233,8 +234,16 @@ class OutputDirectory: | ||||
|  | ||||
|         remote_newer = None | ||||
|  | ||||
|         # ETag should be a more reliable indicator than mtime, so we check it first | ||||
|         if heuristics.etag_differs is not None: | ||||
|             remote_newer = heuristics.etag_differs | ||||
|             if remote_newer: | ||||
|                 log.explain("Remote file's entity tag differs") | ||||
|             else: | ||||
|                 log.explain("Remote file's entity tag is the same") | ||||
|  | ||||
|         # Python on Windows crashes when faced with timestamps around the unix epoch | ||||
|         if heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): | ||||
|         if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): | ||||
|             mtime = heuristics.mtime | ||||
|             remote_newer = mtime.timestamp() > stat.st_mtime | ||||
|             if remote_newer: | ||||
| @@ -362,10 +371,28 @@ class OutputDirectory: | ||||
|  | ||||
|         raise OutputDirError("Failed to create temporary file") | ||||
|  | ||||
|     def should_try_download( | ||||
|         self, | ||||
|         path: PurePath, | ||||
|         *, | ||||
|         etag_differs: Optional[bool] = None, | ||||
|         mtime: Optional[datetime] = None, | ||||
|         redownload: Optional[Redownload] = None, | ||||
|         on_conflict: Optional[OnConflict] = None, | ||||
|     ) -> bool: | ||||
|         heuristics = Heuristics(etag_differs, mtime) | ||||
|         redownload = self._redownload if redownload is None else redownload | ||||
|         on_conflict = self._on_conflict if on_conflict is None else on_conflict | ||||
|         local_path = self.resolve(path) | ||||
|  | ||||
|         return self._should_download(local_path, heuristics, redownload, on_conflict) | ||||
|  | ||||
|     async def download( | ||||
|             self, | ||||
|             remote_path: PurePath, | ||||
|             path: PurePath, | ||||
|             *, | ||||
|             etag_differs: Optional[bool] = None, | ||||
|             mtime: Optional[datetime] = None, | ||||
|             redownload: Optional[Redownload] = None, | ||||
|             on_conflict: Optional[OnConflict] = None, | ||||
| @@ -375,7 +402,7 @@ class OutputDirectory: | ||||
|         MarkConflictError. | ||||
|         """ | ||||
|  | ||||
|         heuristics = Heuristics(mtime) | ||||
|         heuristics = Heuristics(etag_differs, mtime) | ||||
|         redownload = self._redownload if redownload is None else redownload | ||||
|         on_conflict = self._on_conflict if on_conflict is None else on_conflict | ||||
|         local_path = self.resolve(path) | ||||
| @@ -415,7 +442,6 @@ class OutputDirectory: | ||||
|  | ||||
|     def _update_metadata(self, info: DownloadInfo) -> None: | ||||
|         if mtime := info.heuristics.mtime: | ||||
|             log.explain(f"Setting mtime to {mtime}") | ||||
|             mtimestamp = mtime.timestamp() | ||||
|             os.utime(info.local_path, times=(mtimestamp, mtimestamp)) | ||||
|  | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| from pathlib import Path | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Dict, List, Optional | ||||
|  | ||||
| from rich.markup import escape | ||||
| @@ -168,19 +168,24 @@ class Pferd: | ||||
|             log.report("") | ||||
|             log.report(f"[bold bright_cyan]Report[/] for {escape(name)}") | ||||
|  | ||||
|             def fmt_path_link(relative_path: PurePath) -> str: | ||||
|                 # We need to URL-encode the path because it might contain spaces or special characters | ||||
|                 link = crawler.output_dir.resolve(relative_path).absolute().as_uri() | ||||
|                 return f"[link={link}]{fmt_path(relative_path)}[/link]" | ||||
|  | ||||
|             something_changed = False | ||||
|             for path in sorted(crawler.report.added_files): | ||||
|                 something_changed = True | ||||
|                 log.report(f"  [bold bright_green]Added[/] {fmt_path(path)}") | ||||
|                 log.report(f"  [bold bright_green]Added[/] {fmt_path_link(path)}") | ||||
|             for path in sorted(crawler.report.changed_files): | ||||
|                 something_changed = True | ||||
|                 log.report(f"  [bold bright_yellow]Changed[/] {fmt_path(path)}") | ||||
|                 log.report(f"  [bold bright_yellow]Changed[/] {fmt_path_link(path)}") | ||||
|             for path in sorted(crawler.report.deleted_files): | ||||
|                 something_changed = True | ||||
|                 log.report(f"  [bold bright_magenta]Deleted[/] {fmt_path(path)}") | ||||
|             for path in sorted(crawler.report.not_deleted_files): | ||||
|                 something_changed = True | ||||
|                 log.report_not_deleted(f"  [bold bright_magenta]Not deleted[/] {fmt_path(path)}") | ||||
|                 log.report_not_deleted(f"  [bold bright_magenta]Not deleted[/] {fmt_path_link(path)}") | ||||
|  | ||||
|             for warning in crawler.report.encountered_warnings: | ||||
|                 something_changed = True | ||||
|   | ||||
| @@ -34,15 +34,6 @@ class MarkConflictError(Exception): | ||||
|         self.collides_with = collides_with | ||||
|  | ||||
|  | ||||
| # TODO Use PurePath.is_relative_to when updating to 3.9 | ||||
| def is_relative_to(a: PurePath, b: PurePath) -> bool: | ||||
|     try: | ||||
|         a.relative_to(b) | ||||
|         return True | ||||
|     except ValueError: | ||||
|         return False | ||||
|  | ||||
|  | ||||
| class Report: | ||||
|     """ | ||||
|     A report of a synchronization. Includes all files found by the crawler, as | ||||
| @@ -173,7 +164,7 @@ class Report: | ||||
|             if path == other: | ||||
|                 raise MarkDuplicateError(path) | ||||
|  | ||||
|             if is_relative_to(path, other) or is_relative_to(other, path): | ||||
|             if path.is_relative_to(other) or other.is_relative_to(path): | ||||
|                 raise MarkConflictError(path, other) | ||||
|  | ||||
|         self.known_files.add(path) | ||||
|   | ||||
| @@ -110,6 +110,10 @@ class ExactReTf(Transformation): | ||||
|             except ValueError: | ||||
|                 pass | ||||
|  | ||||
|         named_groups: Dict[str, str] = match.groupdict() | ||||
|         for name, capture in named_groups.items(): | ||||
|             locals_dir[name] = capture | ||||
|  | ||||
|         result = eval(f"f{right!r}", {}, locals_dir) | ||||
|         return Transformed(PurePath(result)) | ||||
|  | ||||
|   | ||||
| @@ -1,2 +1,2 @@ | ||||
| NAME = "PFERD" | ||||
| VERSION = "3.5.0" | ||||
| VERSION = "3.8.2" | ||||
|   | ||||
							
								
								
									
										13
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										13
									
								
								README.md
									
									
									
									
									
								
							| @@ -17,7 +17,7 @@ Binaries for Linux, Windows and Mac can be downloaded directly from the | ||||
|  | ||||
| ### With pip | ||||
|  | ||||
| Ensure you have at least Python 3.9 installed. Run the following command to | ||||
| Ensure you have at least Python 3.11 installed. Run the following command to | ||||
| install PFERD or upgrade it to the latest version: | ||||
|  | ||||
| ``` | ||||
| @@ -56,6 +56,17 @@ Also, you can download most ILIAS pages directly like this: | ||||
| $ pferd kit-ilias-web <url> <output_directory> | ||||
| ``` | ||||
|  | ||||
| PFERD supports other ILIAS instances as well, using the `ilias-web` crawler (see | ||||
| the [config section on `ilias-web`](CONFIG.md#the-ilias-web-crawler) for more | ||||
| detail on the `base-url` and `client-id` parameters): | ||||
|  | ||||
| ``` | ||||
| $ pferd ilias-web \ | ||||
|     --base-url https://ilias.my-university.example \ | ||||
|     --client-id My_University desktop \ | ||||
|     <output_directory> | ||||
| ``` | ||||
|  | ||||
| However, the CLI only lets you download a single thing at a time, and the | ||||
| resulting command can grow long quite quickly. Because of this, PFERD can also | ||||
| be used with a config file. | ||||
|   | ||||
							
								
								
									
										8
									
								
								flake.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										8
									
								
								flake.lock
									
									
									
										generated
									
									
									
								
							| @@ -2,16 +2,16 @@ | ||||
|   "nodes": { | ||||
|     "nixpkgs": { | ||||
|       "locked": { | ||||
|         "lastModified": 1694499547, | ||||
|         "narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=", | ||||
|         "lastModified": 1744440957, | ||||
|         "narHash": "sha256-FHlSkNqFmPxPJvy+6fNLaNeWnF1lZSgqVCl/eWaJRc4=", | ||||
|         "owner": "NixOS", | ||||
|         "repo": "nixpkgs", | ||||
|         "rev": "e5f018cf150e29aac26c61dac0790ea023c46b24", | ||||
|         "rev": "26d499fc9f1d567283d5d56fcf367edd815dba1d", | ||||
|         "type": "github" | ||||
|       }, | ||||
|       "original": { | ||||
|         "owner": "NixOS", | ||||
|         "ref": "nixos-23.05", | ||||
|         "ref": "nixos-24.11", | ||||
|         "repo": "nixpkgs", | ||||
|         "type": "github" | ||||
|       } | ||||
|   | ||||
| @@ -2,7 +2,7 @@ | ||||
|   description = "Tool for downloading course-related files from ILIAS"; | ||||
|  | ||||
|   inputs = { | ||||
|     nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05"; | ||||
|     nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11"; | ||||
|   }; | ||||
|  | ||||
|   outputs = { self, nixpkgs }: | ||||
|   | ||||
							
								
								
									
										11
									
								
								mypy.ini
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								mypy.ini
									
									
									
									
									
								
							| @@ -1,11 +0,0 @@ | ||||
| [mypy] | ||||
| disallow_any_generics = True | ||||
| disallow_untyped_defs = True | ||||
| disallow_incomplete_defs = True | ||||
| no_implicit_optional = True | ||||
| warn_unused_ignores = True | ||||
| warn_unreachable = True | ||||
| show_error_context = True | ||||
|  | ||||
| [mypy-rich.*,bs4,keyring] | ||||
| ignore_missing_imports = True | ||||
| @@ -1,3 +1,42 @@ | ||||
| [build-system] | ||||
| requires = ["setuptools", "wheel"] | ||||
| build-backend = "setuptools.build_meta" | ||||
|  | ||||
| [project] | ||||
| name = "PFERD" | ||||
| dependencies = [ | ||||
|   "aiohttp>=3.8.1", | ||||
|   "beautifulsoup4>=4.10.0", | ||||
|   "rich>=11.0.0", | ||||
|   "keyring>=23.5.0", | ||||
|   "certifi>=2021.10.8" | ||||
| ] | ||||
| dynamic = ["version"] | ||||
| requires-python = ">=3.11" | ||||
|  | ||||
| [project.scripts] | ||||
| pferd = "PFERD.__main__:main" | ||||
|  | ||||
| [tool.setuptools.dynamic] | ||||
| version = {attr = "PFERD.version.VERSION"} | ||||
|  | ||||
| [tool.flake8] | ||||
| max-line-length = 110 | ||||
|  | ||||
| [tool.isort] | ||||
| line_length = 110 | ||||
|  | ||||
| [tool.autopep8] | ||||
| max_line_length = 110 | ||||
| in-place = true | ||||
| recursive = true | ||||
|  | ||||
| [tool.mypy] | ||||
| disallow_any_generics = true | ||||
| disallow_untyped_defs = true | ||||
| disallow_incomplete_defs = true | ||||
| no_implicit_optional = true | ||||
| warn_unused_ignores = true | ||||
| warn_unreachable = true | ||||
| show_error_context = true | ||||
| ignore_missing_imports = true | ||||
|   | ||||
| @@ -1,8 +1,8 @@ | ||||
| #!/usr/bin/env python3 | ||||
|  | ||||
| import argparse | ||||
| import time | ||||
| import re | ||||
| import time | ||||
| from subprocess import run | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -2,5 +2,5 @@ | ||||
|  | ||||
| set -e | ||||
|  | ||||
| mypy PFERD | ||||
| mypy . | ||||
| flake8 PFERD | ||||
|   | ||||
| @@ -2,5 +2,5 @@ | ||||
|  | ||||
| set -e | ||||
|  | ||||
| autopep8 --recursive --in-place PFERD | ||||
| isort PFERD | ||||
| autopep8 . | ||||
| isort . | ||||
|   | ||||
| @@ -13,5 +13,5 @@ pip install --upgrade setuptools | ||||
| pip install --editable . | ||||
|  | ||||
| # Installing tools and type hints | ||||
| pip install --upgrade mypy flake8 autopep8 isort pyinstaller | ||||
| pip install --upgrade mypy flake8 flake8-pyproject autopep8 isort pyinstaller | ||||
| pip install --upgrade types-chardet types-certifi | ||||
|   | ||||
							
								
								
									
										23
									
								
								setup.cfg
									
									
									
									
									
								
							
							
						
						
									
										23
									
								
								setup.cfg
									
									
									
									
									
								
							| @@ -1,23 +0,0 @@ | ||||
| [metadata] | ||||
| name = PFERD | ||||
| version = attr: PFERD.version.VERSION | ||||
|  | ||||
| [options] | ||||
| packages = find: | ||||
| python_requires = >=3.9 | ||||
| install_requires = | ||||
|   aiohttp>=3.8.1 | ||||
|   beautifulsoup4>=4.10.0 | ||||
|   rich>=11.0.0 | ||||
|   keyring>=23.5.0 | ||||
|   certifi>=2021.10.8 | ||||
|  | ||||
| [options.entry_points] | ||||
| console_scripts = | ||||
|   pferd = PFERD.__main__:main | ||||
|  | ||||
| [flake8] | ||||
| max_line_length = 110 | ||||
|  | ||||
| [isort] | ||||
| line_length = 110 | ||||
		Reference in New Issue
	
	Block a user