mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-10-25 11:02:30 +02:00 
			
		
		
		
	Compare commits
	
		
			1 Commits
		
	
	
		
			v3.5.2
			...
			update-che
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | 2d145e7c94 | 
							
								
								
									
										8
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										8
									
								
								.github/workflows/build-and-release.yml
									
									
									
									
										vendored
									
									
								
							| @@ -17,9 +17,9 @@ jobs: | |||||||
|         python: ["3.9"] |         python: ["3.9"] | ||||||
|     steps: |     steps: | ||||||
|  |  | ||||||
|       - uses: actions/checkout@v3 |       - uses: actions/checkout@v2 | ||||||
|  |  | ||||||
|       - uses: actions/setup-python@v4 |       - uses: actions/setup-python@v2 | ||||||
|         with: |         with: | ||||||
|           python-version: ${{ matrix.python }} |           python-version: ${{ matrix.python }} | ||||||
|  |  | ||||||
| @@ -45,7 +45,7 @@ jobs: | |||||||
|         run: mv dist/pferd* dist/pferd-${{ matrix.os }} |         run: mv dist/pferd* dist/pferd-${{ matrix.os }} | ||||||
|  |  | ||||||
|       - name: Upload binary |       - name: Upload binary | ||||||
|         uses: actions/upload-artifact@v3 |         uses: actions/upload-artifact@v2 | ||||||
|         with: |         with: | ||||||
|           name: Binaries |           name: Binaries | ||||||
|           path: dist/pferd-${{ matrix.os }} |           path: dist/pferd-${{ matrix.os }} | ||||||
| @@ -57,7 +57,7 @@ jobs: | |||||||
|     steps: |     steps: | ||||||
|  |  | ||||||
|       - name: Download binaries |       - name: Download binaries | ||||||
|         uses: actions/download-artifact@v3 |         uses: actions/download-artifact@v2 | ||||||
|         with: |         with: | ||||||
|           name: Binaries |           name: Binaries | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -3,7 +3,6 @@ | |||||||
| /PFERD.egg-info/ | /PFERD.egg-info/ | ||||||
| __pycache__/ | __pycache__/ | ||||||
| /.vscode/ | /.vscode/ | ||||||
| /.idea/ |  | ||||||
|  |  | ||||||
| # pyinstaller | # pyinstaller | ||||||
| /pferd.spec | /pferd.spec | ||||||
|   | |||||||
							
								
								
									
										60
									
								
								CHANGELOG.md
									
									
									
									
									
								
							
							
						
						
									
										60
									
								
								CHANGELOG.md
									
									
									
									
									
								
							| @@ -22,67 +22,9 @@ ambiguous situations. | |||||||
|  |  | ||||||
| ## Unreleased | ## Unreleased | ||||||
|  |  | ||||||
| ## 3.5.2 - 2024-04-14 |  | ||||||
|  |  | ||||||
| ### Fixed | ### Fixed | ||||||
| - Crawling of personal desktop with ILIAS 8 | - Forum crawling crashing when parsing empty (= 0 messages) threads | ||||||
| - Crawling of empty personal desktops |  | ||||||
|  |  | ||||||
| ## 3.5.1 - 2024-04-09 |  | ||||||
|  |  | ||||||
| ### Added |  | ||||||
| - Support for ILIAS 8 |  | ||||||
|  |  | ||||||
| ### Fixed |  | ||||||
| - Video name deduplication |  | ||||||
|  |  | ||||||
| ## 3.5.0 - 2023-09-13 |  | ||||||
|  |  | ||||||
| ### Added |  | ||||||
| - `no-delete-prompt-override` conflict resolution strategy |  | ||||||
| - Support for ILIAS learning modules |  | ||||||
| - `show_not_deleted` option to stop printing the "Not Deleted" status or report |  | ||||||
|   message. This combines nicely with the `no-delete-prompt-override` strategy, |  | ||||||
|   causing PFERD to mostly ignore local-only files. |  | ||||||
| - Support for mediacast video listings |  | ||||||
| - Crawling of files in info tab |  | ||||||
|  |  | ||||||
| ### Changed |  | ||||||
| - Remove size suffix for files in content pages |  | ||||||
|  |  | ||||||
| ### Fixed |  | ||||||
| - Crawling of courses with the timeline view as the default tab |  | ||||||
| - Crawling of file and custom opencast cards |  | ||||||
| - Crawling of button cards without descriptions |  | ||||||
| - Abort crawling when encountering an unexpected ilias root page redirect |  | ||||||
| - Sanitize ascii control characters on Windows |  | ||||||
| - Crawling of paginated past meetings |  | ||||||
| - Ignore SCORM learning modules |  | ||||||
|  |  | ||||||
| ## 3.4.3 - 2022-11-29 |  | ||||||
|  |  | ||||||
| ### Added |  | ||||||
| - Missing documentation for `forums` option |  | ||||||
|  |  | ||||||
| ### Changed |  | ||||||
| - Clear up error message shown when multiple paths are found to an element |  | ||||||
|  |  | ||||||
| ### Fixed |  | ||||||
| - IPD crawler unnecessarily appending trailing slashes |  | ||||||
| - Crawling opencast when ILIAS is set to English |  | ||||||
|  |  | ||||||
| ## 3.4.2 - 2022-10-26 |  | ||||||
|  |  | ||||||
| ### Added |  | ||||||
| - Recognize and crawl content pages in cards |  | ||||||
| - Recognize and ignore surveys |  | ||||||
|  |  | ||||||
| ### Fixed |  | ||||||
| - Forum crawling crashing when a thread has no messages at all |  | ||||||
| - Forum crawling crashing when a forum has no threads at all | - Forum crawling crashing when a forum has no threads at all | ||||||
| - Ilias login failing in some cases |  | ||||||
| - Crawling of paginated future meetings |  | ||||||
| - IPD crawler handling of URLs without trailing slash |  | ||||||
|  |  | ||||||
| ## 3.4.1 - 2022-08-17 | ## 3.4.1 - 2022-08-17 | ||||||
|  |  | ||||||
|   | |||||||
| @@ -26,9 +26,6 @@ default values for the other sections. | |||||||
|   `Added ...`) while running a crawler. (Default: `yes`) |   `Added ...`) while running a crawler. (Default: `yes`) | ||||||
| - `report`: Whether PFERD should print a report of added, changed and deleted | - `report`: Whether PFERD should print a report of added, changed and deleted | ||||||
|    local files for all crawlers before exiting. (Default: `yes`) |    local files for all crawlers before exiting. (Default: `yes`) | ||||||
| - `show_not_deleted`: Whether PFERD should print messages in status and report |  | ||||||
|    when a local-only file wasn't deleted. Combines nicely with the |  | ||||||
|    `no-delete-prompt-override` conflict resolution strategy. |  | ||||||
| - `share_cookies`: Whether crawlers should share cookies where applicable. For | - `share_cookies`: Whether crawlers should share cookies where applicable. For | ||||||
|   example, some crawlers share cookies if they crawl the same website using the |   example, some crawlers share cookies if they crawl the same website using the | ||||||
|   same account. (Default: `yes`) |   same account. (Default: `yes`) | ||||||
| @@ -78,9 +75,6 @@ common to all crawlers: | |||||||
|       using `prompt` and always choosing "yes". |       using `prompt` and always choosing "yes". | ||||||
|     - `no-delete`: Never delete local files, but overwrite local files if the |     - `no-delete`: Never delete local files, but overwrite local files if the | ||||||
|       remote file is different. |       remote file is different. | ||||||
|     - `no-delete-prompt-overwrite`: Never delete local files, but prompt to |  | ||||||
|       overwrite local files if the remote file is different. Combines nicely |  | ||||||
|       with the `show_not_deleted` option. |  | ||||||
| - `transform`: Rules for renaming and excluding certain files and directories. | - `transform`: Rules for renaming and excluding certain files and directories. | ||||||
|   For more details, see [this section](#transformation-rules). (Default: empty) |   For more details, see [this section](#transformation-rules). (Default: empty) | ||||||
| - `tasks`: The maximum number of concurrent tasks (such as crawling or | - `tasks`: The maximum number of concurrent tasks (such as crawling or | ||||||
| @@ -187,7 +181,6 @@ script once per day should be fine. | |||||||
|   redirect to the actual URL. Set to a negative value to disable the automatic |   redirect to the actual URL. Set to a negative value to disable the automatic | ||||||
|   redirect. (Default: `-1`) |   redirect. (Default: `-1`) | ||||||
| - `videos`: Whether to download videos. (Default: `no`) | - `videos`: Whether to download videos. (Default: `no`) | ||||||
| - `forums`: Whether to download forum threads. (Default: `no`) |  | ||||||
| - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: | - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: | ||||||
|   `20.0`) |   `20.0`) | ||||||
|  |  | ||||||
| @@ -296,7 +289,7 @@ path matches `SOURCE`, it is renamed to `TARGET`. | |||||||
| Example: `foo/bar --> baz` | Example: `foo/bar --> baz` | ||||||
| - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||||
| - Converts `foo/bar` into `baz` | - Converts `foo/bar` into `baz` | ||||||
| - Converts `foo/bar/wargl` into `baz/wargl` | - Converts `foo/bar/wargl` into `bar/wargl` | ||||||
|  |  | ||||||
| Example: `foo/bar --> !` | Example: `foo/bar --> !` | ||||||
| - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | - Doesn't match `foo`, `a/foo/bar` or `foo/baz` | ||||||
|   | |||||||
							
								
								
									
										3
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								LICENSE
									
									
									
									
									
								
							| @@ -1,6 +1,5 @@ | |||||||
| Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, | Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, | ||||||
|                     TheChristophe, Scriptim, thelukasprobst, Toorero, |                     TheChristophe, Scriptim, thelukasprobst, Toorero | ||||||
|                     Mr-Pine |  | ||||||
|  |  | ||||||
| Permission is hereby granted, free of charge, to any person obtaining a copy of | Permission is hereby granted, free of charge, to any person obtaining a copy of | ||||||
| this software and associated documentation files (the "Software"), to deal in | this software and associated documentation files (the "Software"), to deal in | ||||||
|   | |||||||
| @@ -5,6 +5,8 @@ import os | |||||||
| import sys | import sys | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
|  |  | ||||||
|  | from PFERD.update import check_for_updates | ||||||
|  |  | ||||||
| from .auth import AuthLoadError | from .auth import AuthLoadError | ||||||
| from .cli import PARSER, ParserLoadError, load_default_section | from .cli import PARSER, ParserLoadError, load_default_section | ||||||
| from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError | from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError | ||||||
| @@ -47,8 +49,6 @@ def configure_logging_from_args(args: argparse.Namespace) -> None: | |||||||
|         log.output_explain = args.explain |         log.output_explain = args.explain | ||||||
|     if args.status is not None: |     if args.status is not None: | ||||||
|         log.output_status = args.status |         log.output_status = args.status | ||||||
|     if args.show_not_deleted is not None: |  | ||||||
|         log.output_not_deleted = args.show_not_deleted |  | ||||||
|     if args.report is not None: |     if args.report is not None: | ||||||
|         log.output_report = args.report |         log.output_report = args.report | ||||||
|  |  | ||||||
| @@ -74,8 +74,6 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N | |||||||
|             log.output_status = config.default_section.status() |             log.output_status = config.default_section.status() | ||||||
|         if args.report is None: |         if args.report is None: | ||||||
|             log.output_report = config.default_section.report() |             log.output_report = config.default_section.report() | ||||||
|         if args.show_not_deleted is None: |  | ||||||
|             log.output_not_deleted = config.default_section.show_not_deleted() |  | ||||||
|     except ConfigOptionError as e: |     except ConfigOptionError as e: | ||||||
|         log.error(str(e)) |         log.error(str(e)) | ||||||
|         sys.exit(1) |         sys.exit(1) | ||||||
| @@ -138,6 +136,11 @@ def main() -> None: | |||||||
|             loop.run_until_complete(asyncio.sleep(1)) |             loop.run_until_complete(asyncio.sleep(1)) | ||||||
|             loop.close() |             loop.close() | ||||||
|         else: |         else: | ||||||
|  |             log.explain_topic("Checking for updates") | ||||||
|  |             if not args.skip_update_check: | ||||||
|  |                 asyncio.run(check_for_updates()) | ||||||
|  |             else: | ||||||
|  |                 log.explain("Update check skipped due to configuration option") | ||||||
|             asyncio.run(pferd.run(args.debug_transforms)) |             asyncio.run(pferd.run(args.debug_transforms)) | ||||||
|     except (ConfigOptionError, AuthLoadError) as e: |     except (ConfigOptionError, AuthLoadError) as e: | ||||||
|         log.unlock() |         log.unlock() | ||||||
|   | |||||||
| @@ -151,6 +151,11 @@ PARSER.add_argument( | |||||||
|     action="version", |     action="version", | ||||||
|     version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)", |     version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)", | ||||||
| ) | ) | ||||||
|  | PARSER.add_argument( | ||||||
|  |     "--skip-update-check", | ||||||
|  |     action="store_true", | ||||||
|  |     help="disable automatic update checks at startup" | ||||||
|  | ) | ||||||
| PARSER.add_argument( | PARSER.add_argument( | ||||||
|     "--config", "-c", |     "--config", "-c", | ||||||
|     type=Path, |     type=Path, | ||||||
| @@ -215,11 +220,6 @@ PARSER.add_argument( | |||||||
|     action=BooleanOptionalAction, |     action=BooleanOptionalAction, | ||||||
|     help="whether crawlers should share cookies where applicable" |     help="whether crawlers should share cookies where applicable" | ||||||
| ) | ) | ||||||
| PARSER.add_argument( |  | ||||||
|     "--show-not-deleted", |  | ||||||
|     action=BooleanOptionalAction, |  | ||||||
|     help="print messages in status and report when PFERD did not delete a local only file" |  | ||||||
| ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def load_default_section( | def load_default_section( | ||||||
| @@ -238,8 +238,6 @@ def load_default_section( | |||||||
|         section["report"] = "yes" if args.report else "no" |         section["report"] = "yes" if args.report else "no" | ||||||
|     if args.share_cookies is not None: |     if args.share_cookies is not None: | ||||||
|         section["share_cookies"] = "yes" if args.share_cookies else "no" |         section["share_cookies"] = "yes" if args.share_cookies else "no" | ||||||
|     if args.show_not_deleted is not None: |  | ||||||
|         section["show_not_deleted"] = "yes" if args.show_not_deleted else "no" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| SUBPARSERS = PARSER.add_subparsers(title="crawlers") | SUBPARSERS = PARSER.add_subparsers(title="crawlers") | ||||||
|   | |||||||
| @@ -82,9 +82,6 @@ class DefaultSection(Section): | |||||||
|     def report(self) -> bool: |     def report(self) -> bool: | ||||||
|         return self.s.getboolean("report", fallback=True) |         return self.s.getboolean("report", fallback=True) | ||||||
|  |  | ||||||
|     def show_not_deleted(self) -> bool: |  | ||||||
|         return self.s.getboolean("show_not_deleted", fallback=True) |  | ||||||
|  |  | ||||||
|     def share_cookies(self) -> bool: |     def share_cookies(self) -> bool: | ||||||
|         return self.s.getboolean("share_cookies", fallback=True) |         return self.s.getboolean("share_cookies", fallback=True) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,10 +1,6 @@ | |||||||
| from enum import Enum | from enum import Enum | ||||||
| from typing import Optional | from typing import Optional | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
|  |  | ||||||
| from PFERD.utils import soupify |  | ||||||
|  |  | ||||||
| _link_template_plain = "{{link}}" | _link_template_plain = "{{link}}" | ||||||
| _link_template_fancy = """ | _link_template_fancy = """ | ||||||
| <!DOCTYPE html> | <!DOCTYPE html> | ||||||
| @@ -98,71 +94,6 @@ _link_template_internet_shortcut = """ | |||||||
| URL={{link}} | URL={{link}} | ||||||
| """.strip() | """.strip() | ||||||
|  |  | ||||||
| _learning_module_template = """ |  | ||||||
| <!DOCTYPE html> |  | ||||||
| <html lang="en"> |  | ||||||
|     <head> |  | ||||||
|         <meta charset="UTF-8"> |  | ||||||
|         <title>{{name}}</title> |  | ||||||
|     </head> |  | ||||||
|  |  | ||||||
|     <style> |  | ||||||
|     * { |  | ||||||
|         box-sizing: border-box; |  | ||||||
|     } |  | ||||||
|     .center-flex { |  | ||||||
|         display: flex; |  | ||||||
|         align-items: center; |  | ||||||
|         justify-content: center; |  | ||||||
|     } |  | ||||||
|     .nav { |  | ||||||
|         display: flex; |  | ||||||
|         justify-content: space-between; |  | ||||||
|     } |  | ||||||
|     </style> |  | ||||||
|     <body class="center-flex"> |  | ||||||
| {{body}} |  | ||||||
|     </body> |  | ||||||
| </html> |  | ||||||
| """ |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str: |  | ||||||
|     # Seems to be comments, ignore those. |  | ||||||
|     for elem in body.select(".il-copg-mob-fullscreen-modal"): |  | ||||||
|         elem.decompose() |  | ||||||
|  |  | ||||||
|     nav_template = """ |  | ||||||
|         <div class="nav"> |  | ||||||
|             {{left}} |  | ||||||
|             {{right}} |  | ||||||
|         </div> |  | ||||||
|     """ |  | ||||||
|     if prev and body.select_one(".ilc_page_lnav_LeftNavigation"): |  | ||||||
|         text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip() |  | ||||||
|         left = f'<a href="{prev}">{text}</a>' |  | ||||||
|     else: |  | ||||||
|         left = "<span></span>" |  | ||||||
|  |  | ||||||
|     if next and body.select_one(".ilc_page_rnav_RightNavigation"): |  | ||||||
|         text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip() |  | ||||||
|         right = f'<a href="{next}">{text}</a>' |  | ||||||
|     else: |  | ||||||
|         right = "<span></span>" |  | ||||||
|  |  | ||||||
|     if top_nav := body.select_one(".ilc_page_tnav_TopNavigation"): |  | ||||||
|         top_nav.replace_with( |  | ||||||
|             soupify(nav_template.replace("{{left}}", left).replace("{{right}}", right).encode()) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     if bot_nav := body.select_one(".ilc_page_bnav_BottomNavigation"): |  | ||||||
|         bot_nav.replace_with(soupify(nav_template.replace( |  | ||||||
|             "{{left}}", left).replace("{{right}}", right).encode()) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     body = body.prettify() |  | ||||||
|     return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class Links(Enum): | class Links(Enum): | ||||||
|     IGNORE = "ignore" |     IGNORE = "ignore" | ||||||
| @@ -171,24 +102,24 @@ class Links(Enum): | |||||||
|     INTERNET_SHORTCUT = "internet-shortcut" |     INTERNET_SHORTCUT = "internet-shortcut" | ||||||
|  |  | ||||||
|     def template(self) -> Optional[str]: |     def template(self) -> Optional[str]: | ||||||
|         if self == Links.FANCY: |         if self == self.FANCY: | ||||||
|             return _link_template_fancy |             return _link_template_fancy | ||||||
|         elif self == Links.PLAINTEXT: |         elif self == self.PLAINTEXT: | ||||||
|             return _link_template_plain |             return _link_template_plain | ||||||
|         elif self == Links.INTERNET_SHORTCUT: |         elif self == self.INTERNET_SHORTCUT: | ||||||
|             return _link_template_internet_shortcut |             return _link_template_internet_shortcut | ||||||
|         elif self == Links.IGNORE: |         elif self == self.IGNORE: | ||||||
|             return None |             return None | ||||||
|         raise ValueError("Missing switch case") |         raise ValueError("Missing switch case") | ||||||
|  |  | ||||||
|     def extension(self) -> Optional[str]: |     def extension(self) -> Optional[str]: | ||||||
|         if self == Links.FANCY: |         if self == self.FANCY: | ||||||
|             return ".html" |             return ".html" | ||||||
|         elif self == Links.PLAINTEXT: |         elif self == self.PLAINTEXT: | ||||||
|             return ".txt" |             return ".txt" | ||||||
|         elif self == Links.INTERNET_SHORTCUT: |         elif self == self.INTERNET_SHORTCUT: | ||||||
|             return ".url" |             return ".url" | ||||||
|         elif self == Links.IGNORE: |         elif self == self.IGNORE: | ||||||
|             return None |             return None | ||||||
|         raise ValueError("Missing switch case") |         raise ValueError("Missing switch case") | ||||||
|  |  | ||||||
|   | |||||||
| @@ -82,7 +82,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: | |||||||
|             dummy.decompose() |             dummy.decompose() | ||||||
|         if len(children) > 1: |         if len(children) > 1: | ||||||
|             continue |             continue | ||||||
|         if isinstance(type(children[0]), Comment): |         if type(children[0]) == Comment: | ||||||
|             dummy.decompose() |             dummy.decompose() | ||||||
|  |  | ||||||
|     for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): |     for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): | ||||||
|   | |||||||
| @@ -3,7 +3,7 @@ import re | |||||||
| from dataclasses import dataclass | from dataclasses import dataclass | ||||||
| from datetime import date, datetime, timedelta | from datetime import date, datetime, timedelta | ||||||
| from enum import Enum | from enum import Enum | ||||||
| from typing import Dict, List, Optional, Union, cast | from typing import Dict, List, Optional, Union | ||||||
| from urllib.parse import urljoin, urlparse | from urllib.parse import urljoin, urlparse | ||||||
|  |  | ||||||
| from bs4 import BeautifulSoup, Tag | from bs4 import BeautifulSoup, Tag | ||||||
| @@ -22,18 +22,12 @@ class IliasElementType(Enum): | |||||||
|     FOLDER = "folder" |     FOLDER = "folder" | ||||||
|     FORUM = "forum" |     FORUM = "forum" | ||||||
|     LINK = "link" |     LINK = "link" | ||||||
|     INFO_TAB = "info_tab" |  | ||||||
|     LEARNING_MODULE = "learning_module" |  | ||||||
|     BOOKING = "booking" |     BOOKING = "booking" | ||||||
|     MEETING = "meeting" |     MEETING = "meeting" | ||||||
|     SURVEY = "survey" |     VIDEO = "video" | ||||||
|     SCORM_LEARNING_MODULE = "scorm_learning_module" |     VIDEO_PLAYER = "video_player" | ||||||
|     MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" |     VIDEO_FOLDER = "video_folder" | ||||||
|     MEDIACAST_VIDEO = "mediacast_video" |     VIDEO_FOLDER_MAYBE_PAGINATED = "video_folder_maybe_paginated" | ||||||
|     OPENCAST_VIDEO = "opencast_video" |  | ||||||
|     OPENCAST_VIDEO_PLAYER = "opencast_video_player" |  | ||||||
|     OPENCAST_VIDEO_FOLDER = "opencast_video_folder" |  | ||||||
|     OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED = "opencast_video_folder_maybe_paginated" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @dataclass | @dataclass | ||||||
| @@ -49,8 +43,7 @@ class IliasPageElement: | |||||||
|             r"eid=(?P<id>[0-9a-z\-]+)", |             r"eid=(?P<id>[0-9a-z\-]+)", | ||||||
|             r"file_(?P<id>\d+)", |             r"file_(?P<id>\d+)", | ||||||
|             r"ref_id=(?P<id>\d+)", |             r"ref_id=(?P<id>\d+)", | ||||||
|             r"target=[a-z]+_(?P<id>\d+)", |             r"target=[a-z]+_(?P<id>\d+)" | ||||||
|             r"mm_(?P<id>\d+)" |  | ||||||
|         ] |         ] | ||||||
|  |  | ||||||
|         for regex in regexes: |         for regex in regexes: | ||||||
| @@ -77,14 +70,6 @@ class IliasForumThread: | |||||||
|     mtime: Optional[datetime] |     mtime: Optional[datetime] | ||||||
|  |  | ||||||
|  |  | ||||||
| @dataclass |  | ||||||
| class IliasLearningModulePage: |  | ||||||
|     title: str |  | ||||||
|     content: Tag |  | ||||||
|     next_url: Optional[str] |  | ||||||
|     previous_url: Optional[str] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class IliasPage: | class IliasPage: | ||||||
|  |  | ||||||
|     def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): |     def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): | ||||||
| @@ -93,12 +78,6 @@ class IliasPage: | |||||||
|         self._page_type = source_element.type if source_element else None |         self._page_type = source_element.type if source_element else None | ||||||
|         self._source_name = source_element.name if source_element else "" |         self._source_name = source_element.name if source_element else "" | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def is_root_page(soup: BeautifulSoup) -> bool: |  | ||||||
|         if permalink := IliasPage.get_soup_permalink(soup): |  | ||||||
|             return "goto.php?target=root_" in permalink |  | ||||||
|         return False |  | ||||||
|  |  | ||||||
|     def get_child_elements(self) -> List[IliasPageElement]: |     def get_child_elements(self) -> List[IliasPageElement]: | ||||||
|         """ |         """ | ||||||
|         Return all child page elements you can find here. |         Return all child page elements you can find here. | ||||||
| @@ -106,9 +85,9 @@ class IliasPage: | |||||||
|         if self._is_video_player(): |         if self._is_video_player(): | ||||||
|             log.explain("Page is a video player, extracting URL") |             log.explain("Page is a video player, extracting URL") | ||||||
|             return self._player_to_video() |             return self._player_to_video() | ||||||
|         if self._is_opencast_video_listing(): |         if self._is_video_listing(): | ||||||
|             log.explain("Page is an opencast video listing, searching for elements") |             log.explain("Page is a video listing, searching for elements") | ||||||
|             return self._find_opencast_video_entries() |             return self._find_video_entries() | ||||||
|         if self._is_exercise_file(): |         if self._is_exercise_file(): | ||||||
|             log.explain("Page is an exercise, searching for elements") |             log.explain("Page is an exercise, searching for elements") | ||||||
|             return self._find_exercise_entries() |             return self._find_exercise_entries() | ||||||
| @@ -118,25 +97,9 @@ class IliasPage: | |||||||
|         if self._is_content_page(): |         if self._is_content_page(): | ||||||
|             log.explain("Page is a content page, searching for elements") |             log.explain("Page is a content page, searching for elements") | ||||||
|             return self._find_copa_entries() |             return self._find_copa_entries() | ||||||
|         if self._is_info_tab(): |  | ||||||
|             log.explain("Page is info tab, searching for elements") |  | ||||||
|             return self._find_info_tab_entries() |  | ||||||
|         log.explain("Page is a normal folder, searching for elements") |         log.explain("Page is a normal folder, searching for elements") | ||||||
|         return self._find_normal_entries() |         return self._find_normal_entries() | ||||||
|  |  | ||||||
|     def get_info_tab(self) -> Optional[IliasPageElement]: |  | ||||||
|         tab: Optional[Tag] = self._soup.find( |  | ||||||
|             name="a", |  | ||||||
|             attrs={"href": lambda x: x and "cmdClass=ilinfoscreengui" in x} |  | ||||||
|         ) |  | ||||||
|         if tab is not None: |  | ||||||
|             return IliasPageElement( |  | ||||||
|                 IliasElementType.INFO_TAB, |  | ||||||
|                 self._abs_url_from_link(tab), |  | ||||||
|                 "infos" |  | ||||||
|             ) |  | ||||||
|         return None |  | ||||||
|  |  | ||||||
|     def get_description(self) -> Optional[BeautifulSoup]: |     def get_description(self) -> Optional[BeautifulSoup]: | ||||||
|         def is_interesting_class(name: str) -> bool: |         def is_interesting_class(name: str) -> bool: | ||||||
|             return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] |             return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] | ||||||
| @@ -162,34 +125,6 @@ class IliasPage: | |||||||
|  |  | ||||||
|         return BeautifulSoup(raw_html, "html.parser") |         return BeautifulSoup(raw_html, "html.parser") | ||||||
|  |  | ||||||
|     def get_learning_module_data(self) -> Optional[IliasLearningModulePage]: |  | ||||||
|         if not self._is_learning_module_page(): |  | ||||||
|             return None |  | ||||||
|         content = self._soup.select_one("#ilLMPageContent") |  | ||||||
|         title = self._soup.select_one(".ilc_page_title_PageTitle").getText().strip() |  | ||||||
|         return IliasLearningModulePage( |  | ||||||
|             title=title, |  | ||||||
|             content=content, |  | ||||||
|             next_url=self._find_learning_module_next(), |  | ||||||
|             previous_url=self._find_learning_module_prev() |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def _find_learning_module_next(self) -> Optional[str]: |  | ||||||
|         for link in self._soup.select("a.ilc_page_rnavlink_RightNavigationLink"): |  | ||||||
|             url = self._abs_url_from_link(link) |  | ||||||
|             if "baseClass=ilLMPresentationGUI" not in url: |  | ||||||
|                 continue |  | ||||||
|             return url |  | ||||||
|         return None |  | ||||||
|  |  | ||||||
|     def _find_learning_module_prev(self) -> Optional[str]: |  | ||||||
|         for link in self._soup.select("a.ilc_page_lnavlink_LeftNavigationLink"): |  | ||||||
|             url = self._abs_url_from_link(link) |  | ||||||
|             if "baseClass=ilLMPresentationGUI" not in url: |  | ||||||
|                 continue |  | ||||||
|             return url |  | ||||||
|         return None |  | ||||||
|  |  | ||||||
|     def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: |     def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: | ||||||
|         form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) |         form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) | ||||||
|         if not form: |         if not form: | ||||||
| @@ -198,7 +133,7 @@ class IliasPage: | |||||||
|  |  | ||||||
|         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] |         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] | ||||||
|  |  | ||||||
|         form_data: Dict[str, Union[str, List[str]]] = { |         form_data: Dict[str, Union[str, List[str]]] = { | ||||||
|             "thread_ids[]": thread_ids, |             "thread_ids[]": thread_ids, | ||||||
|             "selected_cmd2": "html", |             "selected_cmd2": "html", | ||||||
|             "select_cmd2": "Ausführen", |             "select_cmd2": "Ausführen", | ||||||
| @@ -216,18 +151,12 @@ class IliasPage: | |||||||
|         if self._is_ilias_opencast_embedding(): |         if self._is_ilias_opencast_embedding(): | ||||||
|             log.explain("Unwrapping opencast embedding") |             log.explain("Unwrapping opencast embedding") | ||||||
|             return self.get_child_elements()[0] |             return self.get_child_elements()[0] | ||||||
|         if self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED: |         if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: | ||||||
|             log.explain("Unwrapping video pagination") |             log.explain("Unwrapping video pagination") | ||||||
|             return self._find_opencast_video_entries_paginated()[0] |             return self._find_video_entries_paginated()[0] | ||||||
|         if self._contains_collapsed_future_meetings(): |         if self._contains_collapsed_future_meetings(): | ||||||
|             log.explain("Requesting *all* future meetings") |             log.explain("Requesting *all* future meetings") | ||||||
|             return self._uncollapse_future_meetings_url() |             return self._uncollapse_future_meetings_url() | ||||||
|         if not self._is_content_tab_selected(): |  | ||||||
|             if self._page_type != IliasElementType.INFO_TAB: |  | ||||||
|                 log.explain("Selecting content tab") |  | ||||||
|                 return self._select_content_page_url() |  | ||||||
|             else: |  | ||||||
|                 log.explain("Crawling info tab, skipping content select") |  | ||||||
|         return None |         return None | ||||||
|  |  | ||||||
|     def _is_forum_page(self) -> bool: |     def _is_forum_page(self) -> bool: | ||||||
| @@ -240,7 +169,7 @@ class IliasPage: | |||||||
|     def _is_video_player(self) -> bool: |     def _is_video_player(self) -> bool: | ||||||
|         return "paella_config_file" in str(self._soup) |         return "paella_config_file" in str(self._soup) | ||||||
|  |  | ||||||
|     def _is_opencast_video_listing(self) -> bool: |     def _is_video_listing(self) -> bool: | ||||||
|         if self._is_ilias_opencast_embedding(): |         if self._is_ilias_opencast_embedding(): | ||||||
|             return True |             return True | ||||||
|  |  | ||||||
| @@ -275,53 +204,21 @@ class IliasPage: | |||||||
|         return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) |         return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) | ||||||
|  |  | ||||||
|     def _is_content_page(self) -> bool: |     def _is_content_page(self) -> bool: | ||||||
|         if link := self.get_permalink(): |         link = self._soup.find(id="current_perma_link") | ||||||
|             return "target=copa_" in link |         if not link: | ||||||
|         return False |  | ||||||
|  |  | ||||||
|     def _is_learning_module_page(self) -> bool: |  | ||||||
|         if link := self.get_permalink(): |  | ||||||
|             return "target=pg_" in link |  | ||||||
|             return False |             return False | ||||||
|  |         return "target=copa_" in link.get("value") | ||||||
|  |  | ||||||
|     def _contains_collapsed_future_meetings(self) -> bool: |     def _contains_collapsed_future_meetings(self) -> bool: | ||||||
|         return self._uncollapse_future_meetings_url() is not None |         return self._uncollapse_future_meetings_url() is not None | ||||||
|  |  | ||||||
|     def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: |     def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: | ||||||
|         element = self._soup.find( |         element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x}) | ||||||
|             "a", |  | ||||||
|             attrs={"href": lambda x: x and ("crs_next_sess=1" in x or "crs_prev_sess=1" in x)} |  | ||||||
|         ) |  | ||||||
|         if not element: |         if not element: | ||||||
|             return None |             return None | ||||||
|         link = self._abs_url_from_link(element) |         link = self._abs_url_from_link(element) | ||||||
|         return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") |         return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") | ||||||
|  |  | ||||||
|     def _is_content_tab_selected(self) -> bool: |  | ||||||
|         return self._select_content_page_url() is None |  | ||||||
|  |  | ||||||
|     def _is_info_tab(self) -> bool: |  | ||||||
|         might_be_info = self._soup.find("form", attrs={"name": lambda x: x == "formInfoScreen"}) is not None |  | ||||||
|         return self._page_type == IliasElementType.INFO_TAB and might_be_info |  | ||||||
|  |  | ||||||
|     def _select_content_page_url(self) -> Optional[IliasPageElement]: |  | ||||||
|         tab = self._soup.find( |  | ||||||
|             id="tab_view_content", |  | ||||||
|             attrs={"class": lambda x: x is not None and "active" not in x} |  | ||||||
|         ) |  | ||||||
|         # Already selected (or not found) |  | ||||||
|         if not tab: |  | ||||||
|             return None |  | ||||||
|         link = tab.find("a") |  | ||||||
|         if link: |  | ||||||
|             link = self._abs_url_from_link(link) |  | ||||||
|             return IliasPageElement(IliasElementType.FOLDER, link, "select content page") |  | ||||||
|  |  | ||||||
|         _unexpected_html_warning() |  | ||||||
|         log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.") |  | ||||||
|         log.warn_contd("PFERD might not find content on the course's main page.") |  | ||||||
|         return None |  | ||||||
|  |  | ||||||
|     def _player_to_video(self) -> List[IliasPageElement]: |     def _player_to_video(self) -> List[IliasPageElement]: | ||||||
|         # Fetch the actual video page. This is a small wrapper page initializing a javscript |         # Fetch the actual video page. This is a small wrapper page initializing a javscript | ||||||
|         # player. Sadly we can not execute that JS. The actual video stream url is nowhere |         # player. Sadly we can not execute that JS. The actual video stream url is nowhere | ||||||
| @@ -345,14 +242,14 @@ class IliasPage: | |||||||
|         # and just fetch the lone video url! |         # and just fetch the lone video url! | ||||||
|         if len(streams) == 1: |         if len(streams) == 1: | ||||||
|             video_url = streams[0]["sources"]["mp4"][0]["src"] |             video_url = streams[0]["sources"]["mp4"][0]["src"] | ||||||
|             return [IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, self._source_name)] |             return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] | ||||||
|  |  | ||||||
|         log.explain(f"Found multiple videos for stream at {self._source_name}") |         log.explain(f"Found multiple videos for stream at {self._source_name}") | ||||||
|         items = [] |         items = [] | ||||||
|         for stream in sorted(streams, key=lambda stream: stream["content"]): |         for stream in sorted(streams, key=lambda stream: stream["content"]): | ||||||
|             full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" |             full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" | ||||||
|             video_url = stream["sources"]["mp4"][0]["src"] |             video_url = stream["sources"]["mp4"][0]["src"] | ||||||
|             items.append(IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, full_name)) |             items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name)) | ||||||
|  |  | ||||||
|         return items |         return items | ||||||
|  |  | ||||||
| @@ -378,10 +275,6 @@ class IliasPage: | |||||||
|             name = _sanitize_path_name(link.text.strip()) |             name = _sanitize_path_name(link.text.strip()) | ||||||
|             url = self._abs_url_from_link(link) |             url = self._abs_url_from_link(link) | ||||||
|  |  | ||||||
|             if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url: |  | ||||||
|                 # Configure button/link does not have anything interesting |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             type = self._find_type_from_link(name, link, url) |             type = self._find_type_from_link(name, link, url) | ||||||
|             if not type: |             if not type: | ||||||
|                 _unexpected_html_warning() |                 _unexpected_html_warning() | ||||||
| @@ -404,8 +297,7 @@ class IliasPage: | |||||||
|  |  | ||||||
|         for link in links: |         for link in links: | ||||||
|             url = self._abs_url_from_link(link) |             url = self._abs_url_from_link(link) | ||||||
|             name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.getText()).strip().replace("\t", "") |             name = _sanitize_path_name(link.getText().strip().replace("\t", "")) | ||||||
|             name = _sanitize_path_name(name) |  | ||||||
|  |  | ||||||
|             if "file_id" not in url: |             if "file_id" not in url: | ||||||
|                 _unexpected_html_warning() |                 _unexpected_html_warning() | ||||||
| @@ -416,24 +308,7 @@ class IliasPage: | |||||||
|  |  | ||||||
|         return items |         return items | ||||||
|  |  | ||||||
|     def _find_info_tab_entries(self) -> List[IliasPageElement]: |     def _find_video_entries(self) -> List[IliasPageElement]: | ||||||
|         items = [] |  | ||||||
|         links: List[Tag] = self._soup.select("a.il_ContainerItemCommand") |  | ||||||
|  |  | ||||||
|         for link in links: |  | ||||||
|             if "cmdClass=ilobjcoursegui" not in link["href"]: |  | ||||||
|                 continue |  | ||||||
|             if "cmd=sendfile" not in link["href"]: |  | ||||||
|                 continue |  | ||||||
|             items.append(IliasPageElement( |  | ||||||
|                 IliasElementType.FILE, |  | ||||||
|                 self._abs_url_from_link(link), |  | ||||||
|                 _sanitize_path_name(link.getText()) |  | ||||||
|             )) |  | ||||||
|  |  | ||||||
|         return items |  | ||||||
|  |  | ||||||
|     def _find_opencast_video_entries(self) -> List[IliasPageElement]: |  | ||||||
|         # ILIAS has three stages for video pages |         # ILIAS has three stages for video pages | ||||||
|         # 1. The initial dummy page without any videos. This page contains the link to the listing |         # 1. The initial dummy page without any videos. This page contains the link to the listing | ||||||
|         # 2. The video listing which might be paginated |         # 2. The video listing which might be paginated | ||||||
| @@ -453,27 +328,27 @@ class IliasPage: | |||||||
|             query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} |             query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} | ||||||
|             url = url_set_query_params(url, query_params) |             url = url_set_query_params(url, query_params) | ||||||
|             log.explain("Found ILIAS video frame page, fetching actual content next") |             log.explain("Found ILIAS video frame page, fetching actual content next") | ||||||
|             return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] |             return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] | ||||||
|  |  | ||||||
|         is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None |         is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None | ||||||
|  |  | ||||||
|         if is_paginated and not self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER: |         if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER: | ||||||
|             # We are in stage 2 - try to break pagination |             # We are in stage 2 - try to break pagination | ||||||
|             return self._find_opencast_video_entries_paginated() |             return self._find_video_entries_paginated() | ||||||
|  |  | ||||||
|         return self._find_opencast_video_entries_no_paging() |         return self._find_video_entries_no_paging() | ||||||
|  |  | ||||||
|     def _find_opencast_video_entries_paginated(self) -> List[IliasPageElement]: |     def _find_video_entries_paginated(self) -> List[IliasPageElement]: | ||||||
|         table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) |         table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) | ||||||
|  |  | ||||||
|         if table_element is None: |         if table_element is None: | ||||||
|             log.warn("Couldn't increase elements per page (table not found). I might miss elements.") |             log.warn("Couldn't increase elements per page (table not found). I might miss elements.") | ||||||
|             return self._find_opencast_video_entries_no_paging() |             return self._find_video_entries_no_paging() | ||||||
|  |  | ||||||
|         id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) |         id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) | ||||||
|         if id_match is None: |         if id_match is None: | ||||||
|             log.warn("Couldn't increase elements per page (table id not found). I might miss elements.") |             log.warn("Couldn't increase elements per page (table id not found). I might miss elements.") | ||||||
|             return self._find_opencast_video_entries_no_paging() |             return self._find_video_entries_no_paging() | ||||||
|  |  | ||||||
|         table_id = id_match.group(1) |         table_id = id_match.group(1) | ||||||
|  |  | ||||||
| @@ -482,25 +357,25 @@ class IliasPage: | |||||||
|         url = url_set_query_params(self._page_url, query_params) |         url = url_set_query_params(self._page_url, query_params) | ||||||
|  |  | ||||||
|         log.explain("Disabled pagination, retrying folder as a new entry") |         log.explain("Disabled pagination, retrying folder as a new entry") | ||||||
|         return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")] |         return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] | ||||||
|  |  | ||||||
|     def _find_opencast_video_entries_no_paging(self) -> List[IliasPageElement]: |     def _find_video_entries_no_paging(self) -> List[IliasPageElement]: | ||||||
|         """ |         """ | ||||||
|         Crawls the "second stage" video page. This page contains the actual video urls. |         Crawls the "second stage" video page. This page contains the actual video urls. | ||||||
|         """ |         """ | ||||||
|         # Video start links are marked with an "Abspielen" link |         # Video start links are marked with an "Abspielen" link | ||||||
|         video_links: List[Tag] = self._soup.findAll( |         video_links: List[Tag] = self._soup.findAll( | ||||||
|             name="a", text=re.compile(r"\s*(Abspielen|Play)\s*") |             name="a", text=re.compile(r"\s*Abspielen\s*") | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|         results: List[IliasPageElement] = [] |         results: List[IliasPageElement] = [] | ||||||
|  |  | ||||||
|         for link in video_links: |         for link in video_links: | ||||||
|             results.append(self._listed_opencast_video_to_element(link)) |             results.append(self._listed_video_to_element(link)) | ||||||
|  |  | ||||||
|         return results |         return results | ||||||
|  |  | ||||||
|     def _listed_opencast_video_to_element(self, link: Tag) -> IliasPageElement: |     def _listed_video_to_element(self, link: Tag) -> IliasPageElement: | ||||||
|         # The link is part of a table with multiple columns, describing metadata. |         # The link is part of a table with multiple columns, describing metadata. | ||||||
|         # 6th or 7th child (1 indexed) is the modification time string. Try to find it |         # 6th or 7th child (1 indexed) is the modification time string. Try to find it | ||||||
|         # by parsing backwards from the end and finding something that looks like a date |         # by parsing backwards from the end and finding something that looks like a date | ||||||
| @@ -511,8 +386,8 @@ class IliasPage: | |||||||
|             modification_string = link.parent.parent.parent.select_one( |             modification_string = link.parent.parent.parent.select_one( | ||||||
|                 f"td.std:nth-child({index})" |                 f"td.std:nth-child({index})" | ||||||
|             ).getText().strip() |             ).getText().strip() | ||||||
|             if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string): |             if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string): | ||||||
|                 modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M") |                 modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") | ||||||
|                 break |                 break | ||||||
|  |  | ||||||
|         if modification_time is None: |         if modification_time is None: | ||||||
| @@ -527,9 +402,7 @@ class IliasPage: | |||||||
|         video_url = self._abs_url_from_link(link) |         video_url = self._abs_url_from_link(link) | ||||||
|  |  | ||||||
|         log.explain(f"Found video {video_name!r} at {video_url}") |         log.explain(f"Found video {video_name!r} at {video_url}") | ||||||
|         return IliasPageElement( |         return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) | ||||||
|             IliasElementType.OPENCAST_VIDEO_PLAYER, video_url, video_name, modification_time |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def _find_exercise_entries(self) -> List[IliasPageElement]: |     def _find_exercise_entries(self) -> List[IliasPageElement]: | ||||||
|         if self._soup.find(id="tab_submission"): |         if self._soup.find(id="tab_submission"): | ||||||
| @@ -611,7 +484,7 @@ class IliasPage: | |||||||
|             file_listings: List[Tag] = container.findAll( |             file_listings: List[Tag] = container.findAll( | ||||||
|                 name="a", |                 name="a", | ||||||
|                 # download links contain the given command class |                 # download links contain the given command class | ||||||
|                 attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()} |                 attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x} | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|             # Add each listing as a new |             # Add each listing as a new | ||||||
| @@ -672,48 +545,9 @@ class IliasPage: | |||||||
|             result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) |             result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) | ||||||
|  |  | ||||||
|         result += self._find_cards() |         result += self._find_cards() | ||||||
|         result += self._find_mediacast_videos() |  | ||||||
|  |  | ||||||
|         return result |         return result | ||||||
|  |  | ||||||
|     def _find_mediacast_videos(self) -> List[IliasPageElement]: |  | ||||||
|         videos: List[IliasPageElement] = [] |  | ||||||
|  |  | ||||||
|         for elem in cast(List[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")): |  | ||||||
|             element_name = _sanitize_path_name( |  | ||||||
|                 elem.select_one(".ilPlayerPreviewDescription").getText().strip() |  | ||||||
|             ) |  | ||||||
|             if not element_name.endswith(".mp4"): |  | ||||||
|                 # just to make sure it has some kinda-alrightish ending |  | ||||||
|                 element_name = element_name + ".mp4" |  | ||||||
|             video_element = elem.find(name="video") |  | ||||||
|             if not video_element: |  | ||||||
|                 _unexpected_html_warning() |  | ||||||
|                 log.warn_contd(f"No <video> element found for mediacast video '{element_name}'") |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             videos.append(IliasPageElement( |  | ||||||
|                 type=IliasElementType.MEDIACAST_VIDEO, |  | ||||||
|                 url=self._abs_url_from_relative(video_element.get("src")), |  | ||||||
|                 name=element_name, |  | ||||||
|                 mtime=self._find_mediacast_video_mtime(elem.findParent(name="td")) |  | ||||||
|             )) |  | ||||||
|  |  | ||||||
|         return videos |  | ||||||
|  |  | ||||||
|     def _find_mediacast_video_mtime(self, enclosing_td: Tag) -> Optional[datetime]: |  | ||||||
|         description_td: Tag = enclosing_td.findPreviousSibling("td") |  | ||||||
|         if not description_td: |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|         meta_tag: Tag = description_td.find_all("p")[-1] |  | ||||||
|         if not meta_tag: |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|         updated_str = meta_tag.getText().strip().replace("\n", " ") |  | ||||||
|         updated_str = re.sub(".+?: ", "", updated_str) |  | ||||||
|         return demangle_date(updated_str) |  | ||||||
|  |  | ||||||
|     def _is_in_expanded_meeting(self, tag: Tag) -> bool: |     def _is_in_expanded_meeting(self, tag: Tag) -> bool: | ||||||
|         """ |         """ | ||||||
|         Returns whether a file is part of an expanded meeting. |         Returns whether a file is part of an expanded meeting. | ||||||
| @@ -850,11 +684,7 @@ class IliasPage: | |||||||
|                 "div", |                 "div", | ||||||
|                 attrs={"class": lambda x: x and "caption" in x}, |                 attrs={"class": lambda x: x and "caption" in x}, | ||||||
|             ) |             ) | ||||||
|             caption_container = caption_parent.find_next_sibling("div") |             description = caption_parent.find_next_sibling("div").getText().strip() | ||||||
|             if caption_container: |  | ||||||
|                 description = caption_container.getText().strip() |  | ||||||
|             else: |  | ||||||
|                 description = None |  | ||||||
|  |  | ||||||
|             if not type: |             if not type: | ||||||
|                 _unexpected_html_warning() |                 _unexpected_html_warning() | ||||||
| @@ -884,8 +714,8 @@ class IliasPage: | |||||||
|  |  | ||||||
|         icon: Tag = card_root.select_one(".il-card-repository-head .icon") |         icon: Tag = card_root.select_one(".il-card-repository-head .icon") | ||||||
|  |  | ||||||
|         if "opencast" in icon["class"] or "xoct" in icon["class"]: |         if "opencast" in icon["class"]: | ||||||
|             return IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED |             return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED | ||||||
|         if "exc" in icon["class"]: |         if "exc" in icon["class"]: | ||||||
|             return IliasElementType.EXERCISE |             return IliasElementType.EXERCISE | ||||||
|         if "webr" in icon["class"]: |         if "webr" in icon["class"]: | ||||||
| @@ -900,14 +730,6 @@ class IliasPage: | |||||||
|             return IliasElementType.TEST |             return IliasElementType.TEST | ||||||
|         if "fold" in icon["class"]: |         if "fold" in icon["class"]: | ||||||
|             return IliasElementType.FOLDER |             return IliasElementType.FOLDER | ||||||
|         if "copa" in icon["class"]: |  | ||||||
|             return IliasElementType.FOLDER |  | ||||||
|         if "svy" in icon["class"]: |  | ||||||
|             return IliasElementType.SURVEY |  | ||||||
|         if "file" in icon["class"]: |  | ||||||
|             return IliasElementType.FILE |  | ||||||
|         if "mcst" in icon["class"]: |  | ||||||
|             return IliasElementType.MEDIACAST_VIDEO_FOLDER |  | ||||||
|  |  | ||||||
|         _unexpected_html_warning() |         _unexpected_html_warning() | ||||||
|         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") |         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") | ||||||
| @@ -946,15 +768,6 @@ class IliasPage: | |||||||
|         if "cmdClass=ilobjtestgui" in parsed_url.query: |         if "cmdClass=ilobjtestgui" in parsed_url.query: | ||||||
|             return IliasElementType.TEST |             return IliasElementType.TEST | ||||||
|  |  | ||||||
|         if "baseClass=ilLMPresentationGUI" in parsed_url.query: |  | ||||||
|             return IliasElementType.LEARNING_MODULE |  | ||||||
|  |  | ||||||
|         if "baseClass=ilMediaCastHandlerGUI" in parsed_url.query: |  | ||||||
|             return IliasElementType.MEDIACAST_VIDEO_FOLDER |  | ||||||
|  |  | ||||||
|         if "baseClass=ilSAHSPresentationGUI" in parsed_url.query: |  | ||||||
|             return IliasElementType.SCORM_LEARNING_MODULE |  | ||||||
|  |  | ||||||
|         # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so |         # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so | ||||||
|         # try to guess it from the image. |         # try to guess it from the image. | ||||||
|  |  | ||||||
| @@ -996,11 +809,7 @@ class IliasPage: | |||||||
|         if img_tag is None: |         if img_tag is None: | ||||||
|             img_tag = found_parent.select_one("img.icon") |             img_tag = found_parent.select_one("img.icon") | ||||||
|  |  | ||||||
|         is_session_expansion_button = found_parent.find( |         if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}): | ||||||
|             "a", |  | ||||||
|             attrs={"href": lambda x: x and ("crs_next_sess=" in x or "crs_prev_sess=" in x)} |  | ||||||
|         ) |  | ||||||
|         if img_tag is None and is_session_expansion_button: |  | ||||||
|             log.explain("Found session expansion button, skipping it as it has no content") |             log.explain("Found session expansion button, skipping it as it has no content") | ||||||
|             return None |             return None | ||||||
|  |  | ||||||
| @@ -1010,7 +819,7 @@ class IliasPage: | |||||||
|             return None |             return None | ||||||
|  |  | ||||||
|         if "opencast" in str(img_tag["alt"]).lower(): |         if "opencast" in str(img_tag["alt"]).lower(): | ||||||
|             return IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED |             return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED | ||||||
|  |  | ||||||
|         if str(img_tag["src"]).endswith("icon_exc.svg"): |         if str(img_tag["src"]).endswith("icon_exc.svg"): | ||||||
|             return IliasElementType.EXERCISE |             return IliasElementType.EXERCISE | ||||||
| @@ -1030,12 +839,6 @@ class IliasPage: | |||||||
|         if str(img_tag["src"]).endswith("icon_tst.svg"): |         if str(img_tag["src"]).endswith("icon_tst.svg"): | ||||||
|             return IliasElementType.TEST |             return IliasElementType.TEST | ||||||
|  |  | ||||||
|         if str(img_tag["src"]).endswith("icon_mcst.svg"): |  | ||||||
|             return IliasElementType.MEDIACAST_VIDEO_FOLDER |  | ||||||
|  |  | ||||||
|         if str(img_tag["src"]).endswith("icon_sahs.svg"): |  | ||||||
|             return IliasElementType.SCORM_LEARNING_MODULE |  | ||||||
|  |  | ||||||
|         return IliasElementType.FOLDER |         return IliasElementType.FOLDER | ||||||
|  |  | ||||||
|     @staticmethod |     @staticmethod | ||||||
| @@ -1065,45 +868,6 @@ class IliasPage: | |||||||
|         rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) |         rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) | ||||||
|         return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name |         return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def is_logged_in(soup: BeautifulSoup) -> bool: |  | ||||||
|         # Normal ILIAS pages |  | ||||||
|         mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") |  | ||||||
|         if mainbar is not None: |  | ||||||
|             login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) |  | ||||||
|             shib_login = soup.find(id="button_shib_login") |  | ||||||
|             return not login_button and not shib_login |  | ||||||
|  |  | ||||||
|         # Personal Desktop |  | ||||||
|         if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): |  | ||||||
|             return True |  | ||||||
|  |  | ||||||
|         # Empty personal desktop has zero (0) markers. Match on the text... |  | ||||||
|         if alert := soup.select_one(".alert-info"): |  | ||||||
|             text = alert.getText().lower() |  | ||||||
|             if "you have not yet selected any favourites" in text: |  | ||||||
|                 return True |  | ||||||
|             if "sie haben aktuell noch keine favoriten ausgewählt" in text: |  | ||||||
|                 return True |  | ||||||
|  |  | ||||||
|         # Video listing embeds do not have complete ILIAS html. Try to match them by |  | ||||||
|         # their video listing table |  | ||||||
|         video_table = soup.find( |  | ||||||
|             recursive=True, |  | ||||||
|             name="table", |  | ||||||
|             attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} |  | ||||||
|         ) |  | ||||||
|         if video_table is not None: |  | ||||||
|             return True |  | ||||||
|         # The individual video player wrapper page has nothing of the above. |  | ||||||
|         # Match it by its playerContainer. |  | ||||||
|         if soup.select_one("#playerContainer") is not None: |  | ||||||
|             return True |  | ||||||
|         return False |  | ||||||
|  |  | ||||||
|     def get_permalink(self) -> Optional[str]: |  | ||||||
|         return IliasPage.get_soup_permalink(self._soup) |  | ||||||
|  |  | ||||||
|     def _abs_url_from_link(self, link_tag: Tag) -> str: |     def _abs_url_from_link(self, link_tag: Tag) -> str: | ||||||
|         """ |         """ | ||||||
|         Create an absolute url from an <a> tag. |         Create an absolute url from an <a> tag. | ||||||
| @@ -1116,13 +880,6 @@ class IliasPage: | |||||||
|         """ |         """ | ||||||
|         return urljoin(self._page_url, relative_url) |         return urljoin(self._page_url, relative_url) | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]: |  | ||||||
|         perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a") |  | ||||||
|         if not perma_link_element or not perma_link_element.get("href"): |  | ||||||
|             return None |  | ||||||
|         return perma_link_element.get("href") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _unexpected_html_warning() -> None: | def _unexpected_html_warning() -> None: | ||||||
|     log.warn("Encountered unexpected HTML structure, ignoring element.") |     log.warn("Encountered unexpected HTML structure, ignoring element.") | ||||||
|   | |||||||
| @@ -1,11 +1,8 @@ | |||||||
| import asyncio | import asyncio | ||||||
| import base64 |  | ||||||
| import os |  | ||||||
| import re | import re | ||||||
| from collections.abc import Awaitable, Coroutine | from collections.abc import Awaitable, Coroutine | ||||||
| from pathlib import PurePath | from pathlib import PurePath | ||||||
| from typing import Any, Callable, Dict, List, Literal, Optional, Set, Union, cast | from typing import Any, Callable, Dict, List, Optional, Set, Union, cast | ||||||
| from urllib.parse import urljoin |  | ||||||
|  |  | ||||||
| import aiohttp | import aiohttp | ||||||
| import yarl | import yarl | ||||||
| @@ -19,10 +16,10 @@ from ...output_dir import FileSink, Redownload | |||||||
| from ...utils import fmt_path, soupify, url_set_query_param | from ...utils import fmt_path, soupify, url_set_query_param | ||||||
| from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical | from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical | ||||||
| from ..http_crawler import HttpCrawler, HttpCrawlerSection | from ..http_crawler import HttpCrawler, HttpCrawlerSection | ||||||
| from .file_templates import Links, learning_module_template | from .file_templates import Links | ||||||
| from .ilias_html_cleaner import clean, insert_base_markup | from .ilias_html_cleaner import clean, insert_base_markup | ||||||
| from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, | from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, | ||||||
|                              IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) |                              _sanitize_path_name, parse_ilias_forum_export) | ||||||
|  |  | ||||||
| TargetType = Union[str, int] | TargetType = Union[str, int] | ||||||
|  |  | ||||||
| @@ -81,25 +78,21 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): | |||||||
|         return self.s.getboolean("forums", fallback=False) |         return self.s.getboolean("forums", fallback=False) | ||||||
|  |  | ||||||
|  |  | ||||||
| _DIRECTORY_PAGES: Set[IliasElementType] = { | _DIRECTORY_PAGES: Set[IliasElementType] = set([ | ||||||
|     IliasElementType.EXERCISE, |     IliasElementType.EXERCISE, | ||||||
|     IliasElementType.EXERCISE_FILES, |     IliasElementType.EXERCISE_FILES, | ||||||
|     IliasElementType.FOLDER, |     IliasElementType.FOLDER, | ||||||
|     IliasElementType.INFO_TAB, |  | ||||||
|     IliasElementType.MEETING, |     IliasElementType.MEETING, | ||||||
|     IliasElementType.MEDIACAST_VIDEO_FOLDER, |     IliasElementType.VIDEO_FOLDER, | ||||||
|     IliasElementType.OPENCAST_VIDEO_FOLDER, |     IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, | ||||||
|     IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, | ]) | ||||||
| } |  | ||||||
|  |  | ||||||
| _VIDEO_ELEMENTS: Set[IliasElementType] = { | _VIDEO_ELEMENTS: Set[IliasElementType] = set([ | ||||||
|     IliasElementType.MEDIACAST_VIDEO_FOLDER, |     IliasElementType.VIDEO, | ||||||
|     IliasElementType.MEDIACAST_VIDEO, |     IliasElementType.VIDEO_PLAYER, | ||||||
|     IliasElementType.OPENCAST_VIDEO, |     IliasElementType.VIDEO_FOLDER, | ||||||
|     IliasElementType.OPENCAST_VIDEO_PLAYER, |     IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, | ||||||
|     IliasElementType.OPENCAST_VIDEO_FOLDER, | ]) | ||||||
|     IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: | def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: | ||||||
| @@ -130,7 +123,6 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla | |||||||
|             raise CrawlError("Impossible return in ilias _iorepeat") |             raise CrawlError("Impossible return in ilias _iorepeat") | ||||||
|  |  | ||||||
|         return wrapper  # type: ignore |         return wrapper  # type: ignore | ||||||
|  |  | ||||||
|     return decorator |     return decorator | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -141,10 +133,6 @@ def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]: | |||||||
|     return _iorepeat(1, name) |     return _iorepeat(1, name) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _get_video_cache_key(element: IliasPageElement) -> str: |  | ||||||
|     return f"ilias-video-cache-{element.id()}" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Crawler control flow: | # Crawler control flow: | ||||||
| # | # | ||||||
| #     crawl_desktop -+ | #     crawl_desktop -+ | ||||||
| @@ -206,7 +194,7 @@ instance's greatest bottleneck. | |||||||
|         self._links = section.links() |         self._links = section.links() | ||||||
|         self._videos = section.videos() |         self._videos = section.videos() | ||||||
|         self._forums = section.forums() |         self._forums = section.forums() | ||||||
|         self._visited_urls: Dict[str, PurePath] = dict() |         self._visited_urls: Set[str] = set() | ||||||
|  |  | ||||||
|     async def _run(self) -> None: |     async def _run(self) -> None: | ||||||
|         if isinstance(self._target, int): |         if isinstance(self._target, int): | ||||||
| @@ -228,7 +216,7 @@ instance's greatest bottleneck. | |||||||
|         await self._crawl_url(root_url, expected_id=course_id) |         await self._crawl_url(root_url, expected_id=course_id) | ||||||
|  |  | ||||||
|     async def _crawl_desktop(self) -> None: |     async def _crawl_desktop(self) -> None: | ||||||
|         appendix = r"ILIAS\Repository\Provider\RepositoryMainBarProvider|mm_pd_sel_items" |         appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items" | ||||||
|         appendix = appendix.encode("ASCII").hex() |         appendix = appendix.encode("ASCII").hex() | ||||||
|         await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix) |         await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix) | ||||||
|  |  | ||||||
| @@ -251,11 +239,11 @@ instance's greatest bottleneck. | |||||||
|  |  | ||||||
|                 # Duplicated code, but the root page is special - we want to avoid fetching it twice! |                 # Duplicated code, but the root page is special - we want to avoid fetching it twice! | ||||||
|                 while next_stage_url: |                 while next_stage_url: | ||||||
|                     soup = await self._get_page(next_stage_url, root_page_allowed=True) |                     soup = await self._get_page(next_stage_url) | ||||||
|  |  | ||||||
|                     if current_parent is None and expected_id is not None: |                     if current_parent is None and expected_id is not None: | ||||||
|                         perma_link = IliasPage.get_soup_permalink(soup) |                         perma_link_element: Tag = soup.find(id="current_perma_link") | ||||||
|                         if not perma_link or "crs_" not in perma_link: |                         if not perma_link_element or "crs_" not in perma_link_element.get("value"): | ||||||
|                             raise CrawlError("Invalid course id? Didn't find anything looking like a course") |                             raise CrawlError("Invalid course id? Didn't find anything looking like a course") | ||||||
|  |  | ||||||
|                     log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") |                     log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") | ||||||
| @@ -268,8 +256,6 @@ instance's greatest bottleneck. | |||||||
|                         next_stage_url = None |                         next_stage_url = None | ||||||
|  |  | ||||||
|                 elements.extend(page.get_child_elements()) |                 elements.extend(page.get_child_elements()) | ||||||
|                 if info_tab := page.get_info_tab(): |  | ||||||
|                     elements.append(info_tab) |  | ||||||
|                 if description_string := page.get_description(): |                 if description_string := page.get_description(): | ||||||
|                     description.append(description_string) |                     description.append(description_string) | ||||||
|  |  | ||||||
| @@ -362,11 +348,9 @@ instance's greatest bottleneck. | |||||||
|     ) -> Optional[Coroutine[Any, Any, None]]: |     ) -> Optional[Coroutine[Any, Any, None]]: | ||||||
|         if element.url in self._visited_urls: |         if element.url in self._visited_urls: | ||||||
|             raise CrawlWarning( |             raise CrawlWarning( | ||||||
|                 f"Found second path to element {element.name!r} at {element.url!r}. " |                 f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath" | ||||||
|                 + f"First path: {fmt_path(self._visited_urls[element.url])}. " |  | ||||||
|                 + f"Second path: {fmt_path(parent_path)}." |  | ||||||
|             ) |             ) | ||||||
|         self._visited_urls[element.url] = parent_path |         self._visited_urls.add(element.url) | ||||||
|  |  | ||||||
|         element_path = PurePath(parent_path, element.name) |         element_path = PurePath(parent_path, element.name) | ||||||
|  |  | ||||||
| @@ -393,41 +377,18 @@ instance's greatest bottleneck. | |||||||
|                 return None |                 return None | ||||||
|             return await self._handle_forum(element, element_path) |             return await self._handle_forum(element, element_path) | ||||||
|         elif element.type == IliasElementType.TEST: |         elif element.type == IliasElementType.TEST: | ||||||
|             log.status( |             log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") | ||||||
|                 "[bold bright_black]", |             log.explain("Tests contain no relevant files") | ||||||
|                 "Ignored", |             log.explain("Answer: No") | ||||||
|                 fmt_path(element_path), |  | ||||||
|                 "[bright_black](tests contain no relevant data)" |  | ||||||
|             ) |  | ||||||
|             return None |             return None | ||||||
|         elif element.type == IliasElementType.SURVEY: |  | ||||||
|             log.status( |  | ||||||
|                 "[bold bright_black]", |  | ||||||
|                 "Ignored", |  | ||||||
|                 fmt_path(element_path), |  | ||||||
|                 "[bright_black](surveys contain no relevant data)" |  | ||||||
|             ) |  | ||||||
|             return None |  | ||||||
|         elif element.type == IliasElementType.SCORM_LEARNING_MODULE: |  | ||||||
|             log.status( |  | ||||||
|                 "[bold bright_black]", |  | ||||||
|                 "Ignored", |  | ||||||
|                 fmt_path(element_path), |  | ||||||
|                 "[bright_black](scorm learning modules are not supported)" |  | ||||||
|             ) |  | ||||||
|             return None |  | ||||||
|         elif element.type == IliasElementType.LEARNING_MODULE: |  | ||||||
|             return await self._handle_learning_module(element, element_path) |  | ||||||
|         elif element.type == IliasElementType.LINK: |         elif element.type == IliasElementType.LINK: | ||||||
|             return await self._handle_link(element, element_path) |             return await self._handle_link(element, element_path) | ||||||
|         elif element.type == IliasElementType.BOOKING: |         elif element.type == IliasElementType.BOOKING: | ||||||
|             return await self._handle_booking(element, element_path) |             return await self._handle_booking(element, element_path) | ||||||
|         elif element.type == IliasElementType.OPENCAST_VIDEO: |         elif element.type == IliasElementType.VIDEO: | ||||||
|             return await self._handle_file(element, element_path) |  | ||||||
|         elif element.type == IliasElementType.OPENCAST_VIDEO_PLAYER: |  | ||||||
|             return await self._handle_opencast_video(element, element_path) |  | ||||||
|         elif element.type == IliasElementType.MEDIACAST_VIDEO: |  | ||||||
|             return await self._handle_file(element, element_path) |             return await self._handle_file(element, element_path) | ||||||
|  |         elif element.type == IliasElementType.VIDEO_PLAYER: | ||||||
|  |             return await self._handle_video(element, element_path) | ||||||
|         elif element.type in _DIRECTORY_PAGES: |         elif element.type in _DIRECTORY_PAGES: | ||||||
|             return await self._handle_ilias_page(element.url, element, element_path) |             return await self._handle_ilias_page(element.url, element, element_path) | ||||||
|         else: |         else: | ||||||
| @@ -544,7 +505,7 @@ instance's greatest bottleneck. | |||||||
|  |  | ||||||
|         raise CrawlError("resolve_link_target failed even after authenticating") |         raise CrawlError("resolve_link_target failed even after authenticating") | ||||||
|  |  | ||||||
|     async def _handle_opencast_video( |     async def _handle_video( | ||||||
|         self, |         self, | ||||||
|         element: IliasPageElement, |         element: IliasPageElement, | ||||||
|         element_path: PurePath, |         element_path: PurePath, | ||||||
| @@ -552,8 +513,8 @@ instance's greatest bottleneck. | |||||||
|         # Copy old mapping as it is likely still relevant |         # Copy old mapping as it is likely still relevant | ||||||
|         if self.prev_report: |         if self.prev_report: | ||||||
|             self.report.add_custom_value( |             self.report.add_custom_value( | ||||||
|                 _get_video_cache_key(element), |                 str(element_path), | ||||||
|                 self.prev_report.get_custom_value(_get_video_cache_key(element)) |                 self.prev_report.get_custom_value(str(element_path)) | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|         # A video might contain other videos, so let's "crawl" the video first |         # A video might contain other videos, so let's "crawl" the video first | ||||||
| @@ -563,69 +524,58 @@ instance's greatest bottleneck. | |||||||
|         # to ensure backwards compatibility. |         # to ensure backwards compatibility. | ||||||
|         maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.ALWAYS) |         maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.ALWAYS) | ||||||
|  |  | ||||||
|         # If we do not want to crawl it (user filter), we can move on |         # If we do not want to crawl it (user filter) or we have every file | ||||||
|         if not maybe_dl: |         # from the cached mapping already, we can ignore this and bail | ||||||
|             return None |         if not maybe_dl or self._all_videos_locally_present(element_path): | ||||||
|  |             # Mark all existing videos as known so they do not get deleted | ||||||
|         # If we have every file from the cached mapping already, we can ignore this and bail |             # during cleanup. We "downloaded" them, just without actually making | ||||||
|         if self._all_opencast_videos_locally_present(element, maybe_dl.path): |             # a network request as we assumed they did not change. | ||||||
|             # Mark all existing videos as known to ensure they do not get deleted during cleanup. |             for video in self._previous_contained_videos(element_path): | ||||||
|             # We "downloaded" them, just without actually making a network request as we assumed |  | ||||||
|             # they did not change. |  | ||||||
|             contained = self._previous_contained_opencast_videos(element, maybe_dl.path) |  | ||||||
|             if len(contained) > 1: |  | ||||||
|                 # Only do this if we threw away the original dl token, |  | ||||||
|                 # to not download single-stream videos twice |  | ||||||
|                 for video in contained: |  | ||||||
|                 await self.download(video) |                 await self.download(video) | ||||||
|  |  | ||||||
|             return None |             return None | ||||||
|  |  | ||||||
|         return self._download_opencast_video(element, maybe_dl) |         return self._download_video(element_path, element, maybe_dl) | ||||||
|  |  | ||||||
|     def _previous_contained_opencast_videos( |     def _previous_contained_videos(self, video_path: PurePath) -> List[PurePath]: | ||||||
|         self, element: IliasPageElement, element_path: PurePath |  | ||||||
|     ) -> List[PurePath]: |  | ||||||
|         if not self.prev_report: |         if not self.prev_report: | ||||||
|             return [] |             return [] | ||||||
|         custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element)) |         custom_value = self.prev_report.get_custom_value(str(video_path)) | ||||||
|         if not custom_value: |         if not custom_value: | ||||||
|             return [] |             return [] | ||||||
|         cached_value = cast(dict[str, Any], custom_value) |         names = cast(List[str], custom_value) | ||||||
|         if "known_paths" not in cached_value or "own_path" not in cached_value: |         folder = video_path.parent | ||||||
|             log.explain(f"'known_paths' or 'own_path' missing from cached value: {cached_value}") |         return [PurePath(folder, name) for name in names] | ||||||
|             return [] |  | ||||||
|         transformed_own_path = self._transformer.transform(element_path) |  | ||||||
|         if cached_value["own_path"] != str(transformed_own_path): |  | ||||||
|             log.explain( |  | ||||||
|                 f"own_path '{transformed_own_path}' does not match cached value: '{cached_value['own_path']}" |  | ||||||
|             ) |  | ||||||
|             return [] |  | ||||||
|         return [PurePath(name) for name in cached_value["known_paths"]] |  | ||||||
|  |  | ||||||
|     def _all_opencast_videos_locally_present(self, element: IliasPageElement, element_path: PurePath) -> bool: |     def _all_videos_locally_present(self, video_path: PurePath) -> bool: | ||||||
|         log.explain_topic(f"Checking local cache for video {fmt_path(element_path)}") |         if contained_videos := self._previous_contained_videos(video_path): | ||||||
|         if contained_videos := self._previous_contained_opencast_videos(element, element_path): |             log.explain_topic(f"Checking local cache for video {video_path.name}") | ||||||
|             log.explain( |             all_found_locally = True | ||||||
|                 f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}" |             for video in contained_videos: | ||||||
|             ) |                 transformed_path = self._to_local_video_path(video) | ||||||
|             if all(self._output_dir.resolve(path).exists() for path in contained_videos): |                 if transformed_path: | ||||||
|                 log.explain("Found all known videos locally, skipping enumeration request") |                     exists_locally = self._output_dir.resolve(transformed_path).exists() | ||||||
|  |                     all_found_locally = all_found_locally and exists_locally | ||||||
|  |             if all_found_locally: | ||||||
|  |                 log.explain("Found all videos locally, skipping enumeration request") | ||||||
|                 return True |                 return True | ||||||
|             log.explain("Missing at least one video, continuing with requests!") |             log.explain("Missing at least one video, continuing with requests!") | ||||||
|         else: |  | ||||||
|             log.explain("No local cache present") |  | ||||||
|         return False |         return False | ||||||
|  |  | ||||||
|  |     def _to_local_video_path(self, path: PurePath) -> Optional[PurePath]: | ||||||
|  |         if transformed := self._transformer.transform(path): | ||||||
|  |             return self._deduplicator.fixup_path(transformed) | ||||||
|  |         return None | ||||||
|  |  | ||||||
|     @anoncritical |     @anoncritical | ||||||
|     @_iorepeat(3, "downloading video") |     @_iorepeat(3, "downloading video") | ||||||
|     async def _download_opencast_video(self, element: IliasPageElement, dl: DownloadToken) -> None: |     async def _download_video( | ||||||
|         def add_to_report(paths: list[str]) -> None: |         self, | ||||||
|             self.report.add_custom_value( |         original_path: PurePath, | ||||||
|                 _get_video_cache_key(element), |         element: IliasPageElement, | ||||||
|                 {"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))} |         dl: DownloadToken | ||||||
|             ) |     ) -> None: | ||||||
|  |         stream_elements: List[IliasPageElement] = [] | ||||||
|         async with dl as (bar, sink): |         async with dl as (bar, sink): | ||||||
|             page = IliasPage(await self._get_page(element.url), element.url, element) |             page = IliasPage(await self._get_page(element.url), element.url, element) | ||||||
|             stream_elements = page.get_child_elements() |             stream_elements = page.get_child_elements() | ||||||
| @@ -636,25 +586,32 @@ instance's greatest bottleneck. | |||||||
|                 log.explain(f"Using single video mode for {element.name}") |                 log.explain(f"Using single video mode for {element.name}") | ||||||
|                 stream_element = stream_elements[0] |                 stream_element = stream_elements[0] | ||||||
|  |  | ||||||
|  |                 transformed_path = self._to_local_video_path(original_path) | ||||||
|  |                 if not transformed_path: | ||||||
|  |                     raise CrawlError(f"Download returned a path but transform did not for {original_path}") | ||||||
|  |  | ||||||
|                 # We do not have a local cache yet |                 # We do not have a local cache yet | ||||||
|  |                 if self._output_dir.resolve(transformed_path).exists(): | ||||||
|  |                     log.explain(f"Video for {element.name} existed locally") | ||||||
|  |                 else: | ||||||
|                     await self._stream_from_url(stream_element.url, sink, bar, is_video=True) |                     await self._stream_from_url(stream_element.url, sink, bar, is_video=True) | ||||||
|                 add_to_report([str(self._transformer.transform(dl.path))]) |                 self.report.add_custom_value(str(original_path), [original_path.name]) | ||||||
|                 return |                 return | ||||||
|  |  | ||||||
|         contained_video_paths: List[str] = [] |         contained_video_paths: List[str] = [] | ||||||
|  |  | ||||||
|         for stream_element in stream_elements: |         for stream_element in stream_elements: | ||||||
|             video_path = dl.path.parent / stream_element.name |             video_path = original_path.parent / stream_element.name | ||||||
|  |             contained_video_paths.append(str(video_path)) | ||||||
|  |  | ||||||
|             maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER) |             maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER) | ||||||
|             if not maybe_dl: |             if not maybe_dl: | ||||||
|                 continue |                 continue | ||||||
|             async with maybe_dl as (bar, sink): |             async with maybe_dl as (bar, sink): | ||||||
|                 log.explain(f"Streaming video from real url {stream_element.url}") |                 log.explain(f"Streaming video from real url {stream_element.url}") | ||||||
|                 contained_video_paths.append(str(self._transformer.transform(maybe_dl.path))) |  | ||||||
|                 await self._stream_from_url(stream_element.url, sink, bar, is_video=True) |                 await self._stream_from_url(stream_element.url, sink, bar, is_video=True) | ||||||
|  |  | ||||||
|         add_to_report(contained_video_paths) |         self.report.add_custom_value(str(original_path), contained_video_paths) | ||||||
|  |  | ||||||
|     async def _handle_file( |     async def _handle_file( | ||||||
|         self, |         self, | ||||||
| @@ -666,8 +623,8 @@ instance's greatest bottleneck. | |||||||
|             return None |             return None | ||||||
|         return self._download_file(element, maybe_dl) |         return self._download_file(element, maybe_dl) | ||||||
|  |  | ||||||
|     @_iorepeat(3, "downloading file") |  | ||||||
|     @anoncritical |     @anoncritical | ||||||
|  |     @_iorepeat(3, "downloading file") | ||||||
|     async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: |     async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: | ||||||
|         assert dl  # The function is only reached when dl is not None |         assert dl  # The function is only reached when dl is not None | ||||||
|         async with dl as (bar, sink): |         async with dl as (bar, sink): | ||||||
| @@ -675,28 +632,12 @@ instance's greatest bottleneck. | |||||||
|  |  | ||||||
|     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: |     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: | ||||||
|         async def try_stream() -> bool: |         async def try_stream() -> bool: | ||||||
|             next_url = url |             async with self.session.get(url, allow_redirects=is_video) as resp: | ||||||
|  |  | ||||||
|             # Normal files redirect to the magazine if we are not authenticated. As files could be HTML, |  | ||||||
|             # we can not match on the content type here. Instead, we disallow redirects and inspect the |  | ||||||
|             # new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume |  | ||||||
|             # our authentication expired. |  | ||||||
|                 if not is_video: |                 if not is_video: | ||||||
|                 async with self.session.get(url, allow_redirects=False) as resp: |                     # Redirect means we weren't authenticated | ||||||
|                     # Redirect to anything except a "sendfile" means we weren't authenticated |  | ||||||
|                     if hdrs.LOCATION in resp.headers: |                     if hdrs.LOCATION in resp.headers: | ||||||
|                         if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]: |  | ||||||
|                         return False |                         return False | ||||||
|                         # Directly follow the redirect to not make a second, unnecessary request |                 # we wanted a video but got HTML | ||||||
|                         next_url = resp.headers[hdrs.LOCATION] |  | ||||||
|  |  | ||||||
|             # Let's try this again and follow redirects |  | ||||||
|             return await fetch_follow_redirects(next_url) |  | ||||||
|  |  | ||||||
|         async def fetch_follow_redirects(file_url: str) -> bool: |  | ||||||
|             async with self.session.get(file_url) as resp: |  | ||||||
|                 # We wanted a video but got HTML => Forbidden, auth expired. Logging in won't really |  | ||||||
|                 # solve that depending on the setup, but it is better than nothing. |  | ||||||
|                 if is_video and "html" in resp.content_type: |                 if is_video and "html" in resp.content_type: | ||||||
|                     return False |                     return False | ||||||
|  |  | ||||||
| @@ -741,7 +682,7 @@ instance's greatest bottleneck. | |||||||
|                 log.explain(f"URL: {next_stage_url}") |                 log.explain(f"URL: {next_stage_url}") | ||||||
|  |  | ||||||
|                 soup = await self._get_page(next_stage_url) |                 soup = await self._get_page(next_stage_url) | ||||||
|                 page = IliasPage(soup, next_stage_url, element) |                 page = IliasPage(soup, next_stage_url, None) | ||||||
|  |  | ||||||
|                 if next := page.get_next_stage_element(): |                 if next := page.get_next_stage_element(): | ||||||
|                     next_stage_url = next.url |                     next_stage_url = next.url | ||||||
| @@ -753,6 +694,7 @@ instance's greatest bottleneck. | |||||||
|                 raise CrawlWarning("Failed to extract forum data") |                 raise CrawlWarning("Failed to extract forum data") | ||||||
|             if download_data.empty: |             if download_data.empty: | ||||||
|                 log.explain("Forum had no threads") |                 log.explain("Forum had no threads") | ||||||
|  |                 elements = [] | ||||||
|                 return |                 return | ||||||
|             html = await self._post_authenticated(download_data.url, download_data.form_data) |             html = await self._post_authenticated(download_data.url, download_data.form_data) | ||||||
|             elements = parse_ilias_forum_export(soupify(html)) |             elements = parse_ilias_forum_export(soupify(html)) | ||||||
| @@ -784,142 +726,12 @@ instance's greatest bottleneck. | |||||||
|             sink.file.write(content.encode("utf-8")) |             sink.file.write(content.encode("utf-8")) | ||||||
|             sink.done() |             sink.done() | ||||||
|  |  | ||||||
|     async def _handle_learning_module( |     async def _get_page(self, url: str) -> BeautifulSoup: | ||||||
|         self, |  | ||||||
|         element: IliasPageElement, |  | ||||||
|         element_path: PurePath, |  | ||||||
|     ) -> Optional[Coroutine[Any, Any, None]]: |  | ||||||
|         maybe_cl = await self.crawl(element_path) |  | ||||||
|         if not maybe_cl: |  | ||||||
|             return None |  | ||||||
|         return self._crawl_learning_module(element, maybe_cl) |  | ||||||
|  |  | ||||||
|     @_iorepeat(3, "crawling learning module") |  | ||||||
|     @anoncritical |  | ||||||
|     async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None: |  | ||||||
|         elements: List[IliasLearningModulePage] = [] |  | ||||||
|  |  | ||||||
|         async with cl: |  | ||||||
|             log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}") |  | ||||||
|             log.explain(f"URL: {element.url}") |  | ||||||
|             soup = await self._get_page(element.url) |  | ||||||
|             page = IliasPage(soup, element.url, element) |  | ||||||
|             if next := page.get_learning_module_data(): |  | ||||||
|                 elements.extend(await self._crawl_learning_module_direction( |  | ||||||
|                     cl.path, next.previous_url, "left", element |  | ||||||
|                 )) |  | ||||||
|                 elements.append(next) |  | ||||||
|                 elements.extend(await self._crawl_learning_module_direction( |  | ||||||
|                     cl.path, next.next_url, "right", element |  | ||||||
|                 )) |  | ||||||
|  |  | ||||||
|         # Reflect their natural ordering in the file names |  | ||||||
|         for index, lm_element in enumerate(elements): |  | ||||||
|             lm_element.title = f"{index:02}_{lm_element.title}" |  | ||||||
|  |  | ||||||
|         tasks: List[Awaitable[None]] = [] |  | ||||||
|         for index, elem in enumerate(elements): |  | ||||||
|             prev_url = elements[index - 1].title if index > 0 else None |  | ||||||
|             next_url = elements[index + 1].title if index < len(elements) - 1 else None |  | ||||||
|             tasks.append(asyncio.create_task( |  | ||||||
|                 self._download_learning_module_page(cl.path, elem, prev_url, next_url) |  | ||||||
|             )) |  | ||||||
|  |  | ||||||
|         # And execute them |  | ||||||
|         await self.gather(tasks) |  | ||||||
|  |  | ||||||
|     async def _crawl_learning_module_direction( |  | ||||||
|         self, |  | ||||||
|         path: PurePath, |  | ||||||
|         start_url: Optional[str], |  | ||||||
|         dir: Union[Literal["left"], Literal["right"]], |  | ||||||
|         parent_element: IliasPageElement |  | ||||||
|     ) -> List[IliasLearningModulePage]: |  | ||||||
|         elements: List[IliasLearningModulePage] = [] |  | ||||||
|  |  | ||||||
|         if not start_url: |  | ||||||
|             return elements |  | ||||||
|  |  | ||||||
|         next_element_url: Optional[str] = start_url |  | ||||||
|         counter = 0 |  | ||||||
|         while next_element_url: |  | ||||||
|             log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})") |  | ||||||
|             log.explain(f"URL: {next_element_url}") |  | ||||||
|             soup = await self._get_page(next_element_url) |  | ||||||
|             page = IliasPage(soup, next_element_url, parent_element) |  | ||||||
|             if next := page.get_learning_module_data(): |  | ||||||
|                 elements.append(next) |  | ||||||
|                 if dir == "left": |  | ||||||
|                     next_element_url = next.previous_url |  | ||||||
|                 else: |  | ||||||
|                     next_element_url = next.next_url |  | ||||||
|             counter += 1 |  | ||||||
|  |  | ||||||
|         return elements |  | ||||||
|  |  | ||||||
|     @anoncritical |  | ||||||
|     @_iorepeat(3, "saving learning module page") |  | ||||||
|     async def _download_learning_module_page( |  | ||||||
|         self, |  | ||||||
|         parent_path: PurePath, |  | ||||||
|         element: IliasLearningModulePage, |  | ||||||
|         prev: Optional[str], |  | ||||||
|         next: Optional[str] |  | ||||||
|     ) -> None: |  | ||||||
|         path = parent_path / (_sanitize_path_name(element.title) + ".html") |  | ||||||
|         maybe_dl = await self.download(path) |  | ||||||
|         if not maybe_dl: |  | ||||||
|             return |  | ||||||
|         my_path = self._transformer.transform(maybe_dl.path) |  | ||||||
|         if not my_path: |  | ||||||
|             return |  | ||||||
|  |  | ||||||
|         if prev: |  | ||||||
|             prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html")) |  | ||||||
|             if prev_p: |  | ||||||
|                 prev = os.path.relpath(prev_p, my_path.parent) |  | ||||||
|             else: |  | ||||||
|                 prev = None |  | ||||||
|         if next: |  | ||||||
|             next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html")) |  | ||||||
|             if next_p: |  | ||||||
|                 next = os.path.relpath(next_p, my_path.parent) |  | ||||||
|             else: |  | ||||||
|                 next = None |  | ||||||
|  |  | ||||||
|         async with maybe_dl as (bar, sink): |  | ||||||
|             content = element.content |  | ||||||
|             content = await self.internalize_images(content) |  | ||||||
|             sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8")) |  | ||||||
|             sink.done() |  | ||||||
|  |  | ||||||
|     async def internalize_images(self, tag: Tag) -> Tag: |  | ||||||
|         """ |  | ||||||
|         Tries to fetch ILIAS images and embed them as base64 data. |  | ||||||
|         """ |  | ||||||
|         log.explain_topic("Internalizing images") |  | ||||||
|         for elem in tag.find_all(recursive=True): |  | ||||||
|             if not isinstance(elem, Tag): |  | ||||||
|                 continue |  | ||||||
|             if elem.name == "img": |  | ||||||
|                 if src := elem.attrs.get("src", None): |  | ||||||
|                     url = urljoin(_ILIAS_URL, src) |  | ||||||
|                     if not url.startswith(_ILIAS_URL): |  | ||||||
|                         continue |  | ||||||
|                     log.explain(f"Internalizing {url!r}") |  | ||||||
|                     img = await self._get_authenticated(url) |  | ||||||
|                     elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode() |  | ||||||
|             if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"): |  | ||||||
|                 # For unknown reasons the protocol seems to be stripped. |  | ||||||
|                 elem.attrs["src"] = "https:" + elem.attrs["src"] |  | ||||||
|         return tag |  | ||||||
|  |  | ||||||
|     async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: |  | ||||||
|         auth_id = await self._current_auth_id() |         auth_id = await self._current_auth_id() | ||||||
|         async with self.session.get(url) as request: |         async with self.session.get(url) as request: | ||||||
|             soup = soupify(await request.read()) |             soup = soupify(await request.read()) | ||||||
|             if IliasPage.is_logged_in(soup): |             if self._is_logged_in(soup): | ||||||
|                 return self._verify_page(soup, url, root_page_allowed) |                 return soup | ||||||
|  |  | ||||||
|         # We weren't authenticated, so try to do that |         # We weren't authenticated, so try to do that | ||||||
|         await self.authenticate(auth_id) |         await self.authenticate(auth_id) | ||||||
| @@ -927,28 +739,15 @@ instance's greatest bottleneck. | |||||||
|         # Retry once after authenticating. If this fails, we will die. |         # Retry once after authenticating. If this fails, we will die. | ||||||
|         async with self.session.get(url) as request: |         async with self.session.get(url) as request: | ||||||
|             soup = soupify(await request.read()) |             soup = soupify(await request.read()) | ||||||
|             if IliasPage.is_logged_in(soup): |             if self._is_logged_in(soup): | ||||||
|                 return self._verify_page(soup, url, root_page_allowed) |  | ||||||
|         raise CrawlError(f"get_page failed even after authenticating on {url!r}") |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: |  | ||||||
|         if IliasPage.is_root_page(soup) and not root_page_allowed: |  | ||||||
|             raise CrawlError( |  | ||||||
|                 "Unexpectedly encountered ILIAS root page. " |  | ||||||
|                 "This usually happens because the ILIAS instance is broken. " |  | ||||||
|                 "If so, wait a day or two and try again. " |  | ||||||
|                 "It could also happen because a crawled element links to the ILIAS root page. " |  | ||||||
|                 "If so, use a transform with a ! as target to ignore the particular element. " |  | ||||||
|                 f"The redirect came from {url}" |  | ||||||
|             ) |  | ||||||
|                 return soup |                 return soup | ||||||
|  |         raise CrawlError("get_page failed even after authenticating") | ||||||
|  |  | ||||||
|     async def _post_authenticated( |     async def _post_authenticated( | ||||||
|         self, |         self, | ||||||
|         url: str, |         url: str, | ||||||
|         data: dict[str, Union[str, List[str]]] |         data: dict[str, Union[str, List[str]]] | ||||||
|     ) -> bytes: |     ) -> BeautifulSoup: | ||||||
|         auth_id = await self._current_auth_id() |         auth_id = await self._current_auth_id() | ||||||
|  |  | ||||||
|         form_data = aiohttp.FormData() |         form_data = aiohttp.FormData() | ||||||
| @@ -968,28 +767,40 @@ instance's greatest bottleneck. | |||||||
|                 return await request.read() |                 return await request.read() | ||||||
|         raise CrawlError("post_authenticated failed even after authenticating") |         raise CrawlError("post_authenticated failed even after authenticating") | ||||||
|  |  | ||||||
|     async def _get_authenticated(self, url: str) -> bytes: |  | ||||||
|         auth_id = await self._current_auth_id() |  | ||||||
|  |  | ||||||
|         async with self.session.get(url, allow_redirects=False) as request: |  | ||||||
|             if request.status == 200: |  | ||||||
|                 return await request.read() |  | ||||||
|  |  | ||||||
|         # We weren't authenticated, so try to do that |  | ||||||
|         await self.authenticate(auth_id) |  | ||||||
|  |  | ||||||
|         # Retry once after authenticating. If this fails, we will die. |  | ||||||
|         async with self.session.get(url, allow_redirects=False) as request: |  | ||||||
|             if request.status == 200: |  | ||||||
|                 return await request.read() |  | ||||||
|         raise CrawlError("get_authenticated failed even after authenticating") |  | ||||||
|  |  | ||||||
|     # We repeat this as the login method in shibboleth doesn't handle I/O errors. |     # We repeat this as the login method in shibboleth doesn't handle I/O errors. | ||||||
|     # Shibboleth is quite reliable as well, the repeat is likely not critical here. |     # Shibboleth is quite reliable as well, the repeat is likely not critical here. | ||||||
|     @ _iorepeat(3, "Login", failure_is_error=True) |     @ _iorepeat(3, "Login", failure_is_error=True) | ||||||
|     async def _authenticate(self) -> None: |     async def _authenticate(self) -> None: | ||||||
|         await self._shibboleth_login.login(self.session) |         await self._shibboleth_login.login(self.session) | ||||||
|  |  | ||||||
|  |     @ staticmethod | ||||||
|  |     def _is_logged_in(soup: BeautifulSoup) -> bool: | ||||||
|  |         # Normal ILIAS pages | ||||||
|  |         mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") | ||||||
|  |         if mainbar is not None: | ||||||
|  |             login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) | ||||||
|  |             shib_login = soup.find(id="button_shib_login") | ||||||
|  |             return not login_button and not shib_login | ||||||
|  |  | ||||||
|  |         # Personal Desktop | ||||||
|  |         if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): | ||||||
|  |             return True | ||||||
|  |  | ||||||
|  |         # Video listing embeds do not have complete ILIAS html. Try to match them by | ||||||
|  |         # their video listing table | ||||||
|  |         video_table = soup.find( | ||||||
|  |             recursive=True, | ||||||
|  |             name="table", | ||||||
|  |             attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} | ||||||
|  |         ) | ||||||
|  |         if video_table is not None: | ||||||
|  |             return True | ||||||
|  |         # The individual video player wrapper page has nothing of the above. | ||||||
|  |         # Match it by its playerContainer. | ||||||
|  |         if soup.select_one("#playerContainer") is not None: | ||||||
|  |             return True | ||||||
|  |         return False | ||||||
|  |  | ||||||
|  |  | ||||||
| class KitShibbolethLogin: | class KitShibbolethLogin: | ||||||
|     """ |     """ | ||||||
| @@ -1136,7 +947,7 @@ async def _shib_post( | |||||||
|         async with session.get(correct_url, allow_redirects=False) as response: |         async with session.get(correct_url, allow_redirects=False) as response: | ||||||
|             location = response.headers.get("location") |             location = response.headers.get("location") | ||||||
|             log.explain(f"Redirected to {location!r} with status {response.status}") |             log.explain(f"Redirected to {location!r} with status {response.status}") | ||||||
|             # If shib still has a valid session, it will directly respond to the request |             # If shib still still has a valid session, it will directly respond to the request | ||||||
|             if location is None: |             if location is None: | ||||||
|                 log.explain("Shib recognized us, returning its response directly") |                 log.explain("Shib recognized us, returning its response directly") | ||||||
|                 return soupify(await response.read()) |                 return soupify(await response.read()) | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ import os | |||||||
| import re | import re | ||||||
| from dataclasses import dataclass | from dataclasses import dataclass | ||||||
| from pathlib import PurePath | from pathlib import PurePath | ||||||
| from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union | from typing import Awaitable, List, Optional, Pattern, Set, Union | ||||||
| from urllib.parse import urljoin | from urllib.parse import urljoin | ||||||
|  |  | ||||||
| from bs4 import BeautifulSoup, Tag | from bs4 import BeautifulSoup, Tag | ||||||
| @@ -99,32 +99,32 @@ class KitIpdCrawler(HttpCrawler): | |||||||
|             await self._stream_from_url(file.url, sink, bar) |             await self._stream_from_url(file.url, sink, bar) | ||||||
|  |  | ||||||
|     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: |     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: | ||||||
|         page, url = await self.get_page() |         page = await self.get_page() | ||||||
|         elements: List[Tag] = self._find_file_links(page) |         elements: List[Tag] = self._find_file_links(page) | ||||||
|         items: Set[Union[KitIpdFile, KitIpdFolder]] = set() |         items: Set[Union[KitIpdFile, KitIpdFolder]] = set() | ||||||
|  |  | ||||||
|         for element in elements: |         for element in elements: | ||||||
|             folder_label = self._find_folder_label(element) |             folder_label = self._find_folder_label(element) | ||||||
|             if folder_label: |             if folder_label: | ||||||
|                 folder = self._extract_folder(folder_label, url) |                 folder = self._extract_folder(folder_label) | ||||||
|                 if folder not in items: |                 if folder not in items: | ||||||
|                     items.add(folder) |                     items.add(folder) | ||||||
|                     folder.explain() |                     folder.explain() | ||||||
|             else: |             else: | ||||||
|                 file = self._extract_file(element, url) |                 file = self._extract_file(element) | ||||||
|                 items.add(file) |                 items.add(file) | ||||||
|                 log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") |                 log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") | ||||||
|                 log.explain("Attributing it to root folder") |                 log.explain("Attributing it to root folder") | ||||||
|  |  | ||||||
|         return items |         return items | ||||||
|  |  | ||||||
|     def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder: |     def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: | ||||||
|         files: List[KitIpdFile] = [] |         files: List[KitIpdFile] = [] | ||||||
|         name = folder_tag.getText().strip() |         name = folder_tag.getText().strip() | ||||||
|  |  | ||||||
|         container: Tag = folder_tag.findNextSibling(name="table") |         container: Tag = folder_tag.findNextSibling(name="table") | ||||||
|         for link in self._find_file_links(container): |         for link in self._find_file_links(container): | ||||||
|             files.append(self._extract_file(link, url)) |             files.append(self._extract_file(link)) | ||||||
|  |  | ||||||
|         return KitIpdFolder(name, files) |         return KitIpdFolder(name, files) | ||||||
|  |  | ||||||
| @@ -135,16 +135,16 @@ class KitIpdCrawler(HttpCrawler): | |||||||
|             return None |             return None | ||||||
|         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) |         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) | ||||||
|  |  | ||||||
|     def _extract_file(self, link: Tag, url: str) -> KitIpdFile: |     def _extract_file(self, link: Tag) -> KitIpdFile: | ||||||
|         url = self._abs_url_from_link(url, link) |         url = self._abs_url_from_link(link) | ||||||
|         name = os.path.basename(url) |         name = os.path.basename(url) | ||||||
|         return KitIpdFile(name, url) |         return KitIpdFile(name, url) | ||||||
|  |  | ||||||
|     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: |     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: | ||||||
|         return tag.findAll(name="a", attrs={"href": self._file_regex}) |         return tag.findAll(name="a", attrs={"href": self._file_regex}) | ||||||
|  |  | ||||||
|     def _abs_url_from_link(self, url: str, link_tag: Tag) -> str: |     def _abs_url_from_link(self, link_tag: Tag) -> str: | ||||||
|         return urljoin(url, link_tag.get("href")) |         return urljoin(self._url, link_tag.get("href")) | ||||||
|  |  | ||||||
|     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: |     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: | ||||||
|         async with self.session.get(url, allow_redirects=False) as resp: |         async with self.session.get(url, allow_redirects=False) as resp: | ||||||
| @@ -159,7 +159,7 @@ class KitIpdCrawler(HttpCrawler): | |||||||
|  |  | ||||||
|             sink.done() |             sink.done() | ||||||
|  |  | ||||||
|     async def get_page(self) -> Tuple[BeautifulSoup, str]: |     async def get_page(self) -> BeautifulSoup: | ||||||
|         async with self.session.get(self._url) as request: |         async with self.session.get(self._url) as request: | ||||||
|             # The web page for Algorithmen für Routenplanung contains some |             # The web page for Algorithmen für Routenplanung contains some | ||||||
|             # weird comments that beautifulsoup doesn't parse correctly. This |             # weird comments that beautifulsoup doesn't parse correctly. This | ||||||
| @@ -167,4 +167,4 @@ class KitIpdCrawler(HttpCrawler): | |||||||
|             # cause issues on other pages. |             # cause issues on other pages. | ||||||
|             content = (await request.read()).decode("utf-8") |             content = (await request.read()).decode("utf-8") | ||||||
|             content = re.sub(r"<!--.*?-->", "", content) |             content = re.sub(r"<!--.*?-->", "", content) | ||||||
|             return soupify(content.encode("utf-8")), str(request.url) |             return soupify(content.encode("utf-8")) | ||||||
|   | |||||||
| @@ -14,7 +14,7 @@ def name_variants(path: PurePath) -> Iterator[PurePath]: | |||||||
|  |  | ||||||
|  |  | ||||||
| class Deduplicator: | class Deduplicator: | ||||||
|     FORBIDDEN_CHARS = '<>:"/\\|?*' + "".join([chr(i) for i in range(0, 32)]) |     FORBIDDEN_CHARS = '<>:"/\\|?*' | ||||||
|     FORBIDDEN_NAMES = { |     FORBIDDEN_NAMES = { | ||||||
|         "CON", "PRN", "AUX", "NUL", |         "CON", "PRN", "AUX", "NUL", | ||||||
|         "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", |         "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", | ||||||
|   | |||||||
| @@ -59,7 +59,6 @@ class Log: | |||||||
|         # Whether different parts of the output are enabled or disabled |         # Whether different parts of the output are enabled or disabled | ||||||
|         self.output_explain = False |         self.output_explain = False | ||||||
|         self.output_status = True |         self.output_status = True | ||||||
|         self.output_not_deleted = True |  | ||||||
|         self.output_report = True |         self.output_report = True | ||||||
|  |  | ||||||
|     def _update_live(self) -> None: |     def _update_live(self) -> None: | ||||||
| @@ -208,17 +207,6 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new | |||||||
|             action = escape(f"{action:<{self.STATUS_WIDTH}}") |             action = escape(f"{action:<{self.STATUS_WIDTH}}") | ||||||
|             self.print(f"{style}{action}[/] {escape(text)} {suffix}") |             self.print(f"{style}{action}[/] {escape(text)} {suffix}") | ||||||
|  |  | ||||||
|     def not_deleted(self, style: str, action: str, text: str, suffix: str = "") -> None: |  | ||||||
|         """ |  | ||||||
|         Print a message for a local only file that wasn't |  | ||||||
|         deleted while crawling. Allows markup in the "style" |  | ||||||
|         argument which will be applied to the "action" string. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         if self.output_status and self.output_not_deleted: |  | ||||||
|             action = escape(f"{action:<{self.STATUS_WIDTH}}") |  | ||||||
|             self.print(f"{style}{action}[/] {escape(text)} {suffix}") |  | ||||||
|  |  | ||||||
|     def report(self, text: str) -> None: |     def report(self, text: str) -> None: | ||||||
|         """ |         """ | ||||||
|         Print a report after crawling. Allows markup. |         Print a report after crawling. Allows markup. | ||||||
| @@ -227,14 +215,6 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new | |||||||
|         if self.output_report: |         if self.output_report: | ||||||
|             self.print(text) |             self.print(text) | ||||||
|  |  | ||||||
|     def report_not_deleted(self, text: str) -> None: |  | ||||||
|         """ |  | ||||||
|         Print a report for a local only file that wasn't deleted after crawling. Allows markup. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         if self.output_report and self.output_not_deleted: |  | ||||||
|             self.print(text) |  | ||||||
|  |  | ||||||
|     @contextmanager |     @contextmanager | ||||||
|     def _bar( |     def _bar( | ||||||
|             self, |             self, | ||||||
|   | |||||||
| @@ -44,7 +44,6 @@ class OnConflict(Enum): | |||||||
|     LOCAL_FIRST = "local-first" |     LOCAL_FIRST = "local-first" | ||||||
|     REMOTE_FIRST = "remote-first" |     REMOTE_FIRST = "remote-first" | ||||||
|     NO_DELETE = "no-delete" |     NO_DELETE = "no-delete" | ||||||
|     NO_DELETE_PROMPT_OVERWRITE = "no-delete-prompt-overwrite" |  | ||||||
|  |  | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def from_string(string: str) -> "OnConflict": |     def from_string(string: str) -> "OnConflict": | ||||||
| @@ -52,7 +51,7 @@ class OnConflict(Enum): | |||||||
|             return OnConflict(string) |             return OnConflict(string) | ||||||
|         except ValueError: |         except ValueError: | ||||||
|             raise ValueError("must be one of 'prompt', 'local-first'," |             raise ValueError("must be one of 'prompt', 'local-first'," | ||||||
|                              " 'remote-first', 'no-delete', 'no-delete-prompt-overwrite'") |                              " 'remote-first', 'no-delete'") | ||||||
|  |  | ||||||
|  |  | ||||||
| @dataclass | @dataclass | ||||||
| @@ -265,7 +264,7 @@ class OutputDirectory: | |||||||
|             on_conflict: OnConflict, |             on_conflict: OnConflict, | ||||||
|             path: PurePath, |             path: PurePath, | ||||||
|     ) -> bool: |     ) -> bool: | ||||||
|         if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: |         if on_conflict == OnConflict.PROMPT: | ||||||
|             async with log.exclusive_output(): |             async with log.exclusive_output(): | ||||||
|                 prompt = f"Replace {fmt_path(path)} with remote file?" |                 prompt = f"Replace {fmt_path(path)} with remote file?" | ||||||
|                 return await prompt_yes_no(prompt, default=False) |                 return await prompt_yes_no(prompt, default=False) | ||||||
| @@ -284,7 +283,7 @@ class OutputDirectory: | |||||||
|             on_conflict: OnConflict, |             on_conflict: OnConflict, | ||||||
|             path: PurePath, |             path: PurePath, | ||||||
|     ) -> bool: |     ) -> bool: | ||||||
|         if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: |         if on_conflict == OnConflict.PROMPT: | ||||||
|             async with log.exclusive_output(): |             async with log.exclusive_output(): | ||||||
|                 prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" |                 prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" | ||||||
|                 return await prompt_yes_no(prompt, default=False) |                 return await prompt_yes_no(prompt, default=False) | ||||||
| @@ -304,7 +303,7 @@ class OutputDirectory: | |||||||
|             path: PurePath, |             path: PurePath, | ||||||
|             parent: PurePath, |             parent: PurePath, | ||||||
|     ) -> bool: |     ) -> bool: | ||||||
|         if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: |         if on_conflict == OnConflict.PROMPT: | ||||||
|             async with log.exclusive_output(): |             async with log.exclusive_output(): | ||||||
|                 prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" |                 prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" | ||||||
|                 return await prompt_yes_no(prompt, default=False) |                 return await prompt_yes_no(prompt, default=False) | ||||||
| @@ -331,7 +330,7 @@ class OutputDirectory: | |||||||
|             return False |             return False | ||||||
|         elif on_conflict == OnConflict.REMOTE_FIRST: |         elif on_conflict == OnConflict.REMOTE_FIRST: | ||||||
|             return True |             return True | ||||||
|         elif on_conflict in {OnConflict.NO_DELETE, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: |         elif on_conflict == OnConflict.NO_DELETE: | ||||||
|             return False |             return False | ||||||
|  |  | ||||||
|         # This should never be reached |         # This should never be reached | ||||||
| @@ -496,7 +495,7 @@ class OutputDirectory: | |||||||
|             except OSError: |             except OSError: | ||||||
|                 pass |                 pass | ||||||
|         else: |         else: | ||||||
|             log.not_deleted("[bold bright_magenta]", "Not deleted", fmt_path(pure)) |             log.status("[bold bright_magenta]", "Not deleted", fmt_path(pure)) | ||||||
|             self._report.not_delete_file(pure) |             self._report.not_delete_file(pure) | ||||||
|  |  | ||||||
|     def load_prev_report(self) -> None: |     def load_prev_report(self) -> None: | ||||||
|   | |||||||
| @@ -180,7 +180,7 @@ class Pferd: | |||||||
|                 log.report(f"  [bold bright_magenta]Deleted[/] {fmt_path(path)}") |                 log.report(f"  [bold bright_magenta]Deleted[/] {fmt_path(path)}") | ||||||
|             for path in sorted(crawler.report.not_deleted_files): |             for path in sorted(crawler.report.not_deleted_files): | ||||||
|                 something_changed = True |                 something_changed = True | ||||||
|                 log.report_not_deleted(f"  [bold bright_magenta]Not deleted[/] {fmt_path(path)}") |                 log.report(f"  [bold bright_magenta]Not deleted[/] {fmt_path(path)}") | ||||||
|  |  | ||||||
|             for warning in crawler.report.encountered_warnings: |             for warning in crawler.report.encountered_warnings: | ||||||
|                 something_changed = True |                 something_changed = True | ||||||
|   | |||||||
							
								
								
									
										53
									
								
								PFERD/update.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								PFERD/update.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,53 @@ | |||||||
|  | from dataclasses import dataclass | ||||||
|  | import ssl | ||||||
|  | from typing import Optional | ||||||
|  | import aiohttp | ||||||
|  | import certifi | ||||||
|  |  | ||||||
|  | from .version import NAME, VERSION | ||||||
|  | from .logging import log | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @dataclass | ||||||
|  | class PferdUpdate: | ||||||
|  |     release_url: str | ||||||
|  |     version: str | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _build_session() -> aiohttp.ClientSession: | ||||||
|  |     return aiohttp.ClientSession( | ||||||
|  |         headers={"User-Agent": f"{NAME}/{VERSION}"}, | ||||||
|  |         connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())), | ||||||
|  |         timeout=aiohttp.ClientTimeout( | ||||||
|  |             total=15 * 60, | ||||||
|  |             connect=10, | ||||||
|  |             sock_connect=10, | ||||||
|  |             sock_read=10, | ||||||
|  |         ) | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | async def check_for_updates() -> None: | ||||||
|  |     if new_version := await get_newer_version(): | ||||||
|  |         log.warn( | ||||||
|  |             f"{NAME} version out of date. " | ||||||
|  |             + f"You are running version {VERSION!r} but {new_version.version!r} was found on GitHub." | ||||||
|  |         ) | ||||||
|  |         log.warn_contd(f"You can download it on GitHub: {new_version.release_url}") | ||||||
|  |     else: | ||||||
|  |         log.explain("No update found") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | async def get_newer_version() -> Optional[PferdUpdate]: | ||||||
|  |     async with _build_session() as session: | ||||||
|  |         async with session.get( | ||||||
|  |             "https://api.github.com/repos/Garmelon/Pferd/releases/latest", | ||||||
|  |             headers={"Accept": "application/vnd.github+json"} | ||||||
|  |         ) as response: | ||||||
|  |             release_information = await response.json() | ||||||
|  |             tag_name: str = release_information["tag_name"] | ||||||
|  |             tag_name = tag_name.removeprefix("v") | ||||||
|  |             if VERSION == tag_name: | ||||||
|  |                 return None | ||||||
|  |  | ||||||
|  |             return PferdUpdate(release_url=release_information["html_url"], version=tag_name) | ||||||
| @@ -1,2 +1,2 @@ | |||||||
| NAME = "PFERD" | NAME = "PFERD" | ||||||
| VERSION = "3.5.2" | VERSION = "3.4.1" | ||||||
|   | |||||||
| @@ -30,10 +30,7 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. | |||||||
|  |  | ||||||
| Unofficial packages are available for: | Unofficial packages are available for: | ||||||
| - [AUR](https://aur.archlinux.org/packages/pferd) | - [AUR](https://aur.archlinux.org/packages/pferd) | ||||||
| - [brew](https://formulae.brew.sh/formula/pferd) |  | ||||||
| - [conda-forge](https://github.com/conda-forge/pferd-feedstock) |  | ||||||
| - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) | - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) | ||||||
| - [PyPi](https://pypi.org/project/pferd) |  | ||||||
|  |  | ||||||
| See also PFERD's [repology page](https://repology.org/project/pferd/versions). | See also PFERD's [repology page](https://repology.org/project/pferd/versions). | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										27
									
								
								flake.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										27
									
								
								flake.lock
									
									
									
										generated
									
									
									
								
							| @@ -1,27 +0,0 @@ | |||||||
| { |  | ||||||
|   "nodes": { |  | ||||||
|     "nixpkgs": { |  | ||||||
|       "locked": { |  | ||||||
|         "lastModified": 1708979614, |  | ||||||
|         "narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=", |  | ||||||
|         "owner": "NixOS", |  | ||||||
|         "repo": "nixpkgs", |  | ||||||
|         "rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a", |  | ||||||
|         "type": "github" |  | ||||||
|       }, |  | ||||||
|       "original": { |  | ||||||
|         "owner": "NixOS", |  | ||||||
|         "ref": "nixos-23.11", |  | ||||||
|         "repo": "nixpkgs", |  | ||||||
|         "type": "github" |  | ||||||
|       } |  | ||||||
|     }, |  | ||||||
|     "root": { |  | ||||||
|       "inputs": { |  | ||||||
|         "nixpkgs": "nixpkgs" |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   }, |  | ||||||
|   "root": "root", |  | ||||||
|   "version": 7 |  | ||||||
| } |  | ||||||
							
								
								
									
										41
									
								
								flake.nix
									
									
									
									
									
								
							
							
						
						
									
										41
									
								
								flake.nix
									
									
									
									
									
								
							| @@ -1,41 +0,0 @@ | |||||||
| { |  | ||||||
|   description = "Tool for downloading course-related files from ILIAS"; |  | ||||||
|  |  | ||||||
|   inputs = { |  | ||||||
|     nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11"; |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
|   outputs = { self, nixpkgs }: |  | ||||||
|     let |  | ||||||
|       # Helper function to generate an attrset '{ x86_64-linux = f "x86_64-linux"; ... }'. |  | ||||||
|       forAllSystems = nixpkgs.lib.genAttrs nixpkgs.lib.systems.flakeExposed; |  | ||||||
|     in |  | ||||||
|     { |  | ||||||
|       packages = forAllSystems (system: |  | ||||||
|         let pkgs = import nixpkgs { inherit system; }; |  | ||||||
|         in |  | ||||||
|         rec { |  | ||||||
|           default = pkgs.python3Packages.buildPythonApplication rec { |  | ||||||
|             pname = "pferd"; |  | ||||||
|             # Performing black magic |  | ||||||
|             # Don't worry, I sacrificed enough goats for the next few years |  | ||||||
|             version = (pkgs.lib.importTOML ./PFERD/version.py).VERSION; |  | ||||||
|             format = "pyproject"; |  | ||||||
|  |  | ||||||
|             src = ./.; |  | ||||||
|  |  | ||||||
|             nativeBuildInputs = with pkgs.python3Packages; [ |  | ||||||
|               setuptools |  | ||||||
|             ]; |  | ||||||
|  |  | ||||||
|             propagatedBuildInputs = with pkgs.python3Packages; [ |  | ||||||
|               aiohttp |  | ||||||
|               beautifulsoup4 |  | ||||||
|               rich |  | ||||||
|               keyring |  | ||||||
|               certifi |  | ||||||
|             ]; |  | ||||||
|           }; |  | ||||||
|         }); |  | ||||||
|     }; |  | ||||||
| } |  | ||||||
		Reference in New Issue
	
	Block a user