Compare commits

...

11 Commits

Author SHA1 Message Date
465f8b28c0 Bump version to 3.8.3 2025-07-01 14:28:30 +02:00
27e69af2f3 Update changelog for 8caad00 2025-07-01 14:26:10 +02:00
56e3065950 Document usage of pilot.ilias.studium.kit.edu (#111) 2025-05-30 17:13:45 +02:00
549ce6cce9 Ignore unavailable elements (#119) 2025-05-28 17:04:57 +02:00
34564cedb4 Add support for link collections 2025-05-27 16:25:59 +02:00
2b0d20a1f6 Fix crawling of exercises with instructions
We do not want a second path and the instruction field has an identical
link...
2025-05-26 14:42:38 +02:00
8caad0008d Fix check for nonexistent ilias_url command attribute to base_url (#113) 2025-05-05 22:05:54 +02:00
77a23265a9 Bump version to 3.8.2 2025-04-29 17:55:57 +02:00
4c230ef6dd Fix exercise crawling 2025-04-25 13:45:57 +02:00
b305e1ce23 Fix login using the native ilias login form 2025-04-23 16:08:45 +02:00
bdf17f5c87 Ignore wikis 2025-04-23 16:03:37 +02:00
7 changed files with 332 additions and 153 deletions

View File

@ -22,6 +22,30 @@ ambiguous situations.
## Unreleased ## Unreleased
## 3.8.3 - 2025-07-01
## Added
- Support for link collections.
In "fancy" mode, a single HTML file with multiple links is generated.
In all other modes, PFERD creates a folder for the collection and a new file
for every link inside.
## Fixed
- Crawling of exercises with instructions
- Don't download unavailable elements.
Elements that are unavailable (for example, because their availability is
time restricted) will not download the HTML for the info page anymore.
- `base_url` argument for `ilias-web` crawler causing crashes
## 3.8.2 - 2025-04-29
## Changed
- Explicitly mention that wikis are not supported at the moment and ignore them
## Fixed
- Ilias-native login
- Exercise crawling
## 3.8.1 - 2025-04-17 ## 3.8.1 - 2025-04-17
## Fixed ## Fixed

View File

@ -163,13 +163,14 @@ out of the box for the corresponding universities:
[ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs" [ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs"
| University | `base_url` | `login_type` | `client_id` | | University | `base_url` | `login_type` | `client_id` |
|---------------|-----------------------------------------|--------------|---------------| |-----------------|-----------------------------------------|--------------|---------------|
| FH Aachen | https://www.ili.fh-aachen.de | local | elearning | | FH Aachen | https://www.ili.fh-aachen.de | local | elearning |
| Uni Köln | https://www.ilias.uni-koeln.de/ilias | local | uk | | Uni Köln | https://www.ilias.uni-koeln.de/ilias | local | uk |
| Uni Konstanz | https://ilias.uni-konstanz.de | local | ILIASKONSTANZ | | Uni Konstanz | https://ilias.uni-konstanz.de | local | ILIASKONSTANZ |
| Uni Stuttgart | https://ilias3.uni-stuttgart.de | local | Uni_Stuttgart | | Uni Stuttgart | https://ilias3.uni-stuttgart.de | local | Uni_Stuttgart |
| Uni Tübingen | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth | | | Uni Tübingen | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth | |
| KIT ILIAS Pilot | https://pilot.ilias.studium.kit.edu | shibboleth | pilot |
If your university isn't listed, try navigating to your instance's login page. If your university isn't listed, try navigating to your instance's login page.
Assuming no custom login service is used, the URL will look something like this: Assuming no custom login service is used, the URL will look something like this:

View File

@ -45,8 +45,8 @@ def load(
load_crawler(args, section) load_crawler(args, section)
section["type"] = COMMAND_NAME section["type"] = COMMAND_NAME
if args.ilias_url is not None: if args.base_url is not None:
section["base_url"] = args.ilias_url section["base_url"] = args.base_url
if args.client_id is not None: if args.client_id is not None:
section["client_id"] = args.client_id section["client_id"] = args.client_id

View File

@ -1,3 +1,5 @@
import dataclasses
import re
from enum import Enum from enum import Enum
from typing import Optional, cast from typing import Optional, cast
@ -12,7 +14,9 @@ _link_template_fancy = """
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<title>ILIAS - Link: {{name}}</title> <title>ILIAS - Link: {{name}}</title>
<!-- REPEAT REMOVE START -->
<meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" /> <meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" />
<!-- REPEAT REMOVE END -->
</head> </head>
<style> <style>
@ -23,6 +27,8 @@ _link_template_fancy = """
display: flex; display: flex;
align-items: center; align-items: center;
justify-content: center; justify-content: center;
flex-direction: column;
gap: 4px;
} }
body { body {
padding: 0; padding: 0;
@ -31,11 +37,16 @@ _link_template_fancy = """
font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif; font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif;
height: 100vh; height: 100vh;
} }
.row { .column {
background-color: white;
min-width: 500px; min-width: 500px;
max-width: 90vw; max-width: 90vw;
display: flex; display: flex;
flex-direction: column;
row-gap: 5px;
}
.row {
background-color: white;
display: flex;
padding: 1em; padding: 1em;
} }
.logo { .logo {
@ -75,19 +86,23 @@ _link_template_fancy = """
} }
</style> </style>
<body class="center-flex"> <body class="center-flex">
<div class="row"> <div class="column">
<div class="logo center-flex"> <!-- REPEAT START -->
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24"> <div class="row">
<path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 
.225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/> <div class="logo center-flex">
</svg> <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
</div> <path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 
.225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/>
<div class="tile"> </svg>
<div class="top-row">
<a href="{{link}}">{{name}}</a>
</div> </div>
<div class="bottom-row">{{description}}</div> <div class="tile">
<div class="top-row">
<a href="{{link}}">{{name}}</a>
</div>
<div class="bottom-row">{{description}}</div>
</div>
<div class="menu-button center-flex"> ⯆ </div>
</div> </div>
<div class="menu-button center-flex"> ⯆ </div> <!-- REPEAT END -->
</div> </div>
</body> </body>
</html> </html>
@ -255,6 +270,13 @@ def forum_thread_template(name: str, url: str, heading: bs4.Tag, content: bs4.Ta
.replace("{{content}}", cast(str, content.prettify())) .replace("{{content}}", cast(str, content.prettify()))
@dataclasses.dataclass
class LinkData:
    """A single link that can be rendered into a link file."""

    # Text shown for the link; substituted for the "{{name}}" placeholder.
    name: str
    # Target of the link; substituted for the "{{link}}" placeholder.
    url: str
    # Free text shown with the link; substituted for "{{description}}".
    description: str
class Links(Enum): class Links(Enum):
IGNORE = "ignore" IGNORE = "ignore"
PLAINTEXT = "plaintext" PLAINTEXT = "plaintext"
@ -272,6 +294,11 @@ class Links(Enum):
return None return None
raise ValueError("Missing switch case") raise ValueError("Missing switch case")
def collection_as_one(self) -> bool:
    """Return True if a collection of links is rendered as one single file.

    Only the fancy mode answers True; every other mode answers False.
    """
    return self == Links.FANCY
def extension(self) -> Optional[str]: def extension(self) -> Optional[str]:
if self == Links.FANCY: if self == Links.FANCY:
return ".html" return ".html"
@ -283,10 +310,48 @@ class Links(Enum):
return None return None
raise ValueError("Missing switch case") raise ValueError("Missing switch case")
def interpolate(self, redirect_delay: int, collection_name: str, links: list[LinkData]) -> str:
    """Render ``links`` into the text of a link file for this link mode.

    A single link is substituted into this mode's own template. Several
    links are joined as one URL per line for the plaintext and internet
    shortcut modes; every other mode falls back to the fancy template, whose
    section between the REPEAT markers is emitted once per link.

    :param redirect_delay: value substituted for ``{{redirect_delay}}``
    :param collection_name: value substituted for the ``{{name}}`` placeholders
        that remain outside the repeated section when rendering several links
    :param links: the links to render
    :return: the rendered file content
    :raises ValueError: if this mode has no template, or if the fancy
        template does not contain the REPEAT markers
    """
    template = self.template()
    if template is None:
        raise ValueError("Cannot interpolate ignored links")

    # NOTE(review): values are inserted verbatim, without HTML escaping, even
    # into the HTML template - confirm that callers only pass trusted text.
    def fill(text: str, link: LinkData) -> str:
        # Substitute the per-link placeholders in ``text``.
        text = text.replace("{{link}}", link.url)
        text = text.replace("{{name}}", link.name)
        text = text.replace("{{description}}", link.description)
        return text.replace("{{redirect_delay}}", str(redirect_delay))

    if len(links) == 1:
        return fill(template, links[0])

    if self == Links.PLAINTEXT or self == Links.INTERNET_SHORTCUT:
        return "\n".join(link.url for link in links)

    # All others get coerced to fancy
    content = cast(str, Links.FANCY.template())
    repeat_match = re.search(r"<!-- REPEAT START -->([\s\S]+)<!-- REPEAT END -->", content)
    if repeat_match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("Fancy link template is missing the REPEAT START/END markers")
    repeated_content = repeat_match.group(1)

    rendered_links = "\n".join(fill(repeated_content, link) for link in links)
    content = content.replace(repeated_content, rendered_links)
    # Placeholders outside the repeated section (e.g. the title) get the collection name.
    content = content.replace("{{name}}", collection_name)
    # Drop the part of the template that only makes sense for a single link.
    content = re.sub(r"<!-- REPEAT REMOVE START -->[\s\S]+<!-- REPEAT REMOVE END -->", "", content)
    return content
@staticmethod @staticmethod
def from_string(string: str) -> "Links": def from_string(string: str) -> "Links":
try: try:
return Links(string) return Links(string)
except ValueError: except ValueError:
raise ValueError("must be one of 'ignore', 'plaintext'," options = [f"'{option.value}'" for option in Links]
" 'html', 'internet-shortcut'") raise ValueError(f"must be one of {', '.join(options)}")

View File

@ -19,7 +19,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection from ..http_crawler import HttpCrawler, HttpCrawlerSection
from .async_helper import _iorepeat from .async_helper import _iorepeat
from .file_templates import Links, forum_thread_template, learning_module_template from .file_templates import LinkData, Links, forum_thread_template, learning_module_template
from .ilias_html_cleaner import clean, insert_base_markup from .ilias_html_cleaner import clean, insert_base_markup
from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export) IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
@ -107,6 +107,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
_DIRECTORY_PAGES: Set[IliasElementType] = { _DIRECTORY_PAGES: Set[IliasElementType] = {
IliasElementType.EXERCISE, IliasElementType.EXERCISE,
IliasElementType.EXERCISE_FILES, IliasElementType.EXERCISE_FILES,
IliasElementType.EXERCISE_OVERVIEW,
IliasElementType.FOLDER, IliasElementType.FOLDER,
IliasElementType.INFO_TAB, IliasElementType.INFO_TAB,
IliasElementType.MEDIACAST_VIDEO_FOLDER, IliasElementType.MEDIACAST_VIDEO_FOLDER,
@ -328,6 +329,15 @@ instance's greatest bottleneck.
# directory escape attacks. # directory escape attacks.
element_path = PurePath(parent_path, element.name) element_path = PurePath(parent_path, element.name)
# This is symptomatic of no access to the element, for example, because
# of time availability restrictions.
if "cmdClass=ilInfoScreenGUI" in element.url and "cmd=showSummary" in element.url:
log.explain(
"Skipping element as url points to info screen, "
"this should only happen with not-yet-released elements"
)
return None
if element.type in _VIDEO_ELEMENTS: if element.type in _VIDEO_ELEMENTS:
if not self._videos: if not self._videos:
log.status( log.status(
@ -424,10 +434,20 @@ instance's greatest bottleneck.
"[bright_black](not descending into linked course)" "[bright_black](not descending into linked course)"
) )
return None return None
elif element.type == IliasElementType.WIKI:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](wikis are not currently supported)"
)
return None
elif element.type == IliasElementType.LEARNING_MODULE: elif element.type == IliasElementType.LEARNING_MODULE:
return await self._handle_learning_module(element, element_path) return await self._handle_learning_module(element, element_path)
elif element.type == IliasElementType.LINK: elif element.type == IliasElementType.LINK:
return await self._handle_link(element, element_path) return await self._handle_link(element, element_path)
elif element.type == IliasElementType.LINK_COLLECTION:
return await self._handle_link(element, element_path)
elif element.type == IliasElementType.BOOKING: elif element.type == IliasElementType.BOOKING:
return await self._handle_booking(element, element_path) return await self._handle_booking(element, element_path)
elif element.type == IliasElementType.OPENCAST_VIDEO: elif element.type == IliasElementType.OPENCAST_VIDEO:
@ -453,44 +473,97 @@ instance's greatest bottleneck.
log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
log.explain(f"Links type is {self._links}") log.explain(f"Links type is {self._links}")
link_template_maybe = self._links.template() export_url = url_set_query_param(element.url, "cmd", "exportHTML")
link_extension = self._links.extension() resolved = await self._resolve_link_target(export_url)
if not link_template_maybe or not link_extension: if resolved == "none":
links = [LinkData(element.name, "", element.description or "")]
else:
links = self._parse_link_content(element, cast(BeautifulSoup, resolved))
maybe_extension = self._links.extension()
if not maybe_extension:
log.explain("Answer: No") log.explain("Answer: No")
return None return None
else: else:
log.explain("Answer: Yes") log.explain("Answer: Yes")
element_path = element_path.with_name(element_path.name + link_extension)
maybe_dl = await self.download(element_path, mtime=element.mtime) if len(links) <= 1 or self._links.collection_as_one():
if not maybe_dl: element_path = element_path.with_name(element_path.name + maybe_extension)
maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl:
return None
return self._download_link(self._links, element.name, links, maybe_dl)
maybe_cl = await self.crawl(element_path)
if not maybe_cl:
return None return None
# Required for download_all closure
cl = maybe_cl
extension = maybe_extension
return self._download_link(element, link_template_maybe, maybe_dl) async def download_all() -> None:
for link in links:
path = cl.path / (_sanitize_path_name(link.name) + extension)
if dl := await self.download(path, mtime=element.mtime):
await self._download_link(self._links, element.name, [link], dl)
return download_all()
@anoncritical @anoncritical
@_iorepeat(3, "resolving link") @_iorepeat(3, "resolving link")
async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: async def _download_link(
async with dl as (bar, sink):
export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
real_url = await self._resolve_link_target(export_url)
self._write_link_content(link_template, real_url, element.name, element.description, sink)
def _write_link_content(
self, self,
link_template: str, link_renderer: Links,
url: str, collection_name: str,
name: str, links: list[LinkData],
description: Optional[str], dl: DownloadToken
sink: FileSink,
) -> None: ) -> None:
content = link_template async with dl as (bar, sink):
content = content.replace("{{link}}", url) rendered = link_renderer.interpolate(self._link_file_redirect_delay, collection_name, links)
content = content.replace("{{name}}", name) sink.file.write(rendered.encode("utf-8"))
content = content.replace("{{description}}", str(description)) sink.done()
content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
sink.file.write(content.encode("utf-8")) async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal['none']]:
sink.done() async def impl() -> Optional[Union[BeautifulSoup, Literal['none']]]:
async with self.session.get(export_url, allow_redirects=False) as resp:
# No redirect means we were authenticated
if hdrs.LOCATION not in resp.headers:
return soupify(await resp.read()) # .select_one("a").get("href").strip() # type: ignore
# We are either unauthenticated or the link is not active
new_url = resp.headers[hdrs.LOCATION].lower()
if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
return "none"
return None
auth_id = await self._current_auth_id()
target = await impl()
if target is not None:
return target
await self.authenticate(auth_id)
target = await impl()
if target is not None:
return target
raise CrawlError("resolve_link_target failed even after authenticating")
@staticmethod
def _parse_link_content(element: IliasPageElement, content: BeautifulSoup) -> list[LinkData]:
    """Extract the links contained in an exported link page.

    If the page contains exactly one ``<a>``, the name and description of
    ``element`` are used for it. Otherwise every ``<a>`` becomes one entry
    named after its text, described by the text of its next ``<dd>`` sibling.

    :param element: the ILIAS element the page belongs to
    :param content: the parsed page
    :return: one ``LinkData`` per ``<a>`` found, possibly none
    """
    links = cast(list[Tag], list(content.select("a")))
    if len(links) == 1:
        url = str(links[0].get("href")).strip()
        return [LinkData(name=element.name, description=element.description or "", url=url)]

    results = []
    for link in links:
        url = str(link.get("href")).strip()
        name = link.get_text(strip=True)
        # A link without a following <dd> simply has no description; do not
        # crash the whole element because of it.
        description_tag = cast(Optional[Tag], link.find_next_sibling("dd"))
        description = description_tag.get_text(strip=True) if description_tag is not None else ""
        results.append(LinkData(name=name, description=description, url=url))

    return results
async def _handle_booking( async def _handle_booking(
self, self,
@ -515,7 +588,7 @@ instance's greatest bottleneck.
self._ensure_not_seen(element, element_path) self._ensure_not_seen(element, element_path)
return self._download_booking(element, link_template_maybe, maybe_dl) return self._download_booking(element, maybe_dl)
@anoncritical @anoncritical
@_iorepeat(1, "downloading description") @_iorepeat(1, "downloading description")
@ -536,36 +609,13 @@ instance's greatest bottleneck.
async def _download_booking( async def _download_booking(
self, self,
element: IliasPageElement, element: IliasPageElement,
link_template: str,
dl: DownloadToken, dl: DownloadToken,
) -> None: ) -> None:
async with dl as (bar, sink): async with dl as (bar, sink):
self._write_link_content(link_template, element.url, element.name, element.description, sink) links = [LinkData(name=element.name, description=element.description or "", url=element.url)]
rendered = self._links.interpolate(self._link_file_redirect_delay, element.name, links)
async def _resolve_link_target(self, export_url: str) -> str: sink.file.write(rendered.encode("utf-8"))
async def impl() -> Optional[str]: sink.done()
async with self.session.get(export_url, allow_redirects=False) as resp:
# No redirect means we were authenticated
if hdrs.LOCATION not in resp.headers:
return soupify(await resp.read()).select_one("a").get("href").strip() # type: ignore
# We are either unauthenticated or the link is not active
new_url = resp.headers[hdrs.LOCATION].lower()
if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
return ""
return None
auth_id = await self._current_auth_id()
target = await impl()
if target is not None:
return target
await self.authenticate(auth_id)
target = await impl()
if target is not None:
return target
raise CrawlError("resolve_link_target failed even after authenticating")
async def _handle_opencast_video( async def _handle_opencast_video(
self, self,
@ -1031,7 +1081,7 @@ instance's greatest bottleneck.
async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request: async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request:
login_page = soupify(await request.read()) login_page = soupify(await request.read())
login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "formlogin"})) login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "login_form"}))
if login_form is None: if login_form is None:
raise CrawlError("Could not find the login form! Specified client id might be invalid.") raise CrawlError("Could not find the login form! Specified client id might be invalid.")
@ -1041,14 +1091,12 @@ instance's greatest bottleneck.
username, password = await self._auth.credentials() username, password = await self._auth.credentials()
login_data = { login_form_data = aiohttp.FormData()
"username": username, login_form_data.add_field('login_form/input_3/input_4', username)
"password": password, login_form_data.add_field('login_form/input_3/input_5', password)
"cmd[doStandardAuthentication]": "Login",
}
# do the actual login # do the actual login
async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request: async with self.session.post(urljoin(self._base_url, login_url), data=login_form_data) as request:
soup = IliasSoup(soupify(await request.read()), str(request.url)) soup = IliasSoup(soupify(await request.read()), str(request.url))
if not IliasPage.is_logged_in(soup): if not IliasPage.is_logged_in(soup):
self._auth.invalidate_credentials() self._auth.invalidate_credentials()

View File

@ -97,7 +97,8 @@ class IliasElementType(Enum):
BOOKING = "booking" BOOKING = "booking"
COURSE = "course" COURSE = "course"
DCL_RECORD_LIST = "dcl_record_list" DCL_RECORD_LIST = "dcl_record_list"
EXERCISE = "exercise" EXERCISE_OVERVIEW = "exercise_overview"
EXERCISE = "exercise" # own submitted files
EXERCISE_FILES = "exercise_files" # own submitted files EXERCISE_FILES = "exercise_files" # own submitted files
FILE = "file" FILE = "file"
FOLDER = "folder" FOLDER = "folder"
@ -108,6 +109,7 @@ class IliasElementType(Enum):
LEARNING_MODULE_HTML = "learning_module_html" LEARNING_MODULE_HTML = "learning_module_html"
LITERATURE_LIST = "literature_list" LITERATURE_LIST = "literature_list"
LINK = "link" LINK = "link"
LINK_COLLECTION = "link_collection"
MEDIA_POOL = "media_pool" MEDIA_POOL = "media_pool"
MEDIACAST_VIDEO = "mediacast_video" MEDIACAST_VIDEO = "mediacast_video"
MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
@ -120,6 +122,7 @@ class IliasElementType(Enum):
SCORM_LEARNING_MODULE = "scorm_learning_module" SCORM_LEARNING_MODULE = "scorm_learning_module"
SURVEY = "survey" SURVEY = "survey"
TEST = "test" # an online test. Will be ignored currently. TEST = "test" # an online test. Will be ignored currently.
WIKI = "wiki"
def matcher(self) -> IliasElementMatcher: def matcher(self) -> IliasElementMatcher:
match self: match self:
@ -140,13 +143,15 @@ class IliasElementType(Enum):
TypeMatcher.query("cmdclass=ildclrecordlistgui") TypeMatcher.query("cmdclass=ildclrecordlistgui")
) )
case IliasElementType.EXERCISE: case IliasElementType.EXERCISE:
return TypeMatcher.never()
case IliasElementType.EXERCISE_FILES:
return TypeMatcher.never()
case IliasElementType.EXERCISE_OVERVIEW:
return TypeMatcher.any( return TypeMatcher.any(
TypeMatcher.path("/exc/"), TypeMatcher.path("/exc/"),
TypeMatcher.path("_exc_"), TypeMatcher.path("_exc_"),
TypeMatcher.img_src("_exc.svg"), TypeMatcher.img_src("_exc.svg"),
) )
case IliasElementType.EXERCISE_FILES:
return TypeMatcher.never()
case IliasElementType.FILE: case IliasElementType.FILE:
return TypeMatcher.any( return TypeMatcher.any(
TypeMatcher.query("cmd=sendfile"), TypeMatcher.query("cmd=sendfile"),
@ -198,7 +203,12 @@ class IliasElementType(Enum):
TypeMatcher.query("baseclass=illinkresourcehandlergui"), TypeMatcher.query("baseclass=illinkresourcehandlergui"),
TypeMatcher.query("calldirectlink"), TypeMatcher.query("calldirectlink"),
), ),
TypeMatcher.img_src("_webr.svg") TypeMatcher.img_src("_webr.svg") # duplicated :(
)
case IliasElementType.LINK_COLLECTION:
return TypeMatcher.any(
TypeMatcher.query("baseclass=illinkresourcehandlergui"),
TypeMatcher.img_src("_webr.svg") # duplicated :(
) )
case IliasElementType.MEDIA_POOL: case IliasElementType.MEDIA_POOL:
return TypeMatcher.any( return TypeMatcher.any(
@ -243,6 +253,11 @@ class IliasElementType(Enum):
TypeMatcher.query("cmdclass=iltestscreengui"), TypeMatcher.query("cmdclass=iltestscreengui"),
TypeMatcher.img_src("_tst.svg") TypeMatcher.img_src("_tst.svg")
) )
case IliasElementType.WIKI:
return TypeMatcher.any(
TypeMatcher.query("baseClass=ilwikihandlergui"),
TypeMatcher.img_src("wiki.svg")
)
raise CrawlWarning(f"Unknown matcher {self}") raise CrawlWarning(f"Unknown matcher {self}")
@ -524,6 +539,8 @@ class IliasPage:
if self._contains_collapsed_future_meetings(): if self._contains_collapsed_future_meetings():
log.explain("Requesting *all* future meetings") log.explain("Requesting *all* future meetings")
return self._uncollapse_future_meetings_url() return self._uncollapse_future_meetings_url()
if self._is_exercise_not_all_shown():
return self._show_all_exercises()
if not self._is_content_tab_selected(): if not self._is_content_tab_selected():
if self._page_type != IliasElementType.INFO_TAB: if self._page_type != IliasElementType.INFO_TAB:
log.explain("Selecting content tab") log.explain("Selecting content tab")
@ -555,7 +572,7 @@ class IliasPage:
def _is_exercise_file(self) -> bool: def _is_exercise_file(self) -> bool:
# we know it from before # we know it from before
if self._page_type == IliasElementType.EXERCISE: if self._page_type == IliasElementType.EXERCISE_OVERVIEW:
return True return True
# We have no suitable parent - let's guesss # We have no suitable parent - let's guesss
@ -592,6 +609,17 @@ class IliasPage:
link = self._abs_url_from_link(element) link = self._abs_url_from_link(element)
return IliasPageElement.create_new(IliasElementType.FOLDER, link, "show all meetings") return IliasPageElement.create_new(IliasElementType.FOLDER, link, "show all meetings")
def _is_exercise_not_all_shown(self) -> bool:
return (self._page_type == IliasElementType.EXERCISE_OVERVIEW
and "mode=all" not in self._page_url.lower())
def _show_all_exercises(self) -> Optional[IliasPageElement]:
return IliasPageElement.create_new(
IliasElementType.EXERCISE_OVERVIEW,
self._page_url + "&mode=all",
"show all exercises"
)
def _is_content_tab_selected(self) -> bool: def _is_content_tab_selected(self) -> bool:
return self._select_content_page_url() is None return self._select_content_page_url() is None
@ -857,15 +885,62 @@ class IliasPage:
def _find_exercise_entries(self) -> list[IliasPageElement]: def _find_exercise_entries(self) -> list[IliasPageElement]:
if self._soup.find(id="tab_submission"): if self._soup.find(id="tab_submission"):
log.explain("Found submission tab. This is an exercise detail page") log.explain("Found submission tab. This is an exercise detail or files page")
return self._find_exercise_entries_detail_page() if self._soup.select_one("#tab_submission.active") is None:
log.explain(" This is a details page")
return self._find_exercise_entries_detail_page()
else:
log.explain(" This is a files page")
return self._find_exercise_entries_files_page()
log.explain("Found no submission tab. This is an exercise root page") log.explain("Found no submission tab. This is an exercise root page")
return self._find_exercise_entries_root_page() return self._find_exercise_entries_root_page()
def _find_exercise_entries_detail_page(self) -> list[IliasPageElement]: def _find_exercise_entries_detail_page(self) -> list[IliasPageElement]:
results: list[IliasPageElement] = [] results: list[IliasPageElement] = []
# Find all download links in the container (this will contain all the files) if link := cast(Optional[Tag], self._soup.select_one("#tab_submission > a")):
results.append(IliasPageElement.create_new(
IliasElementType.EXERCISE_FILES,
self._abs_url_from_link(link),
"Submission"
))
else:
log.explain("Found no submission link for exercise, maybe it has not started yet?")
# Find all download links in the container (this will contain all the *feedback* files)
download_links = cast(list[Tag], self._soup.find_all(
name="a",
# download links contain the given command class
attrs={"href": lambda x: x is not None and "cmd=download" in x},
text="Download"
))
for link in download_links:
parent_row: Tag = cast(Tag, link.find_parent(
attrs={"class": lambda x: x is not None and "row" in x}))
name_tag = cast(Optional[Tag], parent_row.find(name="div"))
if not name_tag:
log.warn("Could not find name tag for exercise entry")
_unexpected_html_warning()
continue
name = _sanitize_path_name(name_tag.get_text().strip())
log.explain(f"Found exercise detail entry {name!r}")
results.append(IliasPageElement.create_new(
IliasElementType.FILE,
self._abs_url_from_link(link),
name
))
return results
def _find_exercise_entries_files_page(self) -> list[IliasPageElement]:
results: list[IliasPageElement] = []
# Find all download links in the container
download_links = cast(list[Tag], self._soup.find_all( download_links = cast(list[Tag], self._soup.find_all(
name="a", name="a",
# download links contain the given command class # download links contain the given command class
@ -878,7 +953,7 @@ class IliasPage:
children = cast(list[Tag], parent_row.find_all("td")) children = cast(list[Tag], parent_row.find_all("td"))
name = _sanitize_path_name(children[1].get_text().strip()) name = _sanitize_path_name(children[1].get_text().strip())
log.explain(f"Found exercise detail entry {name!r}") log.explain(f"Found exercise file entry {name!r}")
date = None date = None
for child in reversed(children): for child in reversed(children):
@ -886,7 +961,7 @@ class IliasPage:
if date is not None: if date is not None:
break break
if date is None: if date is None:
log.warn(f"Date parsing failed for exercise entry {name!r}") log.warn(f"Date parsing failed for exercise file entry {name!r}")
results.append(IliasPageElement.create_new( results.append(IliasPageElement.create_new(
IliasElementType.FILE, IliasElementType.FILE,
@ -900,66 +975,32 @@ class IliasPage:
def _find_exercise_entries_root_page(self) -> list[IliasPageElement]: def _find_exercise_entries_root_page(self) -> list[IliasPageElement]:
results: list[IliasPageElement] = [] results: list[IliasPageElement] = []
# Each assignment is in an accordion container content_tab = cast(Optional[Tag], self._soup.find(id="ilContentContainer"))
assignment_containers: list[Tag] = self._soup.select(".il_VAccordionInnerContainer") if not content_tab:
log.warn("Could not find content tab in exercise overview page")
_unexpected_html_warning()
return []
for container in assignment_containers: exercise_links = content_tab.select(".il-item-title a")
# Fetch the container name out of the header to use it in the path
container_name = cast(Tag, container.select_one(".ilAssignmentHeader")).get_text().strip()
log.explain(f"Found exercise container {container_name!r}")
# Find all download links in the container (this will contain all the files) for exercise in cast(list[Tag], exercise_links):
files = cast(list[Tag], container.find_all( if "href" not in exercise.attrs:
name="a", continue
# download links contain the given command class href = exercise.attrs["href"]
attrs={"href": lambda x: x is not None and "cmdClass=ilexsubmissiongui" in x}, if type(href) is not str:
text="Download" continue
if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower():
continue
name = _sanitize_path_name(exercise.get_text().strip())
results.append(IliasPageElement.create_new(
IliasElementType.EXERCISE,
self._abs_url_from_link(exercise),
name
)) ))
# Grab each file as you now have the link for result in results:
for file_link in files: log.explain(f"Found exercise {result.name!r}")
# Two divs, side by side. Left is the name, right is the link ==> get left
# sibling
file_name = cast(
Tag,
cast(Tag, file_link.parent).find_previous(name="div")
).get_text().strip()
url = self._abs_url_from_link(file_link)
log.explain(f"Found exercise entry {file_name!r}")
results.append(IliasPageElement.create_new(
IliasElementType.FILE,
url,
_sanitize_path_name(container_name) + "/" + _sanitize_path_name(file_name),
mtime=None, # We do not have any timestamp
skip_sanitize=True
))
# Find all links to file listings (e.g. "Submitted Files" for groups)
file_listings = cast(list[Tag], container.find_all(
name="a",
# download links contain the given command class
attrs={"href": lambda x: x is not None and "cmdclass=ilexsubmissionfilegui" in x.lower()}
))
# Add each listing as a new
for listing in file_listings:
parent_container = cast(Tag, listing.find_parent(
"div", attrs={"class": lambda x: x is not None and "form-group" in x}
))
label_container = cast(Tag, parent_container.find(
attrs={"class": lambda x: x is not None and "control-label" in x}
))
file_name = label_container.get_text().strip()
url = self._abs_url_from_link(listing)
log.explain(f"Found exercise detail {file_name!r} at {url}")
results.append(IliasPageElement.create_new(
IliasElementType.EXERCISE_FILES,
url,
_sanitize_path_name(container_name) + "/" + _sanitize_path_name(file_name),
None, # we do not have any timestamp
skip_sanitize=True
))
return results return results

View File

@ -1,2 +1,2 @@
NAME = "PFERD" NAME = "PFERD"
VERSION = "3.8.1" VERSION = "3.8.3"