Compare commits

16 commits · 8 changed files with 408 additions and 309 deletions

==================== changed file ====================

@@ -22,14 +22,46 @@ ambiguous situations.
 ## Unreleased
 
-## 3.8.0 - 2025-04-15
+## 3.8.3 - 2025-07-01
+
+### Added
+- Support for link collections.
+  In "fancy" mode, a single HTML file with multiple links is generated.
+  In all other modes, PFERD creates a folder for the collection and a new file
+  for every link inside.
+
+### Fixed
+- Crawling of exercises with instructions
+- Don't download unavailable elements.
+  Elements that are unavailable (for example, because their availability is
+  time restricted) no longer download the HTML for the info page.
+- `base_url` argument for `ilias-web` crawler causing crashes
+
+## 3.8.2 - 2025-04-29
+
+### Changed
+- Explicitly mention that wikis are not supported at the moment and ignore them
+
+### Fixed
+- Ilias-native login
+- Exercise crawling
+
+## 3.8.1 - 2025-04-17
+
+### Fixed
+- Description HTML files now specify a UTF-8 encoding
+- Images in descriptions now always have a white background
+
+## 3.8.0 - 2025-04-16
 
 ### Added
 - Support for ILIAS 9
 
 ### Changed
 - Added prettier CSS to forum threads
+- Downloaded forum threads now link to the forum instead of the ILIAS thread
 - Increase minimum supported Python version to 3.11
+- Do not crawl nested courses (courses linked in other courses)
 
 ### Fixed
 - File links in report on Windows
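For illustration (file and collection names invented, not from the diff), the link-collection entry above shapes the output roughly like this, depending on the configured link format:

```
<output dir>/
├── My Links.html    # "fancy": one HTML file containing every link
└── My Links/        # any other format (e.g. internet-shortcut): one file per link
    ├── Lecture site.url
    └── Slides.url
```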

==================== changed file ====================

@@ -163,13 +163,14 @@ out of the box for the corresponding universities:
 [ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs"
 
 | University      | `base_url`                              | `login_type` | `client_id`   |
-|---------------|-----------------------------------------|--------------|---------------|
+|-----------------|-----------------------------------------|--------------|---------------|
 | FH Aachen       | https://www.ili.fh-aachen.de            | local        | elearning     |
 | Uni Köln        | https://www.ilias.uni-koeln.de/ilias    | local        | uk            |
 | Uni Konstanz    | https://ilias.uni-konstanz.de           | local        | ILIASKONSTANZ |
 | Uni Stuttgart   | https://ilias3.uni-stuttgart.de         | local        | Uni_Stuttgart |
 | Uni Tübingen    | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth   |               |
+| KIT ILIAS Pilot | https://pilot.ilias.studium.kit.edu     | shibboleth   | pilot         |
 
 If your university isn't listed, try navigating to your instance's login page.
 Assuming no custom login service is used, the URL will look something like this:
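The hunk ends here; purely for illustration (this URL is not part of the diff), a stock ILIAS login page URL tends to have this shape, with the `client_id` query value being what the config wants:

```
https://ilias.example-university.de/login.php?client_id=example&cmd=force_login&lang=en
```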

==================== changed file ====================

@@ -45,8 +45,8 @@ def load(
     load_crawler(args, section)
 
     section["type"] = COMMAND_NAME
-    if args.ilias_url is not None:
-        section["base_url"] = args.ilias_url
+    if args.base_url is not None:
+        section["base_url"] = args.base_url
     if args.client_id is not None:
         section["client_id"] = args.client_id

==================== changed file ====================

@@ -1,3 +1,5 @@
+import dataclasses
+import re
 from enum import Enum
 from typing import Optional, cast
 
@@ -12,7 +14,9 @@ _link_template_fancy = """
 <head>
     <meta charset="UTF-8">
     <title>ILIAS - Link: {{name}}</title>
+    <!-- REPEAT REMOVE START -->
     <meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" />
+    <!-- REPEAT REMOVE END -->
 </head>
 
 <style>
@@ -23,6 +27,8 @@ _link_template_fancy = """
         display: flex;
         align-items: center;
         justify-content: center;
+        flex-direction: column;
+        gap: 4px;
     }
     body {
         padding: 0;
@@ -31,11 +37,16 @@ _link_template_fancy = """
         font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif;
         height: 100vh;
     }
-    .row {
-        background-color: white;
+    .column {
         min-width: 500px;
         max-width: 90vw;
         display: flex;
+        flex-direction: column;
+        row-gap: 5px;
+    }
+    .row {
+        background-color: white;
+        display: flex;
         padding: 1em;
     }
     .logo {
@@ -75,19 +86,23 @@ _link_template_fancy = """
     }
 </style>
 <body class="center-flex">
-    <div class="row">
-        <div class="logo center-flex">
-            <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
-                <path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 .225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/>
-            </svg>
-        </div>
-        <div class="tile">
-            <div class="top-row">
-                <a href="{{link}}">{{name}}</a>
-            </div>
-            <div class="bottom-row">{{description}}</div>
-        </div>
-        <div class="menu-button center-flex"> ⯆ </div>
-    </div>
+    <div class="column">
+        <!-- REPEAT START -->
+        <div class="row">
+            <div class="logo center-flex">
+                <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
+                    <path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 .225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/>
+                </svg>
+            </div>
+            <div class="tile">
+                <div class="top-row">
+                    <a href="{{link}}">{{name}}</a>
+                </div>
+                <div class="bottom-row">{{description}}</div>
+            </div>
+            <div class="menu-button center-flex"> ⯆ </div>
+        </div>
+        <!-- REPEAT END -->
+    </div>
 </body>
 </html>
@@ -255,6 +270,13 @@ def forum_thread_template(name: str, url: str, heading: bs4.Tag, content: bs4.Tag
         .replace("{{content}}", cast(str, content.prettify()))
 
+@dataclasses.dataclass
+class LinkData:
+    name: str
+    url: str
+    description: str
+
+
 class Links(Enum):
     IGNORE = "ignore"
     PLAINTEXT = "plaintext"
@@ -272,6 +294,11 @@ class Links(Enum):
             return None
         raise ValueError("Missing switch case")
 
+    def collection_as_one(self) -> bool:
+        if self == Links.FANCY:
+            return True
+        return False
+
     def extension(self) -> Optional[str]:
         if self == Links.FANCY:
             return ".html"
@@ -283,10 +310,48 @@ class Links(Enum):
             return None
         raise ValueError("Missing switch case")
 
+    def interpolate(self, redirect_delay: int, collection_name: str, links: list[LinkData]) -> str:
+        template = self.template()
+        if template is None:
+            raise ValueError("Cannot interpolate ignored links")
+        if len(links) == 1:
+            link = links[0]
+            content = template
+            content = content.replace("{{link}}", link.url)
+            content = content.replace("{{name}}", link.name)
+            content = content.replace("{{description}}", link.description)
+            content = content.replace("{{redirect_delay}}", str(redirect_delay))
+            return content
+        if self == Links.PLAINTEXT or self == Links.INTERNET_SHORTCUT:
+            return "\n".join(f"{link.url}" for link in links)
+
+        # All others get coerced to fancy
+        content = cast(str, Links.FANCY.template())
+        repeated_content = cast(
+            re.Match[str],
+            re.search(r"<!-- REPEAT START -->([\s\S]+)<!-- REPEAT END -->", content)
+        ).group(1)
+
+        parts = []
+        for link in links:
+            instance = repeated_content
+            instance = instance.replace("{{link}}", link.url)
+            instance = instance.replace("{{name}}", link.name)
+            instance = instance.replace("{{description}}", link.description)
+            instance = instance.replace("{{redirect_delay}}", str(redirect_delay))
+            parts.append(instance)
+
+        content = content.replace(repeated_content, "\n".join(parts))
+        content = content.replace("{{name}}", collection_name)
+        content = re.sub(r"<!-- REPEAT REMOVE START -->[\s\S]+<!-- REPEAT REMOVE END -->", "", content)
+        return content
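A quick usage sketch of the new `interpolate` for a multi-link collection (the `LinkData` values are invented for illustration):

```python
links = [
    LinkData(name="Lecture site", url="https://example.com/lecture", description="Course homepage"),
    LinkData(name="Slides", url="https://example.com/slides", description="Weekly slides"),
]

# FANCY keeps the collection in one HTML file: the markup between the
# REPEAT markers is stamped out once per link, and the single-target
# meta refresh between the REPEAT REMOVE markers is stripped.
html = Links.FANCY.interpolate(redirect_delay=0, collection_name="My Links", links=links)

# PLAINTEXT and INTERNET_SHORTCUT degrade to one URL per line.
text = Links.PLAINTEXT.interpolate(redirect_delay=0, collection_name="My Links", links=links)
assert text == "https://example.com/lecture\nhttps://example.com/slides"
```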
+
     @staticmethod
     def from_string(string: str) -> "Links":
         try:
             return Links(string)
         except ValueError:
-            raise ValueError("must be one of 'ignore', 'plaintext',"
-                             " 'html', 'internet-shortcut'")
+            options = [f"'{option.value}'" for option in Links]
+            raise ValueError(f"must be one of {', '.join(options)}")

==================== changed file ====================

@@ -39,6 +39,10 @@ _STYLE_TAG_CONTENT = """
         margin: 0.5rem 0;
     }
 
+    img {
+        background-color: white;
+    }
+
     body {
         padding: 1em;
         grid-template-columns: 1fr min(60rem, 90%) 1fr;
@@ -56,12 +60,11 @@ _ARTICLE_WORTHY_CLASSES = [
 def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
     head = soup.new_tag("head")
     soup.insert(0, head)
+    # Force UTF-8 encoding
+    head.append(soup.new_tag("meta", charset="utf-8"))
 
-    simplecss_link: Tag = soup.new_tag("link")
     # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
-    simplecss_link["rel"] = "stylesheet"
-    simplecss_link["href"] = "https://cdn.simplecss.org/simple.css"
-    head.append(simplecss_link)
+    head.append(soup.new_tag("link", rel="stylesheet", href="https://cdn.simplecss.org/simple.css"))
 
     # Basic style tags for compat
     style: Tag = soup.new_tag("style")

==================== changed file ====================

@@ -19,7 +19,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .async_helper import _iorepeat
-from .file_templates import Links, forum_thread_template, learning_module_template
+from .file_templates import LinkData, Links, forum_thread_template, learning_module_template
 from .ilias_html_cleaner import clean, insert_base_markup
 from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
                              IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
@@ -107,6 +107,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
 _DIRECTORY_PAGES: Set[IliasElementType] = {
     IliasElementType.EXERCISE,
     IliasElementType.EXERCISE_FILES,
+    IliasElementType.EXERCISE_OVERVIEW,
     IliasElementType.FOLDER,
     IliasElementType.INFO_TAB,
     IliasElementType.MEDIACAST_VIDEO_FOLDER,
@@ -216,11 +217,19 @@ instance's greatest bottleneck.
     async def _crawl_desktop(self) -> None:
         await self._crawl_url(
-            urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show")
+            urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show"),
+            crawl_nested_courses=True
         )
 
-    async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
-        if awaitable := await self._handle_ilias_page(url, None, PurePath("."), expected_id):
+    async def _crawl_url(
+        self,
+        url: str,
+        expected_id: Optional[int] = None,
+        crawl_nested_courses: bool = False
+    ) -> None:
+        if awaitable := await self._handle_ilias_page(
+            url, None, PurePath("."), expected_id, crawl_nested_courses
+        ):
             await awaitable
 
     async def _handle_ilias_page(
@@ -229,6 +238,7 @@ instance's greatest bottleneck.
         current_element: Optional[IliasPageElement],
         path: PurePath,
         expected_course_id: Optional[int] = None,
+        crawl_nested_courses: bool = False
     ) -> Optional[Coroutine[Any, Any, None]]:
         maybe_cl = await self.crawl(path)
         if not maybe_cl:
@@ -236,7 +246,9 @@ instance's greatest bottleneck.
         if current_element:
             self._ensure_not_seen(current_element, path)
 
-        return self._crawl_ilias_page(url, current_element, maybe_cl, expected_course_id)
+        return self._crawl_ilias_page(
+            url, current_element, maybe_cl, expected_course_id, crawl_nested_courses
+        )
 
     @anoncritical
     async def _crawl_ilias_page(
@@ -245,6 +257,7 @@ instance's greatest bottleneck.
         current_element: Optional[IliasPageElement],
         cl: CrawlToken,
         expected_course_id: Optional[int] = None,
+        crawl_nested_courses: bool = False,
     ) -> None:
         elements: List[IliasPageElement] = []
         # A list as variable redefinitions are not propagated to outer scopes
@@ -293,7 +306,7 @@ instance's greatest bottleneck.
             tasks: List[Awaitable[None]] = []
             for element in elements:
-                if handle := await self._handle_ilias_element(cl.path, element):
+                if handle := await self._handle_ilias_element(cl.path, element, crawl_nested_courses):
                     tasks.append(asyncio.create_task(handle))
 
             # And execute them
@@ -309,12 +322,22 @@ instance's greatest bottleneck.
         self,
         parent_path: PurePath,
         element: IliasPageElement,
+        crawl_nested_courses: bool = False
     ) -> Optional[Coroutine[Any, Any, None]]:
         # element.name might contain `/` if the crawler created nested elements,
         # so we can not sanitize it here. We trust in the output dir to thwart worst-case
         # directory escape attacks.
         element_path = PurePath(parent_path, element.name)
 
+        # This is symptomatic of no access to the element, for example, because
+        # of time availability restrictions.
+        if "cmdClass=ilInfoScreenGUI" in element.url and "cmd=showSummary" in element.url:
+            log.explain(
+                "Skipping element as url points to info screen, "
+                "this should only happen with not-yet-released elements"
+            )
+            return None
+
         if element.type in _VIDEO_ELEMENTS:
             if not self._videos:
                 log.status(
@@ -402,17 +425,29 @@ instance's greatest bottleneck.
             )
             return None
         elif element.type == IliasElementType.COURSE:
+            if crawl_nested_courses:
+                return await self._handle_ilias_page(element.url, element, element_path)
             log.status(
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](not descending into linked course, download it separately)"
+                "[bright_black](not descending into linked course)"
+            )
+            return None
+        elif element.type == IliasElementType.WIKI:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](wikis are not currently supported)"
             )
             return None
         elif element.type == IliasElementType.LEARNING_MODULE:
             return await self._handle_learning_module(element, element_path)
         elif element.type == IliasElementType.LINK:
             return await self._handle_link(element, element_path)
+        elif element.type == IliasElementType.LINK_COLLECTION:
+            return await self._handle_link(element, element_path)
         elif element.type == IliasElementType.BOOKING:
             return await self._handle_booking(element, element_path)
         elif element.type == IliasElementType.OPENCAST_VIDEO:
@@ -438,44 +473,97 @@ instance's greatest bottleneck.
         log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
         log.explain(f"Links type is {self._links}")
 
-        link_template_maybe = self._links.template()
-        link_extension = self._links.extension()
-        if not link_template_maybe or not link_extension:
+        export_url = url_set_query_param(element.url, "cmd", "exportHTML")
+        resolved = await self._resolve_link_target(export_url)
+        if resolved == "none":
+            links = [LinkData(element.name, "", element.description or "")]
+        else:
+            links = self._parse_link_content(element, cast(BeautifulSoup, resolved))
+
+        maybe_extension = self._links.extension()
+
+        if not maybe_extension:
             log.explain("Answer: No")
             return None
         else:
             log.explain("Answer: Yes")
-        element_path = element_path.with_name(element_path.name + link_extension)
 
-        maybe_dl = await self.download(element_path, mtime=element.mtime)
-        if not maybe_dl:
+        if len(links) <= 1 or self._links.collection_as_one():
+            element_path = element_path.with_name(element_path.name + maybe_extension)
+            maybe_dl = await self.download(element_path, mtime=element.mtime)
+            if not maybe_dl:
+                return None
+            return self._download_link(self._links, element.name, links, maybe_dl)
+
+        maybe_cl = await self.crawl(element_path)
+        if not maybe_cl:
             return None
+        # Required for download_all closure
+        cl = maybe_cl
+        extension = maybe_extension
 
-        return self._download_link(element, link_template_maybe, maybe_dl)
+        async def download_all() -> None:
+            for link in links:
+                path = cl.path / (_sanitize_path_name(link.name) + extension)
+                if dl := await self.download(path, mtime=element.mtime):
+                    await self._download_link(self._links, element.name, [link], dl)
+
+        return download_all()
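Aside on the `# Required for download_all closure` lines: `maybe_cl` and `maybe_extension` are `Optional`, and mypy's narrowing from the early returns does not extend into a nested function body (the variable could in principle be reassigned before the closure runs), so the narrowed values are aliased to fresh non-`Optional` locals first. A minimal sketch of the pattern, with illustrative names:

```python
from typing import Callable, Optional

def outer(maybe_value: Optional[str]) -> Optional[Callable[[], str]]:
    if maybe_value is None:
        return None
    value = maybe_value  # narrow once; the closure captures a plain `str`

    def closure() -> str:
        # Using `maybe_value` here would still type-check as Optional[str],
        # since mypy cannot prove it is not reassigned before the call.
        return value.upper()

    return closure
```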
     @anoncritical
     @_iorepeat(3, "resolving link")
-    async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
-        async with dl as (bar, sink):
-            export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
-            real_url = await self._resolve_link_target(export_url)
-            self._write_link_content(link_template, real_url, element.name, element.description, sink)
-
-    def _write_link_content(
+    async def _download_link(
         self,
-        link_template: str,
-        url: str,
-        name: str,
-        description: Optional[str],
-        sink: FileSink,
+        link_renderer: Links,
+        collection_name: str,
+        links: list[LinkData],
+        dl: DownloadToken
     ) -> None:
-        content = link_template
-        content = content.replace("{{link}}", url)
-        content = content.replace("{{name}}", name)
-        content = content.replace("{{description}}", str(description))
-        content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
-        sink.file.write(content.encode("utf-8"))
-        sink.done()
+        async with dl as (bar, sink):
+            rendered = link_renderer.interpolate(self._link_file_redirect_delay, collection_name, links)
+            sink.file.write(rendered.encode("utf-8"))
+            sink.done()
+
+    async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal['none']]:
+        async def impl() -> Optional[Union[BeautifulSoup, Literal['none']]]:
+            async with self.session.get(export_url, allow_redirects=False) as resp:
+                # No redirect means we were authenticated
+                if hdrs.LOCATION not in resp.headers:
+                    return soupify(await resp.read())  # .select_one("a").get("href").strip()  # type: ignore
+                # We are either unauthenticated or the link is not active
+                new_url = resp.headers[hdrs.LOCATION].lower()
+                if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
+                    return "none"
+                return None
+
+        auth_id = await self._current_auth_id()
+        target = await impl()
+        if target is not None:
+            return target
+
+        await self.authenticate(auth_id)
+
+        target = await impl()
+        if target is not None:
+            return target
+
+        raise CrawlError("resolve_link_target failed even after authenticating")
+
+    @staticmethod
+    def _parse_link_content(element: IliasPageElement, content: BeautifulSoup) -> list[LinkData]:
+        links = cast(list[Tag], list(content.select("a")))
+        if len(links) == 1:
+            url = str(links[0].get("href")).strip()
+            return [LinkData(name=element.name, description=element.description or "", url=url)]
+
+        results = []
+        for link in links:
+            url = str(link.get("href")).strip()
+            name = link.get_text(strip=True)
+            description = cast(Tag, link.find_next_sibling("dd")).get_text(strip=True)
+            results.append(LinkData(name=name, description=description, url=url.strip()))
+
+        return results
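`_parse_link_content` implies a particular shape for the exported page: one anchor per link, each description in a `<dd>` that directly follows the anchor as a sibling (otherwise `find_next_sibling("dd")` would find nothing). The actual export markup is not shown in this diff; a guessed sample, round-tripped through the same selector logic:

```python
from bs4 import BeautifulSoup

# Assumed markup, shaped to satisfy the selectors used above.
sample_export = """
<dl>
  <a href="https://example.com/lecture">Lecture site</a>
  <dd>Course homepage</dd>
  <a href="https://example.com/slides">Slides</a>
  <dd>Weekly slides</dd>
</dl>
"""

soup = BeautifulSoup(sample_export, "html.parser")
for link in soup.select("a"):
    name = link.get_text(strip=True)
    url = str(link.get("href")).strip()
    description = link.find_next_sibling("dd").get_text(strip=True)
    print(name, url, description)
# Lecture site https://example.com/lecture Course homepage
# Slides https://example.com/slides Weekly slides
```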
     async def _handle_booking(
         self,
@@ -500,7 +588,7 @@ instance's greatest bottleneck.
 
         self._ensure_not_seen(element, element_path)
 
-        return self._download_booking(element, link_template_maybe, maybe_dl)
+        return self._download_booking(element, maybe_dl)
 
     @anoncritical
     @_iorepeat(1, "downloading description")
@@ -521,36 +609,13 @@ instance's greatest bottleneck.
     async def _download_booking(
         self,
         element: IliasPageElement,
-        link_template: str,
         dl: DownloadToken,
     ) -> None:
         async with dl as (bar, sink):
-            self._write_link_content(link_template, element.url, element.name, element.description, sink)
-
-    async def _resolve_link_target(self, export_url: str) -> str:
-        async def impl() -> Optional[str]:
-            async with self.session.get(export_url, allow_redirects=False) as resp:
-                # No redirect means we were authenticated
-                if hdrs.LOCATION not in resp.headers:
-                    return soupify(await resp.read()).select_one("a").get("href").strip()  # type: ignore
-                # We are either unauthenticated or the link is not active
-                new_url = resp.headers[hdrs.LOCATION].lower()
-                if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
-                    return ""
-                return None
-
-        auth_id = await self._current_auth_id()
-        target = await impl()
-        if target is not None:
-            return target
-
-        await self.authenticate(auth_id)
-
-        target = await impl()
-        if target is not None:
-            return target
-
-        raise CrawlError("resolve_link_target failed even after authenticating")
+            links = [LinkData(name=element.name, description=element.description or "", url=element.url)]
+            rendered = self._links.interpolate(self._link_file_redirect_delay, element.name, links)
+            sink.file.write(rendered.encode("utf-8"))
+            sink.done()
 
     async def _handle_opencast_video(
         self,
@@ -759,70 +824,23 @@ instance's greatest bottleneck.
     @_iorepeat(3, "crawling forum")
     @anoncritical
     async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None:
-        elements: List[IliasForumThread] = []
-
         async with cl:
-            next_stage_url = element.url
-            page = None
-
-            while next_stage_url:
-                log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
-                log.explain(f"URL: {next_stage_url}")
-
-                soup = await self._get_page(next_stage_url)
-                page = IliasPage(soup, element)
-
-                if next := page.get_next_stage_element():
-                    next_stage_url = next.url
-                else:
-                    break
-
-            forum_threads: list[tuple[IliasPageElement, bool]] = []
-            for entry in cast(IliasPage, page).get_forum_entries():
-                path = cl.path / (_sanitize_path_name(entry.name) + ".html")
-                forum_threads.append((entry, self.should_try_download(path, mtime=entry.mtime)))
-
-            # Sort the ids. The forum download will *preserve* this ordering
-            forum_threads.sort(key=lambda elem: elem[0].id())
-
-            if not forum_threads:
-                log.explain("Forum had no threads")
+            inner = IliasPage(await self._get_page(element.url), element)
+            export_url = inner.get_forum_export_url()
+            if not export_url:
+                log.warn("Could not extract forum export url")
                 return
 
-            download_data = cast(IliasPage, page).get_download_forum_data(
-                [thread.id() for thread, download in forum_threads if download]
-            )
-            if not download_data:
-                raise CrawlWarning("Failed to extract forum data")
-
-            if not download_data.empty:
-                html = await self._post_authenticated(download_data.url, download_data.form_data)
-                elements = parse_ilias_forum_export(soupify(html))
-            else:
-                elements = []
-
-            # Verify that ILIAS does not change the order, as we depend on it later. Otherwise, we could not call
-            # download in the correct order, potentially messing up duplication handling.
-            expected_element_titles = [thread.name for thread, download in forum_threads if download]
-            actual_element_titles = [_sanitize_path_name(thread.name) for thread in elements]
-            if expected_element_titles != actual_element_titles:
-                raise CrawlWarning(
-                    f"Forum thread order mismatch: {expected_element_titles} != {actual_element_titles}"
-                )
+            export = await self._post(export_url, {
+                "format": "html",
+                "cmd[createExportFile]": ""
+            })
+
+            elements = parse_ilias_forum_export(soupify(export))
 
             tasks: List[Awaitable[None]] = []
-            for thread, download in forum_threads:
-                if download:
-                    # This only works because ILIAS keeps the order in the export
-                    elem = elements.pop(0)
-                    tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem, thread)))
-                else:
-                    # We only downloaded the threads we "should_try_download"ed. This can be an
-                    # over-approximation and all will be fine.
-                    # If we selected too few, e.g. because there was a duplicate title and the mtime of the
-                    # original is newer than the update of the duplicate.
-                    # This causes stale data locally, but I consider this problem acceptable right now.
-                    tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, thread)))
+            for thread in elements:
+                tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, element.url)))
 
             # And execute them
             await self.gather(tasks)
@@ -833,7 +851,7 @@ instance's greatest bottleneck.
         self,
         parent_path: PurePath,
         thread: Union[IliasForumThread, IliasPageElement],
-        element: IliasPageElement
+        forum_url: str
     ) -> None:
         path = parent_path / (_sanitize_path_name(thread.name) + ".html")
         maybe_dl = await self.download(path, mtime=thread.mtime)
@@ -843,7 +861,7 @@ instance's greatest bottleneck.
         async with maybe_dl as (bar, sink):
             rendered = forum_thread_template(
                 thread.name,
-                element.url,
+                forum_url,
                 thread.name_tag,
                 await self.internalize_images(thread.content_tag)
             )
@@ -1021,29 +1039,19 @@ instance's greatest bottleneck.
         )
         return soup
 
-    async def _post_authenticated(
+    async def _post(
         self,
         url: str,
         data: dict[str, Union[str, List[str]]]
     ) -> bytes:
-        auth_id = await self._current_auth_id()
-
         form_data = aiohttp.FormData()
         for key, val in data.items():
             form_data.add_field(key, val)
 
-        async with self.session.post(url, data=form_data(), allow_redirects=False) as request:
+        async with self.session.post(url, data=form_data()) as request:
             if request.status == 200:
                 return await request.read()
-
-        # We weren't authenticated, so try to do that
-        await self.authenticate(auth_id)
-
-        # Retry once after authenticating. If this fails, we will die.
-        async with self.session.post(url, data=data, allow_redirects=False) as request:
-            if request.status == 200:
-                return await request.read()
-        raise CrawlError("post_authenticated failed even after authenticating")
+            raise CrawlError(f"post failed with status {request.status}")
 
     async def _get_authenticated(self, url: str) -> bytes:
         auth_id = await self._current_auth_id()
@@ -1073,7 +1081,7 @@ instance's greatest bottleneck.
         async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request:
             login_page = soupify(await request.read())
 
-        login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "formlogin"}))
+        login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "login_form"}))
         if login_form is None:
             raise CrawlError("Could not find the login form! Specified client id might be invalid.")
 
@@ -1083,14 +1091,12 @@ instance's greatest bottleneck.
         username, password = await self._auth.credentials()
 
-        login_data = {
-            "username": username,
-            "password": password,
-            "cmd[doStandardAuthentication]": "Login",
-        }
+        login_form_data = aiohttp.FormData()
+        login_form_data.add_field('login_form/input_3/input_4', username)
+        login_form_data.add_field('login_form/input_3/input_5', password)
 
         # do the actual login
-        async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request:
+        async with self.session.post(urljoin(self._base_url, login_url), data=login_form_data) as request:
             soup = IliasSoup(soupify(await request.read()), str(request.url))
             if not IliasPage.is_logged_in(soup):
                 self._auth.invalidate_credentials()

==================== changed file ====================

@@ -97,7 +97,8 @@ class IliasElementType(Enum):
     BOOKING = "booking"
     COURSE = "course"
     DCL_RECORD_LIST = "dcl_record_list"
-    EXERCISE = "exercise"
+    EXERCISE_OVERVIEW = "exercise_overview"
+    EXERCISE = "exercise"  # own submitted files
     EXERCISE_FILES = "exercise_files"  # own submitted files
     FILE = "file"
     FOLDER = "folder"
@@ -108,6 +109,7 @@ class IliasElementType(Enum):
     LEARNING_MODULE_HTML = "learning_module_html"
     LITERATURE_LIST = "literature_list"
     LINK = "link"
+    LINK_COLLECTION = "link_collection"
     MEDIA_POOL = "media_pool"
     MEDIACAST_VIDEO = "mediacast_video"
     MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
@@ -120,6 +122,7 @@ class IliasElementType(Enum):
     SCORM_LEARNING_MODULE = "scorm_learning_module"
     SURVEY = "survey"
     TEST = "test"  # an online test. Will be ignored currently.
+    WIKI = "wiki"
 
     def matcher(self) -> IliasElementMatcher:
         match self:
@@ -140,13 +143,15 @@ class IliasElementType(Enum):
                     TypeMatcher.query("cmdclass=ildclrecordlistgui")
                 )
             case IliasElementType.EXERCISE:
+                return TypeMatcher.never()
+            case IliasElementType.EXERCISE_FILES:
+                return TypeMatcher.never()
+            case IliasElementType.EXERCISE_OVERVIEW:
                 return TypeMatcher.any(
                     TypeMatcher.path("/exc/"),
                     TypeMatcher.path("_exc_"),
                     TypeMatcher.img_src("_exc.svg"),
                 )
-            case IliasElementType.EXERCISE_FILES:
-                return TypeMatcher.never()
             case IliasElementType.FILE:
                 return TypeMatcher.any(
                     TypeMatcher.query("cmd=sendfile"),
@@ -198,7 +203,12 @@ class IliasElementType(Enum):
                         TypeMatcher.query("baseclass=illinkresourcehandlergui"),
                         TypeMatcher.query("calldirectlink"),
                     ),
-                    TypeMatcher.img_src("_webr.svg")
+                    TypeMatcher.img_src("_webr.svg")  # duplicated :(
+                )
+            case IliasElementType.LINK_COLLECTION:
+                return TypeMatcher.any(
+                    TypeMatcher.query("baseclass=illinkresourcehandlergui"),
+                    TypeMatcher.img_src("_webr.svg")  # duplicated :(
                 )
             case IliasElementType.MEDIA_POOL:
                 return TypeMatcher.any(
@@ -243,6 +253,11 @@ class IliasElementType(Enum):
                     TypeMatcher.query("cmdclass=iltestscreengui"),
                     TypeMatcher.img_src("_tst.svg")
                 )
+            case IliasElementType.WIKI:
+                return TypeMatcher.any(
+                    TypeMatcher.query("baseClass=ilwikihandlergui"),
+                    TypeMatcher.img_src("wiki.svg")
+                )
 
         raise CrawlWarning(f"Unknown matcher {self}")
@@ -271,6 +286,7 @@ class IliasPageElement:
         r"mcst/(?P<id>\d+)",  # mediacast
         r"pg/(?P<id>(\d|_)+)",  # page?
         r"svy/(?P<id>\d+)",  # survey
+        r"sess/(?P<id>\d+)",  # session
         r"webr/(?P<id>\d+)",  # web referene (link)
         r"thr_pk=(?P<id>\d+)",  # forums
         r"ref_id=(?P<id>\d+)",
@@ -489,79 +505,31 @@ class IliasPage:
             return url
         return None
 
-    def get_forum_entries(self) -> list[IliasPageElement]:
-        form = self._get_forum_form()
-        if not form:
-            return []
-        threads = []
-
-        for row in cast(list[Tag], form.select("table > tbody > tr")):
-            url_tag = cast(
-                Optional[Tag],
-                row.find(name="a", attrs={"href": lambda x: x is not None and "cmd=viewthread" in x.lower()})
-            )
-            if url_tag is None:
-                log.explain(f"Skipping row without URL: {row}")
-                continue
-            name = url_tag.get_text().strip()
-            columns = [td.get_text().strip() for td in cast(list[Tag], row.find_all(name="td"))]
-            potential_dates_opt = [IliasPage._find_date_in_text(column) for column in columns]
-            potential_dates = [x for x in potential_dates_opt if x is not None]
-            mtime = max(potential_dates) if potential_dates else None
-
-            threads.append(IliasPageElement.create_new(
-                IliasElementType.FORUM_THREAD,
-                self._abs_url_from_link(url_tag),
-                name,
-                mtime=mtime
-            ))
-
-        return threads
-
-    def get_download_forum_data(self, thread_ids: list[str]) -> Optional[IliasDownloadForumData]:
-        form = cast(Optional[Tag], self._soup.find(
-            "form",
-            attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x}
-        ))
-        if not form:
-            return None
-        post_url = self._abs_url_from_relative(cast(str, form["action"]))
-
-        log.explain(f"Fetching forum threads {thread_ids}")
-
-        form_data: Dict[str, Union[str, list[str]]] = {
-            "thread_ids[]": cast(list[str], thread_ids),
-            "selected_cmd2": "html",
-            "select_cmd2": "Ausführen",
-            "selected_cmd": "",
-        }
-
-        return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0)
-
-    def _get_forum_form(self) -> Optional[Tag]:
-        return cast(Optional[Tag], self._soup.find(
-            "form",
-            attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x}
-        ))
+    def get_forum_export_url(self) -> Optional[str]:
+        forum_link = self._soup.select_one("#tab_forums_threads > a")
+        if not forum_link:
+            log.explain("Found no forum link")
+            return None
+
+        base_url = self._abs_url_from_link(forum_link)
+        base_url = re.sub(r"cmd=\w+", "cmd=post", base_url)
+        base_url = re.sub(r"cmdClass=\w+", "cmdClass=ilExportGUI", base_url)
+
+        rtoken_form = cast(
+            Optional[Tag],
+            self._soup.find("form", attrs={"action": lambda x: x is not None and "rtoken=" in x})
+        )
+        if not rtoken_form:
+            log.explain("Found no rtoken anywhere")
+            return None
+        match = cast(re.Match[str], re.search(r"rtoken=(\w+)", str(rtoken_form.attrs["action"])))
+        rtoken = match.group(1)
+
+        base_url = base_url + "&rtoken=" + rtoken
+
+        return base_url
 
     def get_next_stage_element(self) -> Optional[IliasPageElement]:
-        if self._is_forum_page():
-            if "trows=" in self._page_url:
-                log.explain("Manual row override detected, accepting it as good")
-                return None
-            log.explain("Requesting *all* forum threads")
-            thread_count = self._get_forum_thread_count()
-            if thread_count is not None and thread_count > 400:
-                log.warn(
-                    "Forum has more than 400 threads, fetching all threads will take a while. "
-                    "You might need to adjust your http_timeout config option."
-                )
-            # Fetch at least 400 in case we detect it wrong
-            if thread_count is not None and thread_count < 400:
-                thread_count = 400
-            return self._get_show_max_forum_entries_per_page_url(thread_count)
         if self._is_ilias_opencast_embedding():
             log.explain("Unwrapping opencast embedding")
             return self.get_child_elements()[0]
@@ -571,6 +539,8 @@ class IliasPage:
         if self._contains_collapsed_future_meetings():
             log.explain("Requesting *all* future meetings")
             return self._uncollapse_future_meetings_url()
+        if self._is_exercise_not_all_shown():
+            return self._show_all_exercises()
         if not self._is_content_tab_selected():
             if self._page_type != IliasElementType.INFO_TAB:
                 log.explain("Selecting content tab")
@@ -579,11 +549,6 @@ class IliasPage:
                 log.explain("Crawling info tab, skipping content select")
             return None
 
-    def _is_forum_page(self) -> bool:
-        if perma_link := self.get_permalink():
-            return "/frm/" in perma_link
-        return False
-
     def _is_video_player(self) -> bool:
         return "paella_config_file" in str(self._soup)
 
@@ -607,7 +572,7 @@ class IliasPage:
     def _is_exercise_file(self) -> bool:
         # we know it from before
-        if self._page_type == IliasElementType.EXERCISE:
+        if self._page_type == IliasElementType.EXERCISE_OVERVIEW:
             return True
 
         # We have no suitable parent - let's guesss
@@ -644,6 +609,17 @@ class IliasPage:
         link = self._abs_url_from_link(element)
         return IliasPageElement.create_new(IliasElementType.FOLDER, link, "show all meetings")
 
+    def _is_exercise_not_all_shown(self) -> bool:
+        return (self._page_type == IliasElementType.EXERCISE_OVERVIEW
+                and "mode=all" not in self._page_url.lower())
+
+    def _show_all_exercises(self) -> Optional[IliasPageElement]:
+        return IliasPageElement.create_new(
+            IliasElementType.EXERCISE_OVERVIEW,
+            self._page_url + "&mode=all",
+            "show all exercises"
+        )
+
     def _is_content_tab_selected(self) -> bool:
         return self._select_content_page_url() is None
@@ -909,15 +885,62 @@ class IliasPage:
     def _find_exercise_entries(self) -> list[IliasPageElement]:
         if self._soup.find(id="tab_submission"):
-            log.explain("Found submission tab. This is an exercise detail page")
-            return self._find_exercise_entries_detail_page()
+            log.explain("Found submission tab. This is an exercise detail or files page")
+            if self._soup.select_one("#tab_submission.active") is None:
+                log.explain(" This is a details page")
+                return self._find_exercise_entries_detail_page()
+            else:
+                log.explain(" This is a files page")
+                return self._find_exercise_entries_files_page()
+
         log.explain("Found no submission tab. This is an exercise root page")
         return self._find_exercise_entries_root_page()
 
     def _find_exercise_entries_detail_page(self) -> list[IliasPageElement]:
         results: list[IliasPageElement] = []
 
-        # Find all download links in the container (this will contain all the files)
+        if link := cast(Optional[Tag], self._soup.select_one("#tab_submission > a")):
+            results.append(IliasPageElement.create_new(
+                IliasElementType.EXERCISE_FILES,
+                self._abs_url_from_link(link),
+                "Submission"
+            ))
+        else:
+            log.explain("Found no submission link for exercise, maybe it has not started yet?")
+
+        # Find all download links in the container (this will contain all the *feedback* files)
+        download_links = cast(list[Tag], self._soup.find_all(
+            name="a",
+            # download links contain the given command class
+            attrs={"href": lambda x: x is not None and "cmd=download" in x},
+            text="Download"
+        ))
+
+        for link in download_links:
+            parent_row: Tag = cast(Tag, link.find_parent(
+                attrs={"class": lambda x: x is not None and "row" in x}))
+            name_tag = cast(Optional[Tag], parent_row.find(name="div"))
+
+            if not name_tag:
+                log.warn("Could not find name tag for exercise entry")
+                _unexpected_html_warning()
+                continue
+
+            name = _sanitize_path_name(name_tag.get_text().strip())
+            log.explain(f"Found exercise detail entry {name!r}")
+
+            results.append(IliasPageElement.create_new(
+                IliasElementType.FILE,
+                self._abs_url_from_link(link),
+                name
+            ))
+
+        return results
+
+    def _find_exercise_entries_files_page(self) -> list[IliasPageElement]:
+        results: list[IliasPageElement] = []
+
+        # Find all download links in the container
         download_links = cast(list[Tag], self._soup.find_all(
             name="a",
             # download links contain the given command class
@@ -930,7 +953,7 @@ class IliasPage:
             children = cast(list[Tag], parent_row.find_all("td"))
 
             name = _sanitize_path_name(children[1].get_text().strip())
-            log.explain(f"Found exercise detail entry {name!r}")
+            log.explain(f"Found exercise file entry {name!r}")
 
             date = None
             for child in reversed(children):
@@ -938,7 +961,7 @@ class IliasPage:
                 if date is not None:
                     break
             if date is None:
-                log.warn(f"Date parsing failed for exercise entry {name!r}")
+                log.warn(f"Date parsing failed for exercise file entry {name!r}")
 
             results.append(IliasPageElement.create_new(
                 IliasElementType.FILE,
@@ -952,66 +975,32 @@ class IliasPage:
     def _find_exercise_entries_root_page(self) -> list[IliasPageElement]:
         results: list[IliasPageElement] = []
 
-        # Each assignment is in an accordion container
-        assignment_containers: list[Tag] = self._soup.select(".il_VAccordionInnerContainer")
-
-        for container in assignment_containers:
-            # Fetch the container name out of the header to use it in the path
-            container_name = cast(Tag, container.select_one(".ilAssignmentHeader")).get_text().strip()
-            log.explain(f"Found exercise container {container_name!r}")
-
-            # Find all download links in the container (this will contain all the files)
-            files = cast(list[Tag], container.find_all(
-                name="a",
-                # download links contain the given command class
-                attrs={"href": lambda x: x is not None and "cmdClass=ilexsubmissiongui" in x},
-                text="Download"
-            ))
-
-            # Grab each file as you now have the link
-            for file_link in files:
-                # Two divs, side by side. Left is the name, right is the link ==> get left
-                # sibling
-                file_name = cast(
-                    Tag,
-                    cast(Tag, file_link.parent).find_previous(name="div")
-                ).get_text().strip()
-                url = self._abs_url_from_link(file_link)
-
-                log.explain(f"Found exercise entry {file_name!r}")
-                results.append(IliasPageElement.create_new(
-                    IliasElementType.FILE,
-                    url,
-                    _sanitize_path_name(container_name) + "/" + _sanitize_path_name(file_name),
-                    mtime=None,  # We do not have any timestamp
-                    skip_sanitize=True
-                ))
-
-            # Find all links to file listings (e.g. "Submitted Files" for groups)
-            file_listings = cast(list[Tag], container.find_all(
-                name="a",
-                # download links contain the given command class
-                attrs={"href": lambda x: x is not None and "cmdclass=ilexsubmissionfilegui" in x.lower()}
-            ))
-
-            # Add each listing as a new
-            for listing in file_listings:
-                parent_container = cast(Tag, listing.find_parent(
-                    "div", attrs={"class": lambda x: x is not None and "form-group" in x}
-                ))
-                label_container = cast(Tag, parent_container.find(
-                    attrs={"class": lambda x: x is not None and "control-label" in x}
-                ))
-                file_name = label_container.get_text().strip()
-                url = self._abs_url_from_link(listing)
-                log.explain(f"Found exercise detail {file_name!r} at {url}")
-                results.append(IliasPageElement.create_new(
-                    IliasElementType.EXERCISE_FILES,
-                    url,
-                    _sanitize_path_name(container_name) + "/" + _sanitize_path_name(file_name),
-                    None,  # we do not have any timestamp
-                    skip_sanitize=True
-                ))
+        content_tab = cast(Optional[Tag], self._soup.find(id="ilContentContainer"))
+        if not content_tab:
+            log.warn("Could not find content tab in exercise overview page")
+            _unexpected_html_warning()
+            return []
+
+        exercise_links = content_tab.select(".il-item-title a")
+
+        for exercise in cast(list[Tag], exercise_links):
+            if "href" not in exercise.attrs:
+                continue
+            href = exercise.attrs["href"]
+            if type(href) is not str:
+                continue
+            if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower():
+                continue
+
+            name = _sanitize_path_name(exercise.get_text().strip())
+            results.append(IliasPageElement.create_new(
+                IliasElementType.EXERCISE,
+                self._abs_url_from_link(exercise),
+                name
+            ))
+
+        for result in results:
+            log.explain(f"Found exercise {result.name!r}")
 
         return results
@@ -1127,7 +1116,7 @@ class IliasPage:
             videos.append(IliasPageElement.create_new(
                 typ=IliasElementType.MOB_VIDEO,
-                url=self._abs_url_from_relative(url),
+                url=url,
                 name=_sanitize_path_name(title),
                 mtime=None
             ))
@@ -1153,6 +1142,9 @@ class IliasPage:
         else:
             title = f"unknown video {figure}"
 
+        if url:
+            url = self._abs_url_from_relative(url)
+
         return url, title
 
     def _is_in_expanded_meeting(self, tag: Tag) -> bool:

==================== changed file ====================

@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.8.0"
+VERSION = "3.8.3"