Compare commits

..

3 Commits

SHA1 Message Date
7291382430 Bump version to 3.8.0 2025-04-15 11:32:22 +02:00
1a430ad5d1 Update minimum Python version to 3.11 2025-04-15 11:31:39 +02:00
f6bdeb6b9d Support ILIAS 9 2025-04-15 11:19:53 +02:00
8 changed files with 310 additions and 409 deletions

View File

@@ -22,46 +22,14 @@ ambiguous situations.
## Unreleased ## Unreleased
## 3.8.3 - 2025-07-01 ## 3.8.0 - 2025-04-15
### Added
- Support for link collections.
In "fancy" mode, a single HTML file with multiple links is generated.
In all other modes, PFERD creates a folder for the collection and a new file
for every link inside.
### Fixed
- Crawling of exercises with instructions
- Don't download unavailable elements.
Elements that are unavailable (for example, because their availability is
time restricted) will not download the HTML for the info page anymore.
- `base_url` argument for `ilias-web` crawler causing crashes
## 3.8.2 - 2025-04-29
### Changed
- Explicitly mention that wikis are not supported at the moment and ignore them
### Fixed
- Ilias-native login
- Exercise crawling
## 3.8.1 - 2025-04-17
### Fixed
- Description HTML files now specify a UTF-8 encoding
- Images in descriptions now always have a white background
## 3.8.0 - 2025-04-16
### Added ### Added
- Support for ILIAS 9 - Support for ILIAS 9
### Changed ### Changed
- Added prettier CSS to forum threads - Added prettier CSS to forum threads
- Downloaded forum threads now link to the forum instead of the ILIAS thread
- Increase minimum supported Python version to 3.11 - Increase minimum supported Python version to 3.11
- Do not crawl nested courses (courses linked in other courses)
### Fixed ### Fixed
- File links in report on Windows - File links in report on Windows
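
The link-collection behaviour described in the 3.8.3 entry above can be sketched in a few lines. This is an illustration only; `save_collection` is a hypothetical helper, not PFERD's API (PFERD's real rendering goes through the `Links` templates further down in this diff):

```python
from dataclasses import dataclass
from pathlib import Path


@dataclass
class LinkData:
    name: str
    url: str


def save_collection(target: Path, links: list[LinkData], fancy: bool) -> None:
    if fancy:
        # "fancy" mode: one HTML file holding every link of the collection
        rows = "\n".join(f'<a href="{link.url}">{link.name}</a>' for link in links)
        target.with_suffix(".html").write_text(f"<html><body>{rows}</body></html>", encoding="utf-8")
    else:
        # all other modes: one folder for the collection, one file per link
        target.mkdir(parents=True, exist_ok=True)
        for link in links:
            (target / f"{link.name}.url").write_text(link.url, encoding="utf-8")
```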

View File

@@ -164,13 +164,12 @@ out of the box for the corresponding universities:
[ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs" [ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs"
| University | `base_url` | `login_type` | `client_id` | | University | `base_url` | `login_type` | `client_id` |
|-----------------|-----------------------------------------|--------------|---------------| |---------------|-----------------------------------------|--------------|---------------|
| FH Aachen | https://www.ili.fh-aachen.de | local | elearning | | FH Aachen | https://www.ili.fh-aachen.de | local | elearning |
| Uni Köln | https://www.ilias.uni-koeln.de/ilias | local | uk | | Uni Köln | https://www.ilias.uni-koeln.de/ilias | local | uk |
| Uni Konstanz | https://ilias.uni-konstanz.de | local | ILIASKONSTANZ | | Uni Konstanz | https://ilias.uni-konstanz.de | local | ILIASKONSTANZ |
| Uni Stuttgart | https://ilias3.uni-stuttgart.de | local | Uni_Stuttgart | | Uni Stuttgart | https://ilias3.uni-stuttgart.de | local | Uni_Stuttgart |
| Uni Tübingen | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth | | | Uni Tübingen | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth | |
| KIT ILIAS Pilot | https://pilot.ilias.studium.kit.edu | shibboleth | pilot |
If your university isn't listed, try navigating to your instance's login page. If your university isn't listed, try navigating to your instance's login page.
Assuming no custom login service is used, the URL will look something like this: Assuming no custom login service is used, the URL will look something like this:
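
Since the example URL itself is cut off in this view: for a local-login instance, the login page URL typically carries the client id as a query parameter. A hedged sketch of reading `base_url` and `client_id` off such a URL (the URL shape is an assumption based on the table above):

```python
from urllib.parse import parse_qs, urlsplit


def split_login_url(login_url: str) -> tuple[str, str]:
    """Derive (base_url, client_id) from an ILIAS login page URL."""
    parts = urlsplit(login_url)
    base_url = f"{parts.scheme}://{parts.netloc}{parts.path.removesuffix('/login.php')}"
    client_id = parse_qs(parts.query).get("client_id", [""])[0]
    return base_url, client_id


# e.g. split_login_url("https://ilias.example.edu/login.php?client_id=example&cmd=force_login")
# -> ("https://ilias.example.edu", "example")
```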

View File

@@ -45,8 +45,8 @@ def load(
load_crawler(args, section) load_crawler(args, section)
section["type"] = COMMAND_NAME section["type"] = COMMAND_NAME
if args.base_url is not None: if args.ilias_url is not None:
section["base_url"] = args.base_url section["base_url"] = args.ilias_url
if args.client_id is not None: if args.client_id is not None:
section["client_id"] = args.client_id section["client_id"] = args.client_id

View File

@@ -1,5 +1,3 @@
import dataclasses
import re
from enum import Enum from enum import Enum
from typing import Optional, cast from typing import Optional, cast
@@ -14,9 +12,7 @@ _link_template_fancy = """
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<title>ILIAS - Link: {{name}}</title> <title>ILIAS - Link: {{name}}</title>
<!-- REPEAT REMOVE START -->
<meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" /> <meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" />
<!-- REPEAT REMOVE END -->
</head> </head>
<style> <style>
@@ -27,8 +23,6 @@ _link_template_fancy = """
display: flex; display: flex;
align-items: center; align-items: center;
justify-content: center; justify-content: center;
flex-direction: column;
gap: 4px;
} }
body { body {
padding: 0; padding: 0;
@@ -37,15 +31,10 @@ _link_template_fancy = """
font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif; font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif;
height: 100vh; height: 100vh;
} }
.column {
min-width: 500px;
max-width: 90vw;
display: flex;
flex-direction: column;
row-gap: 5px;
}
.row { .row {
background-color: white; background-color: white;
min-width: 500px;
max-width: 90vw;
display: flex; display: flex;
padding: 1em; padding: 1em;
} }
@@ -86,8 +75,6 @@ _link_template_fancy = """
} }
</style> </style>
<body class="center-flex"> <body class="center-flex">
<div class="column">
<!-- REPEAT START -->
<div class="row"> <div class="row">
<div class="logo center-flex"> <div class="logo center-flex">
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24"> <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
@@ -102,8 +89,6 @@ _link_template_fancy = """
</div> </div>
<div class="menu-button center-flex"> ⯆ </div> <div class="menu-button center-flex"> ⯆ </div>
</div> </div>
<!-- REPEAT END -->
</div>
</body> </body>
</html> </html>
""".strip() # noqa: E501 line too long """.strip() # noqa: E501 line too long
@@ -270,13 +255,6 @@ def forum_thread_template(name: str, url: str, heading: bs4.Tag, content: bs4.Ta
.replace("{{content}}", cast(str, content.prettify())) .replace("{{content}}", cast(str, content.prettify()))
@dataclasses.dataclass
class LinkData:
name: str
url: str
description: str
class Links(Enum): class Links(Enum):
IGNORE = "ignore" IGNORE = "ignore"
PLAINTEXT = "plaintext" PLAINTEXT = "plaintext"
@@ -294,11 +272,6 @@ class Links(Enum):
return None return None
raise ValueError("Missing switch case") raise ValueError("Missing switch case")
def collection_as_one(self) -> bool:
if self == Links.FANCY:
return True
return False
def extension(self) -> Optional[str]: def extension(self) -> Optional[str]:
if self == Links.FANCY: if self == Links.FANCY:
return ".html" return ".html"
@@ -310,48 +283,10 @@ class Links(Enum):
return None return None
raise ValueError("Missing switch case") raise ValueError("Missing switch case")
def interpolate(self, redirect_delay: int, collection_name: str, links: list[LinkData]) -> str:
template = self.template()
if template is None:
raise ValueError("Cannot interpolate ignored links")
if len(links) == 1:
link = links[0]
content = template
content = content.replace("{{link}}", link.url)
content = content.replace("{{name}}", link.name)
content = content.replace("{{description}}", link.description)
content = content.replace("{{redirect_delay}}", str(redirect_delay))
return content
if self == Links.PLAINTEXT or self == Links.INTERNET_SHORTCUT:
return "\n".join(f"{link.url}" for link in links)
# All others get coerced to fancy
content = cast(str, Links.FANCY.template())
repeated_content = cast(
re.Match[str],
re.search(r"<!-- REPEAT START -->([\s\S]+)<!-- REPEAT END -->", content)
).group(1)
parts = []
for link in links:
instance = repeated_content
instance = instance.replace("{{link}}", link.url)
instance = instance.replace("{{name}}", link.name)
instance = instance.replace("{{description}}", link.description)
instance = instance.replace("{{redirect_delay}}", str(redirect_delay))
parts.append(instance)
content = content.replace(repeated_content, "\n".join(parts))
content = content.replace("{{name}}", collection_name)
content = re.sub(r"<!-- REPEAT REMOVE START -->[\s\S]+<!-- REPEAT REMOVE END -->", "", content)
return content
@staticmethod @staticmethod
def from_string(string: str) -> "Links": def from_string(string: str) -> "Links":
try: try:
return Links(string) return Links(string)
except ValueError: except ValueError:
options = [f"'{option.value}'" for option in Links] raise ValueError("must be one of 'ignore', 'plaintext',"
raise ValueError(f"must be one of {', '.join(options)}") " 'html', 'internet-shortcut'")

View File

@@ -39,10 +39,6 @@ _STYLE_TAG_CONTENT = """
margin: 0.5rem 0; margin: 0.5rem 0;
} }
img {
background-color: white;
}
body { body {
padding: 1em; padding: 1em;
grid-template-columns: 1fr min(60rem, 90%) 1fr; grid-template-columns: 1fr min(60rem, 90%) 1fr;
@@ -60,11 +56,12 @@ _ARTICLE_WORTHY_CLASSES = [
def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup: def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
head = soup.new_tag("head") head = soup.new_tag("head")
soup.insert(0, head) soup.insert(0, head)
# Force UTF-8 encoding
head.append(soup.new_tag("meta", charset="utf-8"))
simplecss_link: Tag = soup.new_tag("link")
# <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css"> # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
head.append(soup.new_tag("link", rel="stylesheet", href="https://cdn.simplecss.org/simple.css")) simplecss_link["rel"] = "stylesheet"
simplecss_link["href"] = "https://cdn.simplecss.org/simple.css"
head.append(simplecss_link)
# Basic style tags for compat # Basic style tags for compat
style: Tag = soup.new_tag("style") style: Tag = soup.new_tag("style")
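
Both sides of the hunk inject a `<head>` at the top of the soup; the newer side additionally forces UTF-8 (the 3.8.1 changelog fix above). A small usage sketch, requiring `beautifulsoup4`, with illustrative input HTML:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup("<h1>Course description</h1><p>...</p>", "html.parser")

head = soup.new_tag("head")
soup.insert(0, head)
head.append(soup.new_tag("meta", charset="utf-8"))  # force UTF-8, as on the newer side
head.append(soup.new_tag("link", rel="stylesheet", href="https://cdn.simplecss.org/simple.css"))

print(soup.prettify())  # the head now precedes the original markup
```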

View File

@@ -19,7 +19,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection from ..http_crawler import HttpCrawler, HttpCrawlerSection
from .async_helper import _iorepeat from .async_helper import _iorepeat
from .file_templates import LinkData, Links, forum_thread_template, learning_module_template from .file_templates import Links, forum_thread_template, learning_module_template
from .ilias_html_cleaner import clean, insert_base_markup from .ilias_html_cleaner import clean, insert_base_markup
from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export) IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
@@ -107,7 +107,6 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
_DIRECTORY_PAGES: Set[IliasElementType] = { _DIRECTORY_PAGES: Set[IliasElementType] = {
IliasElementType.EXERCISE, IliasElementType.EXERCISE,
IliasElementType.EXERCISE_FILES, IliasElementType.EXERCISE_FILES,
IliasElementType.EXERCISE_OVERVIEW,
IliasElementType.FOLDER, IliasElementType.FOLDER,
IliasElementType.INFO_TAB, IliasElementType.INFO_TAB,
IliasElementType.MEDIACAST_VIDEO_FOLDER, IliasElementType.MEDIACAST_VIDEO_FOLDER,
@@ -217,19 +216,11 @@ instance's greatest bottleneck.
async def _crawl_desktop(self) -> None: async def _crawl_desktop(self) -> None:
await self._crawl_url( await self._crawl_url(
urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show"), urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show")
crawl_nested_courses=True
) )
async def _crawl_url( async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
self, if awaitable := await self._handle_ilias_page(url, None, PurePath("."), expected_id):
url: str,
expected_id: Optional[int] = None,
crawl_nested_courses: bool = False
) -> None:
if awaitable := await self._handle_ilias_page(
url, None, PurePath("."), expected_id, crawl_nested_courses
):
await awaitable await awaitable
async def _handle_ilias_page( async def _handle_ilias_page(
@@ -238,7 +229,6 @@ instance's greatest bottleneck.
current_element: Optional[IliasPageElement], current_element: Optional[IliasPageElement],
path: PurePath, path: PurePath,
expected_course_id: Optional[int] = None, expected_course_id: Optional[int] = None,
crawl_nested_courses: bool = False
) -> Optional[Coroutine[Any, Any, None]]: ) -> Optional[Coroutine[Any, Any, None]]:
maybe_cl = await self.crawl(path) maybe_cl = await self.crawl(path)
if not maybe_cl: if not maybe_cl:
@@ -246,9 +236,7 @@ instance's greatest bottleneck.
if current_element: if current_element:
self._ensure_not_seen(current_element, path) self._ensure_not_seen(current_element, path)
return self._crawl_ilias_page( return self._crawl_ilias_page(url, current_element, maybe_cl, expected_course_id)
url, current_element, maybe_cl, expected_course_id, crawl_nested_courses
)
@anoncritical @anoncritical
async def _crawl_ilias_page( async def _crawl_ilias_page(
@@ -257,7 +245,6 @@ instance's greatest bottleneck.
current_element: Optional[IliasPageElement], current_element: Optional[IliasPageElement],
cl: CrawlToken, cl: CrawlToken,
expected_course_id: Optional[int] = None, expected_course_id: Optional[int] = None,
crawl_nested_courses: bool = False,
) -> None: ) -> None:
elements: List[IliasPageElement] = [] elements: List[IliasPageElement] = []
# A list as variable redefinitions are not propagated to outer scopes # A list as variable redefinitions are not propagated to outer scopes
@@ -306,7 +293,7 @@ instance's greatest bottleneck.
tasks: List[Awaitable[None]] = [] tasks: List[Awaitable[None]] = []
for element in elements: for element in elements:
if handle := await self._handle_ilias_element(cl.path, element, crawl_nested_courses): if handle := await self._handle_ilias_element(cl.path, element):
tasks.append(asyncio.create_task(handle)) tasks.append(asyncio.create_task(handle))
# And execute them # And execute them
@@ -322,22 +309,12 @@ instance's greatest bottleneck.
self, self,
parent_path: PurePath, parent_path: PurePath,
element: IliasPageElement, element: IliasPageElement,
crawl_nested_courses: bool = False
) -> Optional[Coroutine[Any, Any, None]]: ) -> Optional[Coroutine[Any, Any, None]]:
# element.name might contain `/` if the crawler created nested elements, # element.name might contain `/` if the crawler created nested elements,
# so we can not sanitize it here. We trust in the output dir to thwart worst-case # so we can not sanitize it here. We trust in the output dir to thwart worst-case
# directory escape attacks. # directory escape attacks.
element_path = PurePath(parent_path, element.name) element_path = PurePath(parent_path, element.name)
# This is symptomatic of no access to the element, for example, because
# of time availability restrictions.
if "cmdClass=ilInfoScreenGUI" in element.url and "cmd=showSummary" in element.url:
log.explain(
"Skipping element as url points to info screen, "
"this should only happen with not-yet-released elements"
)
return None
if element.type in _VIDEO_ELEMENTS: if element.type in _VIDEO_ELEMENTS:
if not self._videos: if not self._videos:
log.status( log.status(
@@ -425,29 +402,17 @@ instance's greatest bottleneck.
) )
return None return None
elif element.type == IliasElementType.COURSE: elif element.type == IliasElementType.COURSE:
if crawl_nested_courses:
return await self._handle_ilias_page(element.url, element, element_path)
log.status( log.status(
"[bold bright_black]", "[bold bright_black]",
"Ignored", "Ignored",
fmt_path(element_path), fmt_path(element_path),
"[bright_black](not descending into linked course)" "[bright_black](not descending into linked course, download it separately)"
)
return None
elif element.type == IliasElementType.WIKI:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](wikis are not currently supported)"
) )
return None return None
elif element.type == IliasElementType.LEARNING_MODULE: elif element.type == IliasElementType.LEARNING_MODULE:
return await self._handle_learning_module(element, element_path) return await self._handle_learning_module(element, element_path)
elif element.type == IliasElementType.LINK: elif element.type == IliasElementType.LINK:
return await self._handle_link(element, element_path) return await self._handle_link(element, element_path)
elif element.type == IliasElementType.LINK_COLLECTION:
return await self._handle_link(element, element_path)
elif element.type == IliasElementType.BOOKING: elif element.type == IliasElementType.BOOKING:
return await self._handle_booking(element, element_path) return await self._handle_booking(element, element_path)
elif element.type == IliasElementType.OPENCAST_VIDEO: elif element.type == IliasElementType.OPENCAST_VIDEO:
@@ -473,98 +438,45 @@ instance's greatest bottleneck.
log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
log.explain(f"Links type is {self._links}") log.explain(f"Links type is {self._links}")
export_url = url_set_query_param(element.url, "cmd", "exportHTML") link_template_maybe = self._links.template()
resolved = await self._resolve_link_target(export_url) link_extension = self._links.extension()
if resolved == "none": if not link_template_maybe or not link_extension:
links = [LinkData(element.name, "", element.description or "")]
else:
links = self._parse_link_content(element, cast(BeautifulSoup, resolved))
maybe_extension = self._links.extension()
if not maybe_extension:
log.explain("Answer: No") log.explain("Answer: No")
return None return None
else: else:
log.explain("Answer: Yes") log.explain("Answer: Yes")
element_path = element_path.with_name(element_path.name + link_extension)
if len(links) <= 1 or self._links.collection_as_one():
element_path = element_path.with_name(element_path.name + maybe_extension)
maybe_dl = await self.download(element_path, mtime=element.mtime) maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl: if not maybe_dl:
return None return None
return self._download_link(self._links, element.name, links, maybe_dl)
maybe_cl = await self.crawl(element_path) return self._download_link(element, link_template_maybe, maybe_dl)
if not maybe_cl:
return None
# Required for download_all closure
cl = maybe_cl
extension = maybe_extension
async def download_all() -> None:
for link in links:
path = cl.path / (_sanitize_path_name(link.name) + extension)
if dl := await self.download(path, mtime=element.mtime):
await self._download_link(self._links, element.name, [link], dl)
return download_all()
@anoncritical @anoncritical
@_iorepeat(3, "resolving link") @_iorepeat(3, "resolving link")
async def _download_link( async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
self,
link_renderer: Links,
collection_name: str,
links: list[LinkData],
dl: DownloadToken
) -> None:
async with dl as (bar, sink): async with dl as (bar, sink):
rendered = link_renderer.interpolate(self._link_file_redirect_delay, collection_name, links) export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
sink.file.write(rendered.encode("utf-8")) real_url = await self._resolve_link_target(export_url)
self._write_link_content(link_template, real_url, element.name, element.description, sink)
def _write_link_content(
self,
link_template: str,
url: str,
name: str,
description: Optional[str],
sink: FileSink,
) -> None:
content = link_template
content = content.replace("{{link}}", url)
content = content.replace("{{name}}", name)
content = content.replace("{{description}}", str(description))
content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
sink.file.write(content.encode("utf-8"))
sink.done() sink.done()
async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal['none']]:
async def impl() -> Optional[Union[BeautifulSoup, Literal['none']]]:
async with self.session.get(export_url, allow_redirects=False) as resp:
# No redirect means we were authenticated
if hdrs.LOCATION not in resp.headers:
return soupify(await resp.read()) # .select_one("a").get("href").strip() # type: ignore
# We are either unauthenticated or the link is not active
new_url = resp.headers[hdrs.LOCATION].lower()
if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
return "none"
return None
auth_id = await self._current_auth_id()
target = await impl()
if target is not None:
return target
await self.authenticate(auth_id)
target = await impl()
if target is not None:
return target
raise CrawlError("resolve_link_target failed even after authenticating")
@staticmethod
def _parse_link_content(element: IliasPageElement, content: BeautifulSoup) -> list[LinkData]:
links = cast(list[Tag], list(content.select("a")))
if len(links) == 1:
url = str(links[0].get("href")).strip()
return [LinkData(name=element.name, description=element.description or "", url=url)]
results = []
for link in links:
url = str(link.get("href")).strip()
name = link.get_text(strip=True)
description = cast(Tag, link.find_next_sibling("dd")).get_text(strip=True)
results.append(LinkData(name=name, description=description, url=url.strip()))
return results
async def _handle_booking( async def _handle_booking(
self, self,
element: IliasPageElement, element: IliasPageElement,
@@ -588,7 +500,7 @@ instance's greatest bottleneck.
self._ensure_not_seen(element, element_path) self._ensure_not_seen(element, element_path)
return self._download_booking(element, maybe_dl) return self._download_booking(element, link_template_maybe, maybe_dl)
@anoncritical @anoncritical
@_iorepeat(1, "downloading description") @_iorepeat(1, "downloading description")
@@ -609,13 +521,36 @@ instance's greatest bottleneck.
async def _download_booking( async def _download_booking(
self, self,
element: IliasPageElement, element: IliasPageElement,
link_template: str,
dl: DownloadToken, dl: DownloadToken,
) -> None: ) -> None:
async with dl as (bar, sink): async with dl as (bar, sink):
links = [LinkData(name=element.name, description=element.description or "", url=element.url)] self._write_link_content(link_template, element.url, element.name, element.description, sink)
rendered = self._links.interpolate(self._link_file_redirect_delay, element.name, links)
sink.file.write(rendered.encode("utf-8")) async def _resolve_link_target(self, export_url: str) -> str:
sink.done() async def impl() -> Optional[str]:
async with self.session.get(export_url, allow_redirects=False) as resp:
# No redirect means we were authenticated
if hdrs.LOCATION not in resp.headers:
return soupify(await resp.read()).select_one("a").get("href").strip() # type: ignore
# We are either unauthenticated or the link is not active
new_url = resp.headers[hdrs.LOCATION].lower()
if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
return ""
return None
auth_id = await self._current_auth_id()
target = await impl()
if target is not None:
return target
await self.authenticate(auth_id)
target = await impl()
if target is not None:
return target
raise CrawlError("resolve_link_target failed even after authenticating")
async def _handle_opencast_video( async def _handle_opencast_video(
self, self,
@@ -824,23 +759,70 @@ instance's greatest bottleneck.
@_iorepeat(3, "crawling forum") @_iorepeat(3, "crawling forum")
@anoncritical @anoncritical
async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None: async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None:
elements: List[IliasForumThread] = []
async with cl: async with cl:
inner = IliasPage(await self._get_page(element.url), element) next_stage_url = element.url
export_url = inner.get_forum_export_url() page = None
if not export_url:
log.warn("Could not extract forum export url") while next_stage_url:
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {next_stage_url}")
soup = await self._get_page(next_stage_url)
page = IliasPage(soup, element)
if next := page.get_next_stage_element():
next_stage_url = next.url
else:
break
forum_threads: list[tuple[IliasPageElement, bool]] = []
for entry in cast(IliasPage, page).get_forum_entries():
path = cl.path / (_sanitize_path_name(entry.name) + ".html")
forum_threads.append((entry, self.should_try_download(path, mtime=entry.mtime)))
# Sort the ids. The forum download will *preserve* this ordering
forum_threads.sort(key=lambda elem: elem[0].id())
if not forum_threads:
log.explain("Forum had no threads")
return return
export = await self._post(export_url, { download_data = cast(IliasPage, page).get_download_forum_data(
"format": "html", [thread.id() for thread, download in forum_threads if download]
"cmd[createExportFile]": "" )
}) if not download_data:
raise CrawlWarning("Failed to extract forum data")
elements = parse_ilias_forum_export(soupify(export)) if not download_data.empty:
html = await self._post_authenticated(download_data.url, download_data.form_data)
elements = parse_ilias_forum_export(soupify(html))
else:
elements = []
# Verify that ILIAS does not change the order, as we depend on it later. Otherwise, we could not call
# download in the correct order, potentially messing up duplication handling.
expected_element_titles = [thread.name for thread, download in forum_threads if download]
actual_element_titles = [_sanitize_path_name(thread.name) for thread in elements]
if expected_element_titles != actual_element_titles:
raise CrawlWarning(
f"Forum thread order mismatch: {expected_element_titles} != {actual_element_titles}"
)
tasks: List[Awaitable[None]] = [] tasks: List[Awaitable[None]] = []
for thread in elements: for thread, download in forum_threads:
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, element.url))) if download:
# This only works because ILIAS keeps the order in the export
elem = elements.pop(0)
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem, thread)))
else:
# We only downloaded the threads we "should_try_download"ed. This can be an
# over-approximation and all will be fine.
# If we selected too few (e.g. because there was a duplicate title and the mtime of the
# original is newer than the update of the duplicate), this causes stale data locally,
# but I consider this problem acceptable right now.
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, thread)))
# And execute them # And execute them
await self.gather(tasks) await self.gather(tasks)
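
The pairing between `forum_threads` and the parsed export above hinges on ILIAS preserving the requested thread order. A distilled sketch of that invariant check and the `pop(0)` pairing, with simplified types rather than the crawler's real signatures:

```python
def pair_threads(selected: list[tuple[str, bool]], exported: list[str]) -> list[tuple[str, str]]:
    """selected: (name, should_download) sorted by thread id; exported: names in export order."""
    expected = [name for name, download in selected if download]
    if expected != exported:
        raise RuntimeError(f"Forum thread order mismatch: {expected} != {exported}")
    queue = list(exported)
    # Only downloaded threads consume an export entry; the rest reuse local data.
    return [(name, queue.pop(0)) for name, download in selected if download]


# pair_threads([("a", True), ("b", False), ("c", True)], ["a", "c"])
# -> [("a", "a"), ("c", "c")]
```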
@@ -851,7 +833,7 @@ instance's greatest bottleneck.
self, self,
parent_path: PurePath, parent_path: PurePath,
thread: Union[IliasForumThread, IliasPageElement], thread: Union[IliasForumThread, IliasPageElement],
forum_url: str element: IliasPageElement
) -> None: ) -> None:
path = parent_path / (_sanitize_path_name(thread.name) + ".html") path = parent_path / (_sanitize_path_name(thread.name) + ".html")
maybe_dl = await self.download(path, mtime=thread.mtime) maybe_dl = await self.download(path, mtime=thread.mtime)
@@ -861,7 +843,7 @@ instance's greatest bottleneck.
async with maybe_dl as (bar, sink): async with maybe_dl as (bar, sink):
rendered = forum_thread_template( rendered = forum_thread_template(
thread.name, thread.name,
forum_url, element.url,
thread.name_tag, thread.name_tag,
await self.internalize_images(thread.content_tag) await self.internalize_images(thread.content_tag)
) )
@@ -1039,19 +1021,29 @@ instance's greatest bottleneck.
) )
return soup return soup
async def _post( async def _post_authenticated(
self, self,
url: str, url: str,
data: dict[str, Union[str, List[str]]] data: dict[str, Union[str, List[str]]]
) -> bytes: ) -> bytes:
auth_id = await self._current_auth_id()
form_data = aiohttp.FormData() form_data = aiohttp.FormData()
for key, val in data.items(): for key, val in data.items():
form_data.add_field(key, val) form_data.add_field(key, val)
async with self.session.post(url, data=form_data()) as request: async with self.session.post(url, data=form_data(), allow_redirects=False) as request:
if request.status == 200: if request.status == 200:
return await request.read() return await request.read()
raise CrawlError(f"post failed with status {request.status}")
# We weren't authenticated, so try to do that
await self.authenticate(auth_id)
# Retry once after authenticating. If this fails, we will die.
async with self.session.post(url, data=data, allow_redirects=False) as request:
if request.status == 200:
return await request.read()
raise CrawlError("post_authenticated failed even after authenticating")
async def _get_authenticated(self, url: str) -> bytes: async def _get_authenticated(self, url: str) -> bytes:
auth_id = await self._current_auth_id() auth_id = await self._current_auth_id()
@@ -1081,7 +1073,7 @@ instance's greatest bottleneck.
async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request: async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request:
login_page = soupify(await request.read()) login_page = soupify(await request.read())
login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "login_form"})) login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "formlogin"}))
if login_form is None: if login_form is None:
raise CrawlError("Could not find the login form! Specified client id might be invalid.") raise CrawlError("Could not find the login form! Specified client id might be invalid.")
@@ -1091,12 +1083,14 @@ instance's greatest bottleneck.
username, password = await self._auth.credentials() username, password = await self._auth.credentials()
login_form_data = aiohttp.FormData() login_data = {
login_form_data.add_field('login_form/input_3/input_4', username) "username": username,
login_form_data.add_field('login_form/input_3/input_5', password) "password": password,
"cmd[doStandardAuthentication]": "Login",
}
# do the actual login # do the actual login
async with self.session.post(urljoin(self._base_url, login_url), data=login_form_data) as request: async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request:
soup = IliasSoup(soupify(await request.read()), str(request.url)) soup = IliasSoup(soupify(await request.read()), str(request.url))
if not IliasPage.is_logged_in(soup): if not IliasPage.is_logged_in(soup):
self._auth.invalidate_credentials() self._auth.invalidate_credentials()
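
Several helpers in this file (`_resolve_link_target`, `_post_authenticated`, `_get_authenticated`) share one shape: attempt the request, re-authenticate once on failure, retry, then give up. A minimal async sketch of that pattern; `fetch` and `authenticate` are hypothetical stand-ins:

```python
from typing import Awaitable, Callable, Optional, TypeVar

T = TypeVar("T")


async def with_reauth(
    fetch: Callable[[], Awaitable[Optional[T]]],
    authenticate: Callable[[], Awaitable[None]],
) -> T:
    # `None` signals "we were bounced to the login page" (e.g. a redirect was seen).
    if (result := await fetch()) is not None:
        return result
    await authenticate()  # session expired: log in again
    if (result := await fetch()) is not None:
        return result
    raise RuntimeError("request failed even after authenticating")
```

The real crawler additionally snapshots `_current_auth_id()` before the first attempt, so concurrent tasks do not each trigger a fresh login.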

View File

@@ -97,8 +97,7 @@ class IliasElementType(Enum):
BOOKING = "booking" BOOKING = "booking"
COURSE = "course" COURSE = "course"
DCL_RECORD_LIST = "dcl_record_list" DCL_RECORD_LIST = "dcl_record_list"
EXERCISE_OVERVIEW = "exercise_overview" EXERCISE = "exercise"
EXERCISE = "exercise" # own submitted files
EXERCISE_FILES = "exercise_files" # own submitted files EXERCISE_FILES = "exercise_files" # own submitted files
FILE = "file" FILE = "file"
FOLDER = "folder" FOLDER = "folder"
@@ -109,7 +108,6 @@ class IliasElementType(Enum):
LEARNING_MODULE_HTML = "learning_module_html" LEARNING_MODULE_HTML = "learning_module_html"
LITERATURE_LIST = "literature_list" LITERATURE_LIST = "literature_list"
LINK = "link" LINK = "link"
LINK_COLLECTION = "link_collection"
MEDIA_POOL = "media_pool" MEDIA_POOL = "media_pool"
MEDIACAST_VIDEO = "mediacast_video" MEDIACAST_VIDEO = "mediacast_video"
MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
@@ -122,7 +120,6 @@ class IliasElementType(Enum):
SCORM_LEARNING_MODULE = "scorm_learning_module" SCORM_LEARNING_MODULE = "scorm_learning_module"
SURVEY = "survey" SURVEY = "survey"
TEST = "test" # an online test. Will be ignored currently. TEST = "test" # an online test. Will be ignored currently.
WIKI = "wiki"
def matcher(self) -> IliasElementMatcher: def matcher(self) -> IliasElementMatcher:
match self: match self:
@@ -143,15 +140,13 @@ class IliasElementType(Enum):
TypeMatcher.query("cmdclass=ildclrecordlistgui") TypeMatcher.query("cmdclass=ildclrecordlistgui")
) )
case IliasElementType.EXERCISE: case IliasElementType.EXERCISE:
return TypeMatcher.never()
case IliasElementType.EXERCISE_FILES:
return TypeMatcher.never()
case IliasElementType.EXERCISE_OVERVIEW:
return TypeMatcher.any( return TypeMatcher.any(
TypeMatcher.path("/exc/"), TypeMatcher.path("/exc/"),
TypeMatcher.path("_exc_"), TypeMatcher.path("_exc_"),
TypeMatcher.img_src("_exc.svg"), TypeMatcher.img_src("_exc.svg"),
) )
case IliasElementType.EXERCISE_FILES:
return TypeMatcher.never()
case IliasElementType.FILE: case IliasElementType.FILE:
return TypeMatcher.any( return TypeMatcher.any(
TypeMatcher.query("cmd=sendfile"), TypeMatcher.query("cmd=sendfile"),
@@ -203,12 +198,7 @@ class IliasElementType(Enum):
TypeMatcher.query("baseclass=illinkresourcehandlergui"), TypeMatcher.query("baseclass=illinkresourcehandlergui"),
TypeMatcher.query("calldirectlink"), TypeMatcher.query("calldirectlink"),
), ),
TypeMatcher.img_src("_webr.svg") # duplicated :( TypeMatcher.img_src("_webr.svg")
)
case IliasElementType.LINK_COLLECTION:
return TypeMatcher.any(
TypeMatcher.query("baseclass=illinkresourcehandlergui"),
TypeMatcher.img_src("_webr.svg") # duplicated :(
) )
case IliasElementType.MEDIA_POOL: case IliasElementType.MEDIA_POOL:
return TypeMatcher.any( return TypeMatcher.any(
@@ -253,11 +243,6 @@ class IliasElementType(Enum):
TypeMatcher.query("cmdclass=iltestscreengui"), TypeMatcher.query("cmdclass=iltestscreengui"),
TypeMatcher.img_src("_tst.svg") TypeMatcher.img_src("_tst.svg")
) )
case IliasElementType.WIKI:
return TypeMatcher.any(
TypeMatcher.query("baseClass=ilwikihandlergui"),
TypeMatcher.img_src("wiki.svg")
)
raise CrawlWarning(f"Unknown matcher {self}") raise CrawlWarning(f"Unknown matcher {self}")
@@ -286,7 +271,6 @@ class IliasPageElement:
r"mcst/(?P<id>\d+)", # mediacast r"mcst/(?P<id>\d+)", # mediacast
r"pg/(?P<id>(\d|_)+)", # page? r"pg/(?P<id>(\d|_)+)", # page?
r"svy/(?P<id>\d+)", # survey r"svy/(?P<id>\d+)", # survey
r"sess/(?P<id>\d+)", # session
r"webr/(?P<id>\d+)", # web referene (link) r"webr/(?P<id>\d+)", # web referene (link)
r"thr_pk=(?P<id>\d+)", # forums r"thr_pk=(?P<id>\d+)", # forums
r"ref_id=(?P<id>\d+)", r"ref_id=(?P<id>\d+)",
@@ -505,31 +489,79 @@ class IliasPage:
return url return url
return None return None
def get_forum_export_url(self) -> Optional[str]: def get_forum_entries(self) -> list[IliasPageElement]:
forum_link = self._soup.select_one("#tab_forums_threads > a") form = self._get_forum_form()
if not forum_link: if not form:
log.explain("Found no forum link") return []
return None threads = []
base_url = self._abs_url_from_link(forum_link) for row in cast(list[Tag], form.select("table > tbody > tr")):
base_url = re.sub(r"cmd=\w+", "cmd=post", base_url) url_tag = cast(
base_url = re.sub(r"cmdClass=\w+", "cmdClass=ilExportGUI", base_url)
rtoken_form = cast(
Optional[Tag], Optional[Tag],
self._soup.find("form", attrs={"action": lambda x: x is not None and "rtoken=" in x}) row.find(name="a", attrs={"href": lambda x: x is not None and "cmd=viewthread" in x.lower()})
) )
if not rtoken_form: if url_tag is None:
log.explain("Found no rtoken anywhere") log.explain(f"Skipping row without URL: {row}")
continue
name = url_tag.get_text().strip()
columns = [td.get_text().strip() for td in cast(list[Tag], row.find_all(name="td"))]
potential_dates_opt = [IliasPage._find_date_in_text(column) for column in columns]
potential_dates = [x for x in potential_dates_opt if x is not None]
mtime = max(potential_dates) if potential_dates else None
threads.append(IliasPageElement.create_new(
IliasElementType.FORUM_THREAD,
self._abs_url_from_link(url_tag),
name,
mtime=mtime
))
return threads
def get_download_forum_data(self, thread_ids: list[str]) -> Optional[IliasDownloadForumData]:
form = cast(Optional[Tag], self._soup.find(
"form",
attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x}
))
if not form:
return None return None
match = cast(re.Match[str], re.search(r"rtoken=(\w+)", str(rtoken_form.attrs["action"]))) post_url = self._abs_url_from_relative(cast(str, form["action"]))
rtoken = match.group(1)
base_url = base_url + "&rtoken=" + rtoken log.explain(f"Fetching forum threads {thread_ids}")
return base_url form_data: Dict[str, Union[str, list[str]]] = {
"thread_ids[]": cast(list[str], thread_ids),
"selected_cmd2": "html",
"select_cmd2": "Ausführen",
"selected_cmd": "",
}
return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0)
def _get_forum_form(self) -> Optional[Tag]:
return cast(Optional[Tag], self._soup.find(
"form",
attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x}
))
def get_next_stage_element(self) -> Optional[IliasPageElement]: def get_next_stage_element(self) -> Optional[IliasPageElement]:
if self._is_forum_page():
if "trows=" in self._page_url:
log.explain("Manual row override detected, accepting it as good")
return None
log.explain("Requesting *all* forum threads")
thread_count = self._get_forum_thread_count()
if thread_count is not None and thread_count > 400:
log.warn(
"Forum has more than 400 threads, fetching all threads will take a while. "
"You might need to adjust your http_timeout config option."
)
# Fetch at least 400 in case we detect it wrong
if thread_count is not None and thread_count < 400:
thread_count = 400
return self._get_show_max_forum_entries_per_page_url(thread_count)
if self._is_ilias_opencast_embedding(): if self._is_ilias_opencast_embedding():
log.explain("Unwrapping opencast embedding") log.explain("Unwrapping opencast embedding")
return self.get_child_elements()[0] return self.get_child_elements()[0]
@@ -539,8 +571,6 @@ class IliasPage:
if self._contains_collapsed_future_meetings(): if self._contains_collapsed_future_meetings():
log.explain("Requesting *all* future meetings") log.explain("Requesting *all* future meetings")
return self._uncollapse_future_meetings_url() return self._uncollapse_future_meetings_url()
if self._is_exercise_not_all_shown():
return self._show_all_exercises()
if not self._is_content_tab_selected(): if not self._is_content_tab_selected():
if self._page_type != IliasElementType.INFO_TAB: if self._page_type != IliasElementType.INFO_TAB:
log.explain("Selecting content tab") log.explain("Selecting content tab")
@@ -549,6 +579,11 @@ class IliasPage:
log.explain("Crawling info tab, skipping content select") log.explain("Crawling info tab, skipping content select")
return None return None
def _is_forum_page(self) -> bool:
if perma_link := self.get_permalink():
return "/frm/" in perma_link
return False
def _is_video_player(self) -> bool: def _is_video_player(self) -> bool:
return "paella_config_file" in str(self._soup) return "paella_config_file" in str(self._soup)
@@ -572,7 +607,7 @@ class IliasPage:
def _is_exercise_file(self) -> bool: def _is_exercise_file(self) -> bool:
# we know it from before # we know it from before
if self._page_type == IliasElementType.EXERCISE_OVERVIEW: if self._page_type == IliasElementType.EXERCISE:
return True return True
# We have no suitable parent - let's guess # We have no suitable parent - let's guess
@@ -609,17 +644,6 @@ class IliasPage:
link = self._abs_url_from_link(element) link = self._abs_url_from_link(element)
return IliasPageElement.create_new(IliasElementType.FOLDER, link, "show all meetings") return IliasPageElement.create_new(IliasElementType.FOLDER, link, "show all meetings")
def _is_exercise_not_all_shown(self) -> bool:
return (self._page_type == IliasElementType.EXERCISE_OVERVIEW
and "mode=all" not in self._page_url.lower())
def _show_all_exercises(self) -> Optional[IliasPageElement]:
return IliasPageElement.create_new(
IliasElementType.EXERCISE_OVERVIEW,
self._page_url + "&mode=all",
"show all exercises"
)
def _is_content_tab_selected(self) -> bool: def _is_content_tab_selected(self) -> bool:
return self._select_content_page_url() is None return self._select_content_page_url() is None
@@ -885,62 +909,15 @@ class IliasPage:
def _find_exercise_entries(self) -> list[IliasPageElement]: def _find_exercise_entries(self) -> list[IliasPageElement]:
if self._soup.find(id="tab_submission"): if self._soup.find(id="tab_submission"):
log.explain("Found submission tab. This is an exercise detail or files page") log.explain("Found submission tab. This is an exercise detail page")
if self._soup.select_one("#tab_submission.active") is None:
log.explain(" This is a details page")
return self._find_exercise_entries_detail_page() return self._find_exercise_entries_detail_page()
else:
log.explain(" This is a files page")
return self._find_exercise_entries_files_page()
log.explain("Found no submission tab. This is an exercise root page") log.explain("Found no submission tab. This is an exercise root page")
return self._find_exercise_entries_root_page() return self._find_exercise_entries_root_page()
def _find_exercise_entries_detail_page(self) -> list[IliasPageElement]: def _find_exercise_entries_detail_page(self) -> list[IliasPageElement]:
results: list[IliasPageElement] = [] results: list[IliasPageElement] = []
if link := cast(Optional[Tag], self._soup.select_one("#tab_submission > a")): # Find all download links in the container (this will contain all the files)
results.append(IliasPageElement.create_new(
IliasElementType.EXERCISE_FILES,
self._abs_url_from_link(link),
"Submission"
))
else:
log.explain("Found no submission link for exercise, maybe it has not started yet?")
# Find all download links in the container (this will contain all the *feedback* files)
download_links = cast(list[Tag], self._soup.find_all(
name="a",
# download links contain the given command class
attrs={"href": lambda x: x is not None and "cmd=download" in x},
text="Download"
))
for link in download_links:
parent_row: Tag = cast(Tag, link.find_parent(
attrs={"class": lambda x: x is not None and "row" in x}))
name_tag = cast(Optional[Tag], parent_row.find(name="div"))
if not name_tag:
log.warn("Could not find name tag for exercise entry")
_unexpected_html_warning()
continue
name = _sanitize_path_name(name_tag.get_text().strip())
log.explain(f"Found exercise detail entry {name!r}")
results.append(IliasPageElement.create_new(
IliasElementType.FILE,
self._abs_url_from_link(link),
name
))
return results
def _find_exercise_entries_files_page(self) -> list[IliasPageElement]:
results: list[IliasPageElement] = []
# Find all download links in the container
download_links = cast(list[Tag], self._soup.find_all( download_links = cast(list[Tag], self._soup.find_all(
name="a", name="a",
# download links contain the given command class # download links contain the given command class
@@ -953,7 +930,7 @@ class IliasPage:
children = cast(list[Tag], parent_row.find_all("td")) children = cast(list[Tag], parent_row.find_all("td"))
name = _sanitize_path_name(children[1].get_text().strip()) name = _sanitize_path_name(children[1].get_text().strip())
log.explain(f"Found exercise file entry {name!r}") log.explain(f"Found exercise detail entry {name!r}")
date = None date = None
for child in reversed(children): for child in reversed(children):
@@ -961,7 +938,7 @@ class IliasPage:
if date is not None: if date is not None:
break break
if date is None: if date is None:
log.warn(f"Date parsing failed for exercise file entry {name!r}") log.warn(f"Date parsing failed for exercise entry {name!r}")
results.append(IliasPageElement.create_new( results.append(IliasPageElement.create_new(
IliasElementType.FILE, IliasElementType.FILE,
@@ -975,32 +952,66 @@ class IliasPage:
def _find_exercise_entries_root_page(self) -> list[IliasPageElement]: def _find_exercise_entries_root_page(self) -> list[IliasPageElement]:
results: list[IliasPageElement] = [] results: list[IliasPageElement] = []
content_tab = cast(Optional[Tag], self._soup.find(id="ilContentContainer")) # Each assignment is in an accordion container
if not content_tab: assignment_containers: list[Tag] = self._soup.select(".il_VAccordionInnerContainer")
log.warn("Could not find content tab in exercise overview page")
_unexpected_html_warning()
return []
exercise_links = content_tab.select(".il-item-title a") for container in assignment_containers:
# Fetch the container name out of the header to use it in the path
container_name = cast(Tag, container.select_one(".ilAssignmentHeader")).get_text().strip()
log.explain(f"Found exercise container {container_name!r}")
for exercise in cast(list[Tag], exercise_links): # Find all download links in the container (this will contain all the files)
if "href" not in exercise.attrs: files = cast(list[Tag], container.find_all(
continue name="a",
href = exercise.attrs["href"] # download links contain the given command class
if type(href) is not str: attrs={"href": lambda x: x is not None and "cmdClass=ilexsubmissiongui" in x},
continue text="Download"
if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower():
continue
name = _sanitize_path_name(exercise.get_text().strip())
results.append(IliasPageElement.create_new(
IliasElementType.EXERCISE,
self._abs_url_from_link(exercise),
name
)) ))
for result in results: # Grab each file as you now have the link
log.explain(f"Found exercise {result.name!r}") for file_link in files:
# Two divs, side by side. Left is the name, right is the link ==> get left
# sibling
file_name = cast(
Tag,
cast(Tag, file_link.parent).find_previous(name="div")
).get_text().strip()
url = self._abs_url_from_link(file_link)
log.explain(f"Found exercise entry {file_name!r}")
results.append(IliasPageElement.create_new(
IliasElementType.FILE,
url,
_sanitize_path_name(container_name) + "/" + _sanitize_path_name(file_name),
mtime=None, # We do not have any timestamp
skip_sanitize=True
))
# Find all links to file listings (e.g. "Submitted Files" for groups)
file_listings = cast(list[Tag], container.find_all(
name="a",
# download links contain the given command class
attrs={"href": lambda x: x is not None and "cmdclass=ilexsubmissionfilegui" in x.lower()}
))
# Add each listing as a new EXERCISE_FILES element
for listing in file_listings:
parent_container = cast(Tag, listing.find_parent(
"div", attrs={"class": lambda x: x is not None and "form-group" in x}
))
label_container = cast(Tag, parent_container.find(
attrs={"class": lambda x: x is not None and "control-label" in x}
))
file_name = label_container.get_text().strip()
url = self._abs_url_from_link(listing)
log.explain(f"Found exercise detail {file_name!r} at {url}")
results.append(IliasPageElement.create_new(
IliasElementType.EXERCISE_FILES,
url,
_sanitize_path_name(container_name) + "/" + _sanitize_path_name(file_name),
None, # we do not have any timestamp
skip_sanitize=True
))
return results return results
@@ -1116,7 +1127,7 @@ class IliasPage:
videos.append(IliasPageElement.create_new( videos.append(IliasPageElement.create_new(
typ=IliasElementType.MOB_VIDEO, typ=IliasElementType.MOB_VIDEO,
url=url, url=self._abs_url_from_relative(url),
name=_sanitize_path_name(title), name=_sanitize_path_name(title),
mtime=None mtime=None
)) ))
@@ -1142,9 +1153,6 @@ class IliasPage:
else: else:
title = f"unknown video {figure}" title = f"unknown video {figure}"
if url:
url = self._abs_url_from_relative(url)
return url, title return url, title
def _is_in_expanded_meeting(self, tag: Tag) -> bool: def _is_in_expanded_meeting(self, tag: Tag) -> bool:
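
The regex list in `IliasPageElement` earlier in this file (the hunk around `mcst/`, `svy/`, `webr/`) feeds a first-match id extractor. A stand-alone sketch of that lookup; the patterns are copied from the diff, the function name is hypothetical:

```python
import re
from typing import Optional

_ID_PATTERNS = [
    r"mcst/(?P<id>\d+)",    # mediacast
    r"pg/(?P<id>(\d|_)+)",  # page
    r"svy/(?P<id>\d+)",     # survey
    r"webr/(?P<id>\d+)",    # web reference (link)
    r"thr_pk=(?P<id>\d+)",  # forums
    r"ref_id=(?P<id>\d+)",
]


def extract_id(url: str) -> Optional[str]:
    # Try each pattern in order and return the first named-group match.
    for pattern in _ID_PATTERNS:
        if match := re.search(pattern, url):
            return match.group("id")
    return None


# extract_id("https://ilias.example.edu/goto.php?target=webr_12345") -> "12345"
```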

View File

@@ -1,2 +1,2 @@
NAME = "PFERD" NAME = "PFERD"
VERSION = "3.8.3" VERSION = "3.8.0"