Compare commits

...

7 Commits

Author SHA1 Message Date
465f8b28c0 Bump version to 3.8.3 2025-07-01 14:28:30 +02:00
27e69af2f3 Update changelog for 8caad00 2025-07-01 14:26:10 +02:00
56e3065950 Document usage of pilot.ilias.studium.kit.edu (#111) 2025-05-30 17:13:45 +02:00
549ce6cce9 Ignore unavailable elements (#119) 2025-05-28 17:04:57 +02:00
34564cedb4 Add support for link collections 2025-05-27 16:25:59 +02:00
2b0d20a1f6 Fix crawling of exercises with instructions
We do not want a second path and the instruction field has an identical
link...
2025-05-26 14:42:38 +02:00
8caad0008d Fix check for nonexistent ilias_url command attribute to base_url (#113) 2025-05-05 22:05:54 +02:00
7 changed files with 219 additions and 90 deletions

View File

@ -22,6 +22,21 @@ ambiguous situations.
## Unreleased ## Unreleased
## 3.8.3 - 2025-07-01
## Added
- Support for link collections.
In "fancy" mode, a single HTML file with multiple links is generated.
In all other modes, PFERD creates a folder for the collection and a new file
for every link inside.
## Fixed
- Crawling of exercises with instructions
- Don't download unavailable elements.
For elements that are unavailable (for example, because their availability is
time-restricted), PFERD no longer downloads the HTML of the info page.
- `base_url` argument for `ilias-web` crawler causing crashes
## 3.8.2 - 2025-04-29 ## 3.8.2 - 2025-04-29
## Changed ## Changed

View File

@ -164,12 +164,13 @@ out of the box for the corresponding universities:
[ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs" [ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs"
| University | `base_url` | `login_type` | `client_id` | | University | `base_url` | `login_type` | `client_id` |
|---------------|-----------------------------------------|--------------|---------------| |-----------------|-----------------------------------------|--------------|---------------|
| FH Aachen | https://www.ili.fh-aachen.de | local | elearning | | FH Aachen | https://www.ili.fh-aachen.de | local | elearning |
| Uni Köln | https://www.ilias.uni-koeln.de/ilias | local | uk | | Uni Köln | https://www.ilias.uni-koeln.de/ilias | local | uk |
| Uni Konstanz | https://ilias.uni-konstanz.de | local | ILIASKONSTANZ | | Uni Konstanz | https://ilias.uni-konstanz.de | local | ILIASKONSTANZ |
| Uni Stuttgart | https://ilias3.uni-stuttgart.de | local | Uni_Stuttgart | | Uni Stuttgart | https://ilias3.uni-stuttgart.de | local | Uni_Stuttgart |
| Uni Tübingen | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth | | | Uni Tübingen | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth | |
| KIT ILIAS Pilot | https://pilot.ilias.studium.kit.edu | shibboleth | pilot |
If your university isn't listed, try navigating to your instance's login page. If your university isn't listed, try navigating to your instance's login page.
Assuming no custom login service is used, the URL will look something like this: Assuming no custom login service is used, the URL will look something like this:

View File

@ -45,8 +45,8 @@ def load(
load_crawler(args, section) load_crawler(args, section)
section["type"] = COMMAND_NAME section["type"] = COMMAND_NAME
if args.ilias_url is not None: if args.base_url is not None:
section["base_url"] = args.ilias_url section["base_url"] = args.base_url
if args.client_id is not None: if args.client_id is not None:
section["client_id"] = args.client_id section["client_id"] = args.client_id

View File

@ -1,3 +1,5 @@
import dataclasses
import re
from enum import Enum from enum import Enum
from typing import Optional, cast from typing import Optional, cast
@ -12,7 +14,9 @@ _link_template_fancy = """
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<title>ILIAS - Link: {{name}}</title> <title>ILIAS - Link: {{name}}</title>
<!-- REPEAT REMOVE START -->
<meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" /> <meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" />
<!-- REPEAT REMOVE END -->
</head> </head>
<style> <style>
@ -23,6 +27,8 @@ _link_template_fancy = """
display: flex; display: flex;
align-items: center; align-items: center;
justify-content: center; justify-content: center;
flex-direction: column;
gap: 4px;
} }
body { body {
padding: 0; padding: 0;
@ -31,11 +37,16 @@ _link_template_fancy = """
font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif; font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif;
height: 100vh; height: 100vh;
} }
.row { .column {
background-color: white;
min-width: 500px; min-width: 500px;
max-width: 90vw; max-width: 90vw;
display: flex; display: flex;
flex-direction: column;
row-gap: 5px;
}
.row {
background-color: white;
display: flex;
padding: 1em; padding: 1em;
} }
.logo { .logo {
@ -75,6 +86,8 @@ _link_template_fancy = """
} }
</style> </style>
<body class="center-flex"> <body class="center-flex">
<div class="column">
<!-- REPEAT START -->
<div class="row"> <div class="row">
<div class="logo center-flex"> <div class="logo center-flex">
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24"> <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
@ -89,6 +102,8 @@ _link_template_fancy = """
</div> </div>
<div class="menu-button center-flex"> ⯆ </div> <div class="menu-button center-flex"> ⯆ </div>
</div> </div>
<!-- REPEAT END -->
</div>
</body> </body>
</html> </html>
""".strip() # noqa: E501 line too long """.strip() # noqa: E501 line too long
@ -255,6 +270,13 @@ def forum_thread_template(name: str, url: str, heading: bs4.Tag, content: bs4.Ta
.replace("{{content}}", cast(str, content.prettify())) .replace("{{content}}", cast(str, content.prettify()))
# Plain record describing one link that is rendered into a link file.
# Links.interpolate reads all three fields when filling a template.
@dataclasses.dataclass
class LinkData:
# Text substituted for the "{{name}}" placeholder.
name: str
# Target address substituted for the "{{link}}" placeholder.
url: str
# Text substituted for the "{{description}}" placeholder.
description: str
class Links(Enum): class Links(Enum):
IGNORE = "ignore" IGNORE = "ignore"
PLAINTEXT = "plaintext" PLAINTEXT = "plaintext"
@ -272,6 +294,11 @@ class Links(Enum):
return None return None
raise ValueError("Missing switch case") raise ValueError("Missing switch case")
def collection_as_one(self) -> bool:
    """Tell whether a collection of several links is rendered into one file.

    Only the FANCY link type answers True; every other type answers False.
    """
    return self == Links.FANCY
def extension(self) -> Optional[str]: def extension(self) -> Optional[str]:
if self == Links.FANCY: if self == Links.FANCY:
return ".html" return ".html"
@ -283,10 +310,48 @@ class Links(Enum):
return None return None
raise ValueError("Missing switch case") raise ValueError("Missing switch case")
def interpolate(self, redirect_delay: int, collection_name: str, links: list[LinkData]) -> str:
    """Render one or more links into the text of a link file.

    Exactly one link: this link type's own template is filled with that
    link's data. Any other count: PLAINTEXT and INTERNET_SHORTCUT yield the
    URLs joined by newlines, while every remaining type is rendered with the
    FANCY template, whose repeatable section is emitted once per link.

    Args:
        redirect_delay: Value substituted for "{{redirect_delay}}".
        collection_name: Substituted for the "{{name}}" placeholders that are
            still present after the per-link sections have been filled in.
        links: The links to render.

    Returns:
        The rendered file content.

    Raises:
        ValueError: If this link type has no template.
    """
    own_template = self.template()
    if own_template is None:
        raise ValueError("Cannot interpolate ignored links")

    def fill(text: str, entry: LinkData) -> str:
        # The placeholders are replaced one after another in this fixed order.
        return (
            text.replace("{{link}}", entry.url)
            .replace("{{name}}", entry.name)
            .replace("{{description}}", entry.description)
            .replace("{{redirect_delay}}", str(redirect_delay))
        )

    if len(links) == 1:
        return fill(own_template, links[0])

    if self in (Links.PLAINTEXT, Links.INTERNET_SHORTCUT):
        return "\n".join(f"{entry.url}" for entry in links)

    # All others get coerced to fancy
    page = cast(str, Links.FANCY.template())
    repeat_match = cast(
        re.Match[str],
        re.search(r"<!-- REPEAT START -->([\s\S]+)<!-- REPEAT END -->", page),
    )
    row_markup = repeat_match.group(1)
    rendered_rows = "\n".join(fill(row_markup, entry) for entry in links)

    page = page.replace(row_markup, rendered_rows)
    # Whatever "{{name}}" is left over gets the collection name.
    page = page.replace("{{name}}", collection_name)
    # Drop the section marked as not applicable to multi-link pages.
    return re.sub(r"<!-- REPEAT REMOVE START -->[\s\S]+<!-- REPEAT REMOVE END -->", "", page)
@staticmethod @staticmethod
def from_string(string: str) -> "Links": def from_string(string: str) -> "Links":
try: try:
return Links(string) return Links(string)
except ValueError: except ValueError:
raise ValueError("must be one of 'ignore', 'plaintext'," options = [f"'{option.value}'" for option in Links]
" 'html', 'internet-shortcut'") raise ValueError(f"must be one of {', '.join(options)}")

View File

@ -19,7 +19,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection from ..http_crawler import HttpCrawler, HttpCrawlerSection
from .async_helper import _iorepeat from .async_helper import _iorepeat
from .file_templates import Links, forum_thread_template, learning_module_template from .file_templates import LinkData, Links, forum_thread_template, learning_module_template
from .ilias_html_cleaner import clean, insert_base_markup from .ilias_html_cleaner import clean, insert_base_markup
from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export) IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
@ -329,6 +329,15 @@ instance's greatest bottleneck.
# directory escape attacks. # directory escape attacks.
element_path = PurePath(parent_path, element.name) element_path = PurePath(parent_path, element.name)
# This is symptomatic of no access to the element, for example, because
# of time availability restrictions.
if "cmdClass=ilInfoScreenGUI" in element.url and "cmd=showSummary" in element.url:
log.explain(
"Skipping element as url points to info screen, "
"this should only happen with not-yet-released elements"
)
return None
if element.type in _VIDEO_ELEMENTS: if element.type in _VIDEO_ELEMENTS:
if not self._videos: if not self._videos:
log.status( log.status(
@ -437,6 +446,8 @@ instance's greatest bottleneck.
return await self._handle_learning_module(element, element_path) return await self._handle_learning_module(element, element_path)
elif element.type == IliasElementType.LINK: elif element.type == IliasElementType.LINK:
return await self._handle_link(element, element_path) return await self._handle_link(element, element_path)
elif element.type == IliasElementType.LINK_COLLECTION:
return await self._handle_link(element, element_path)
elif element.type == IliasElementType.BOOKING: elif element.type == IliasElementType.BOOKING:
return await self._handle_booking(element, element_path) return await self._handle_booking(element, element_path)
elif element.type == IliasElementType.OPENCAST_VIDEO: elif element.type == IliasElementType.OPENCAST_VIDEO:
@ -462,45 +473,98 @@ instance's greatest bottleneck.
log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
log.explain(f"Links type is {self._links}") log.explain(f"Links type is {self._links}")
link_template_maybe = self._links.template() export_url = url_set_query_param(element.url, "cmd", "exportHTML")
link_extension = self._links.extension() resolved = await self._resolve_link_target(export_url)
if not link_template_maybe or not link_extension: if resolved == "none":
links = [LinkData(element.name, "", element.description or "")]
else:
links = self._parse_link_content(element, cast(BeautifulSoup, resolved))
maybe_extension = self._links.extension()
if not maybe_extension:
log.explain("Answer: No") log.explain("Answer: No")
return None return None
else: else:
log.explain("Answer: Yes") log.explain("Answer: Yes")
element_path = element_path.with_name(element_path.name + link_extension)
if len(links) <= 1 or self._links.collection_as_one():
element_path = element_path.with_name(element_path.name + maybe_extension)
maybe_dl = await self.download(element_path, mtime=element.mtime) maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl: if not maybe_dl:
return None return None
return self._download_link(self._links, element.name, links, maybe_dl)
return self._download_link(element, link_template_maybe, maybe_dl) maybe_cl = await self.crawl(element_path)
if not maybe_cl:
return None
# Required for download_all closure
cl = maybe_cl
extension = maybe_extension
async def download_all() -> None:
for link in links:
path = cl.path / (_sanitize_path_name(link.name) + extension)
if dl := await self.download(path, mtime=element.mtime):
await self._download_link(self._links, element.name, [link], dl)
return download_all()
@anoncritical @anoncritical
@_iorepeat(3, "resolving link") @_iorepeat(3, "resolving link")
async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: async def _download_link(
async with dl as (bar, sink):
export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
real_url = await self._resolve_link_target(export_url)
self._write_link_content(link_template, real_url, element.name, element.description, sink)
def _write_link_content(
self, self,
link_template: str, link_renderer: Links,
url: str, collection_name: str,
name: str, links: list[LinkData],
description: Optional[str], dl: DownloadToken
sink: FileSink,
) -> None: ) -> None:
content = link_template async with dl as (bar, sink):
content = content.replace("{{link}}", url) rendered = link_renderer.interpolate(self._link_file_redirect_delay, collection_name, links)
content = content.replace("{{name}}", name) sink.file.write(rendered.encode("utf-8"))
content = content.replace("{{description}}", str(description))
content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
sink.file.write(content.encode("utf-8"))
sink.done() sink.done()
async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal['none']]:
    """Fetch the export page of a link element, authenticating if necessary.

    The request is sent without following redirects. A response that is not
    a redirect is parsed and returned. A redirect whose target contains both
    "baseclass=illinkresourcehandlergui" and "cmd=infoscreen" (compared in
    lower case) yields the literal "none". Any other redirect triggers a
    single re-authentication followed by one more attempt.

    Raises:
        CrawlError: If the second attempt does not produce a result either.
    """
    async def fetch_once() -> Optional[Union[BeautifulSoup, Literal['none']]]:
        async with self.session.get(export_url, allow_redirects=False) as resp:
            # No redirect means we were authenticated
            if hdrs.LOCATION not in resp.headers:
                return soupify(await resp.read())

            # We are either unauthenticated or the link is not active
            location = resp.headers[hdrs.LOCATION].lower()
            if "baseclass=illinkresourcehandlergui" in location and "cmd=infoscreen" in location:
                return "none"
            return None

    auth_id = await self._current_auth_id()
    if (first_try := await fetch_once()) is not None:
        return first_try

    await self.authenticate(auth_id)

    if (second_try := await fetch_once()) is not None:
        return second_try

    raise CrawlError("resolve_link_target failed even after authenticating")
@staticmethod
def _parse_link_content(element: IliasPageElement, content: BeautifulSoup) -> list[LinkData]:
    """Extract the links contained in an exported link page.

    When the page holds exactly one anchor, the resulting link takes its name
    and description from ``element``. Otherwise each anchor yields one link
    named after the anchor text and described by the text of the next
    ``<dd>`` sibling of the anchor.

    An anchor without an ``href`` attribute gets an empty URL rather than the
    string "None", and an anchor without a following ``<dd>`` gets an empty
    description rather than raising an AttributeError.

    Args:
        element: The page element the exported page belongs to.
        content: The parsed export page.

    Returns:
        One LinkData per anchor found in ``content``, in document order.
    """
    def href_of(anchor: Tag) -> str:
        href = anchor.get("href")
        return "" if href is None else str(href).strip()

    links = cast(list[Tag], list(content.select("a")))
    if len(links) == 1:
        return [LinkData(name=element.name, description=element.description or "", url=href_of(links[0]))]

    results = []
    for link in links:
        # Not every anchor is guaranteed to be followed by a description node.
        description_node = link.find_next_sibling("dd")
        description = description_node.get_text(strip=True) if description_node is not None else ""
        results.append(LinkData(name=link.get_text(strip=True), description=description, url=href_of(link)))

    return results
async def _handle_booking( async def _handle_booking(
self, self,
element: IliasPageElement, element: IliasPageElement,
@ -524,7 +588,7 @@ instance's greatest bottleneck.
self._ensure_not_seen(element, element_path) self._ensure_not_seen(element, element_path)
return self._download_booking(element, link_template_maybe, maybe_dl) return self._download_booking(element, maybe_dl)
@anoncritical @anoncritical
@_iorepeat(1, "downloading description") @_iorepeat(1, "downloading description")
@ -545,36 +609,13 @@ instance's greatest bottleneck.
async def _download_booking( async def _download_booking(
self, self,
element: IliasPageElement, element: IliasPageElement,
link_template: str,
dl: DownloadToken, dl: DownloadToken,
) -> None: ) -> None:
async with dl as (bar, sink): async with dl as (bar, sink):
self._write_link_content(link_template, element.url, element.name, element.description, sink) links = [LinkData(name=element.name, description=element.description or "", url=element.url)]
rendered = self._links.interpolate(self._link_file_redirect_delay, element.name, links)
async def _resolve_link_target(self, export_url: str) -> str: sink.file.write(rendered.encode("utf-8"))
async def impl() -> Optional[str]: sink.done()
async with self.session.get(export_url, allow_redirects=False) as resp:
# No redirect means we were authenticated
if hdrs.LOCATION not in resp.headers:
return soupify(await resp.read()).select_one("a").get("href").strip() # type: ignore
# We are either unauthenticated or the link is not active
new_url = resp.headers[hdrs.LOCATION].lower()
if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
return ""
return None
auth_id = await self._current_auth_id()
target = await impl()
if target is not None:
return target
await self.authenticate(auth_id)
target = await impl()
if target is not None:
return target
raise CrawlError("resolve_link_target failed even after authenticating")
async def _handle_opencast_video( async def _handle_opencast_video(
self, self,

View File

@ -109,6 +109,7 @@ class IliasElementType(Enum):
LEARNING_MODULE_HTML = "learning_module_html" LEARNING_MODULE_HTML = "learning_module_html"
LITERATURE_LIST = "literature_list" LITERATURE_LIST = "literature_list"
LINK = "link" LINK = "link"
LINK_COLLECTION = "link_collection"
MEDIA_POOL = "media_pool" MEDIA_POOL = "media_pool"
MEDIACAST_VIDEO = "mediacast_video" MEDIACAST_VIDEO = "mediacast_video"
MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
@ -202,7 +203,12 @@ class IliasElementType(Enum):
TypeMatcher.query("baseclass=illinkresourcehandlergui"), TypeMatcher.query("baseclass=illinkresourcehandlergui"),
TypeMatcher.query("calldirectlink"), TypeMatcher.query("calldirectlink"),
), ),
TypeMatcher.img_src("_webr.svg") TypeMatcher.img_src("_webr.svg") # duplicated :(
)
case IliasElementType.LINK_COLLECTION:
return TypeMatcher.any(
TypeMatcher.query("baseclass=illinkresourcehandlergui"),
TypeMatcher.img_src("_webr.svg") # duplicated :(
) )
case IliasElementType.MEDIA_POOL: case IliasElementType.MEDIA_POOL:
return TypeMatcher.any( return TypeMatcher.any(
@ -975,16 +981,17 @@ class IliasPage:
_unexpected_html_warning() _unexpected_html_warning()
return [] return []
individual_exercises = content_tab.find_all( exercise_links = content_tab.select(".il-item-title a")
name="a",
attrs={ for exercise in cast(list[Tag], exercise_links):
"href": lambda x: x is not None if "href" not in exercise.attrs:
and "ass_id=" in x continue
and "cmdClass=ilAssignmentPresentationGUI" in x href = exercise.attrs["href"]
} if type(href) is not str:
) continue
if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower():
continue
for exercise in cast(list[Tag], individual_exercises):
name = _sanitize_path_name(exercise.get_text().strip()) name = _sanitize_path_name(exercise.get_text().strip())
results.append(IliasPageElement.create_new( results.append(IliasPageElement.create_new(
IliasElementType.EXERCISE, IliasElementType.EXERCISE,

View File

@ -1,2 +1,2 @@
NAME = "PFERD" NAME = "PFERD"
VERSION = "3.8.2" VERSION = "3.8.3"