Compare commits

...

16 Commits

16 changed files with 929 additions and 612 deletions

View File

@ -14,7 +14,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-13, macos-latest]
python: ["3.9"]
python: ["3.11"]
steps:
- uses: actions/checkout@v4

View File

@ -22,8 +22,46 @@ ambiguous situations.
## Unreleased
## 3.8.3 - 2025-07-01
## Added
- Support for link collections.
In "fancy" mode, a single HTML file with multiple links is generated.
In all other modes, PFERD creates a folder for the collection and a new file
for every link inside.
## Fixed
- Crawling of exercises with instructions
- Don't download unavailable elements.
Elements that are unavailable (for example, because their availability is
time restricted) will not download the HTML for the info page anymore.
- `base_url` argument for `ilias-web` crawler causing crashes
## 3.8.2 - 2025-04-29
## Changed
- Explicitly mention that wikis are not supported at the moment and ignore them
## Fixed
- Ilias-native login
- Exercise crawling
## 3.8.1 - 2025-04-17
## Fixed
- Description html files now specify at UTF-8 encoding
- Images in descriptions now always have a white background
## 3.8.0 - 2025-04-16
### Added
- Support for ILIAS 9
### Changed
- Added prettier CSS to forum threads
- Downloaded forum threads now link to the forum instead of the ILIAS thread
- Increase minimum supported Python version to 3.11
- Do not crawl nested courses (courses linked in other courses)
## Fixed
- File links in report on Windows

View File

@ -163,13 +163,14 @@ out of the box for the corresponding universities:
[ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs"
| University | `base_url` | `login_type` | `client_id` |
|---------------|-----------------------------------------|--------------|---------------|
| FH Aachen | https://www.ili.fh-aachen.de | local | elearning |
| Uni Köln | https://www.ilias.uni-koeln.de/ilias | local | uk |
| Uni Konstanz | https://ilias.uni-konstanz.de | local | ILIASKONSTANZ |
| Uni Stuttgart | https://ilias3.uni-stuttgart.de | local | Uni_Stuttgart |
| Uni Tübingen | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth | |
| University | `base_url` | `login_type` | `client_id` |
|-----------------|-----------------------------------------|--------------|---------------|
| FH Aachen | https://www.ili.fh-aachen.de | local | elearning |
| Uni Köln | https://www.ilias.uni-koeln.de/ilias | local | uk |
| Uni Konstanz | https://ilias.uni-konstanz.de | local | ILIASKONSTANZ |
| Uni Stuttgart | https://ilias3.uni-stuttgart.de | local | Uni_Stuttgart |
| Uni Tübingen | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth | |
| KIT ILIAS Pilot | https://pilot.ilias.studium.kit.edu | shibboleth | pilot |
If your university isn't listed, try navigating to your instance's login page.
Assuming no custom login service is used, the URL will look something like this:

View File

@ -45,8 +45,8 @@ def load(
load_crawler(args, section)
section["type"] = COMMAND_NAME
if args.ilias_url is not None:
section["base_url"] = args.ilias_url
if args.base_url is not None:
section["base_url"] = args.base_url
if args.client_id is not None:
section["client_id"] = args.client_id

View File

@ -149,9 +149,7 @@ class CrawlerSection(Section):
return self.s.getboolean("skip", fallback=False)
def output_dir(self, name: str) -> Path:
# TODO Use removeprefix() after switching to 3.9
if name.startswith("crawl:"):
name = name[len("crawl:"):]
name = name.removeprefix("crawl:")
return Path(self.s.get("output_dir", name)).expanduser()
def redownload(self) -> Redownload:

View File

@ -1,3 +1,5 @@
import dataclasses
import re
from enum import Enum
from typing import Optional, cast
@ -12,7 +14,9 @@ _link_template_fancy = """
<head>
<meta charset="UTF-8">
<title>ILIAS - Link: {{name}}</title>
<!-- REPEAT REMOVE START -->
<meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" />
<!-- REPEAT REMOVE END -->
</head>
<style>
@ -23,6 +27,8 @@ _link_template_fancy = """
display: flex;
align-items: center;
justify-content: center;
flex-direction: column;
gap: 4px;
}
body {
padding: 0;
@ -31,11 +37,16 @@ _link_template_fancy = """
font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif;
height: 100vh;
}
.row {
background-color: white;
.column {
min-width: 500px;
max-width: 90vw;
display: flex;
flex-direction: column;
row-gap: 5px;
}
.row {
background-color: white;
display: flex;
padding: 1em;
}
.logo {
@ -75,19 +86,23 @@ _link_template_fancy = """
}
</style>
<body class="center-flex">
<div class="row">
<div class="logo center-flex">
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
<path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 .225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/>
</svg>
</div>
<div class="tile">
<div class="top-row">
<a href="{{link}}">{{name}}</a>
<div class="column">
<!-- REPEAT START -->
<div class="row">
<div class="logo center-flex">
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
<path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 .225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/>
</svg>
</div>
<div class="bottom-row">{{description}}</div>
<div class="tile">
<div class="top-row">
<a href="{{link}}">{{name}}</a>
</div>
<div class="bottom-row">{{description}}</div>
</div>
<div class="menu-button center-flex"> ⯆ </div>
</div>
<div class="menu-button center-flex"> ⯆ </div>
<!-- REPEAT END -->
</div>
</body>
</html>
@ -255,6 +270,13 @@ def forum_thread_template(name: str, url: str, heading: bs4.Tag, content: bs4.Ta
.replace("{{content}}", cast(str, content.prettify()))
@dataclasses.dataclass
class LinkData:
name: str
url: str
description: str
class Links(Enum):
IGNORE = "ignore"
PLAINTEXT = "plaintext"
@ -272,6 +294,11 @@ class Links(Enum):
return None
raise ValueError("Missing switch case")
def collection_as_one(self) -> bool:
if self == Links.FANCY:
return True
return False
def extension(self) -> Optional[str]:
if self == Links.FANCY:
return ".html"
@ -283,10 +310,48 @@ class Links(Enum):
return None
raise ValueError("Missing switch case")
def interpolate(self, redirect_delay: int, collection_name: str, links: list[LinkData]) -> str:
template = self.template()
if template is None:
raise ValueError("Cannot interpolate ignored links")
if len(links) == 1:
link = links[0]
content = template
content = content.replace("{{link}}", link.url)
content = content.replace("{{name}}", link.name)
content = content.replace("{{description}}", link.description)
content = content.replace("{{redirect_delay}}", str(redirect_delay))
return content
if self == Links.PLAINTEXT or self == Links.INTERNET_SHORTCUT:
return "\n".join(f"{link.url}" for link in links)
# All others get coerced to fancy
content = cast(str, Links.FANCY.template())
repeated_content = cast(
re.Match[str],
re.search(r"<!-- REPEAT START -->([\s\S]+)<!-- REPEAT END -->", content)
).group(1)
parts = []
for link in links:
instance = repeated_content
instance = instance.replace("{{link}}", link.url)
instance = instance.replace("{{name}}", link.name)
instance = instance.replace("{{description}}", link.description)
instance = instance.replace("{{redirect_delay}}", str(redirect_delay))
parts.append(instance)
content = content.replace(repeated_content, "\n".join(parts))
content = content.replace("{{name}}", collection_name)
content = re.sub(r"<!-- REPEAT REMOVE START -->[\s\S]+<!-- REPEAT REMOVE END -->", "", content)
return content
@staticmethod
def from_string(string: str) -> "Links":
try:
return Links(string)
except ValueError:
raise ValueError("must be one of 'ignore', 'plaintext',"
" 'html', 'internet-shortcut'")
options = [f"'{option.value}'" for option in Links]
raise ValueError(f"must be one of {', '.join(options)}")

View File

@ -39,6 +39,10 @@ _STYLE_TAG_CONTENT = """
margin: 0.5rem 0;
}
img {
background-color: white;
}
body {
padding: 1em;
grid-template-columns: 1fr min(60rem, 90%) 1fr;
@ -56,12 +60,11 @@ _ARTICLE_WORTHY_CLASSES = [
def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
head = soup.new_tag("head")
soup.insert(0, head)
# Force UTF-8 encoding
head.append(soup.new_tag("meta", charset="utf-8"))
simplecss_link: Tag = soup.new_tag("link")
# <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
simplecss_link["rel"] = "stylesheet"
simplecss_link["href"] = "https://cdn.simplecss.org/simple.css"
head.append(simplecss_link)
head.append(soup.new_tag("link", rel="stylesheet", href="https://cdn.simplecss.org/simple.css"))
# Basic style tags for compat
style: Tag = soup.new_tag("style")

View File

@ -19,10 +19,10 @@ from ...utils import fmt_path, soupify, url_set_query_param
from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection
from .async_helper import _iorepeat
from .file_templates import Links, forum_thread_template, learning_module_template
from .file_templates import LinkData, Links, forum_thread_template, learning_module_template
from .ilias_html_cleaner import clean, insert_base_markup
from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)
IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
from .shibboleth_login import ShibbolethLogin
TargetType = Union[str, int]
@ -105,9 +105,9 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
_DIRECTORY_PAGES: Set[IliasElementType] = {
IliasElementType.COURSE,
IliasElementType.EXERCISE,
IliasElementType.EXERCISE_FILES,
IliasElementType.EXERCISE_OVERVIEW,
IliasElementType.FOLDER,
IliasElementType.INFO_TAB,
IliasElementType.MEDIACAST_VIDEO_FOLDER,
@ -217,11 +217,19 @@ instance's greatest bottleneck.
async def _crawl_desktop(self) -> None:
await self._crawl_url(
urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show")
urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show"),
crawl_nested_courses=True
)
async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
if awaitable := await self._handle_ilias_page(url, None, PurePath("."), expected_id):
async def _crawl_url(
self,
url: str,
expected_id: Optional[int] = None,
crawl_nested_courses: bool = False
) -> None:
if awaitable := await self._handle_ilias_page(
url, None, PurePath("."), expected_id, crawl_nested_courses
):
await awaitable
async def _handle_ilias_page(
@ -230,6 +238,7 @@ instance's greatest bottleneck.
current_element: Optional[IliasPageElement],
path: PurePath,
expected_course_id: Optional[int] = None,
crawl_nested_courses: bool = False
) -> Optional[Coroutine[Any, Any, None]]:
maybe_cl = await self.crawl(path)
if not maybe_cl:
@ -237,7 +246,9 @@ instance's greatest bottleneck.
if current_element:
self._ensure_not_seen(current_element, path)
return self._crawl_ilias_page(url, current_element, maybe_cl, expected_course_id)
return self._crawl_ilias_page(
url, current_element, maybe_cl, expected_course_id, crawl_nested_courses
)
@anoncritical
async def _crawl_ilias_page(
@ -246,6 +257,7 @@ instance's greatest bottleneck.
current_element: Optional[IliasPageElement],
cl: CrawlToken,
expected_course_id: Optional[int] = None,
crawl_nested_courses: bool = False,
) -> None:
elements: List[IliasPageElement] = []
# A list as variable redefinitions are not propagated to outer scopes
@ -267,12 +279,12 @@ instance's greatest bottleneck.
# If we expect to find a root course, enforce it
if current_parent is None and expected_course_id is not None:
perma_link = IliasPage.get_soup_permalink(soup)
if not perma_link or "crs_" not in perma_link:
if not perma_link or "crs/" not in perma_link:
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
if str(expected_course_id) not in perma_link:
raise CrawlError(f"Expected course id {expected_course_id} but got {perma_link}")
page = IliasPage(soup, next_stage_url, current_parent)
page = IliasPage(soup, current_parent)
if next_element := page.get_next_stage_element():
current_parent = next_element
next_stage_url = next_element.url
@ -294,7 +306,7 @@ instance's greatest bottleneck.
tasks: List[Awaitable[None]] = []
for element in elements:
if handle := await self._handle_ilias_element(cl.path, element):
if handle := await self._handle_ilias_element(cl.path, element, crawl_nested_courses):
tasks.append(asyncio.create_task(handle))
# And execute them
@ -310,12 +322,22 @@ instance's greatest bottleneck.
self,
parent_path: PurePath,
element: IliasPageElement,
crawl_nested_courses: bool = False
) -> Optional[Coroutine[Any, Any, None]]:
# element.name might contain `/` if the crawler created nested elements,
# so we can not sanitize it here. We trust in the output dir to thwart worst-case
# directory escape attacks.
element_path = PurePath(parent_path, element.name)
# This is symptomatic of no access to the element, for example, because
# of time availability restrictions.
if "cmdClass=ilInfoScreenGUI" in element.url and "cmd=showSummary" in element.url:
log.explain(
"Skipping element as url points to info screen, "
"this should only happen with not-yet-released elements"
)
return None
if element.type in _VIDEO_ELEMENTS:
if not self._videos:
log.status(
@ -362,10 +384,70 @@ instance's greatest bottleneck.
"[bright_black](scorm learning modules are not supported)"
)
return None
elif element.type == IliasElementType.LITERATURE_LIST:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](literature lists are not currently supported)"
)
return None
elif element.type == IliasElementType.LEARNING_MODULE_HTML:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](HTML learning modules are not supported)"
)
return None
elif element.type == IliasElementType.BLOG:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](blogs are not currently supported)"
)
return None
elif element.type == IliasElementType.DCL_RECORD_LIST:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](dcl record lists are not currently supported)"
)
return None
elif element.type == IliasElementType.MEDIA_POOL:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](media pools are not currently supported)"
)
return None
elif element.type == IliasElementType.COURSE:
if crawl_nested_courses:
return await self._handle_ilias_page(element.url, element, element_path)
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](not descending into linked course)"
)
return None
elif element.type == IliasElementType.WIKI:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](wikis are not currently supported)"
)
return None
elif element.type == IliasElementType.LEARNING_MODULE:
return await self._handle_learning_module(element, element_path)
elif element.type == IliasElementType.LINK:
return await self._handle_link(element, element_path)
elif element.type == IliasElementType.LINK_COLLECTION:
return await self._handle_link(element, element_path)
elif element.type == IliasElementType.BOOKING:
return await self._handle_booking(element, element_path)
elif element.type == IliasElementType.OPENCAST_VIDEO:
@ -391,44 +473,97 @@ instance's greatest bottleneck.
log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
log.explain(f"Links type is {self._links}")
link_template_maybe = self._links.template()
link_extension = self._links.extension()
if not link_template_maybe or not link_extension:
export_url = url_set_query_param(element.url, "cmd", "exportHTML")
resolved = await self._resolve_link_target(export_url)
if resolved == "none":
links = [LinkData(element.name, "", element.description or "")]
else:
links = self._parse_link_content(element, cast(BeautifulSoup, resolved))
maybe_extension = self._links.extension()
if not maybe_extension:
log.explain("Answer: No")
return None
else:
log.explain("Answer: Yes")
element_path = element_path.with_name(element_path.name + link_extension)
maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl:
if len(links) <= 1 or self._links.collection_as_one():
element_path = element_path.with_name(element_path.name + maybe_extension)
maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl:
return None
return self._download_link(self._links, element.name, links, maybe_dl)
maybe_cl = await self.crawl(element_path)
if not maybe_cl:
return None
# Required for download_all closure
cl = maybe_cl
extension = maybe_extension
return self._download_link(element, link_template_maybe, maybe_dl)
async def download_all() -> None:
for link in links:
path = cl.path / (_sanitize_path_name(link.name) + extension)
if dl := await self.download(path, mtime=element.mtime):
await self._download_link(self._links, element.name, [link], dl)
return download_all()
@anoncritical
@_iorepeat(3, "resolving link")
async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
async with dl as (bar, sink):
export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
real_url = await self._resolve_link_target(export_url)
self._write_link_content(link_template, real_url, element.name, element.description, sink)
def _write_link_content(
async def _download_link(
self,
link_template: str,
url: str,
name: str,
description: Optional[str],
sink: FileSink,
link_renderer: Links,
collection_name: str,
links: list[LinkData],
dl: DownloadToken
) -> None:
content = link_template
content = content.replace("{{link}}", url)
content = content.replace("{{name}}", name)
content = content.replace("{{description}}", str(description))
content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
sink.file.write(content.encode("utf-8"))
sink.done()
async with dl as (bar, sink):
rendered = link_renderer.interpolate(self._link_file_redirect_delay, collection_name, links)
sink.file.write(rendered.encode("utf-8"))
sink.done()
async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal['none']]:
async def impl() -> Optional[Union[BeautifulSoup, Literal['none']]]:
async with self.session.get(export_url, allow_redirects=False) as resp:
# No redirect means we were authenticated
if hdrs.LOCATION not in resp.headers:
return soupify(await resp.read()) # .select_one("a").get("href").strip() # type: ignore
# We are either unauthenticated or the link is not active
new_url = resp.headers[hdrs.LOCATION].lower()
if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
return "none"
return None
auth_id = await self._current_auth_id()
target = await impl()
if target is not None:
return target
await self.authenticate(auth_id)
target = await impl()
if target is not None:
return target
raise CrawlError("resolve_link_target failed even after authenticating")
@staticmethod
def _parse_link_content(element: IliasPageElement, content: BeautifulSoup) -> list[LinkData]:
links = cast(list[Tag], list(content.select("a")))
if len(links) == 1:
url = str(links[0].get("href")).strip()
return [LinkData(name=element.name, description=element.description or "", url=url)]
results = []
for link in links:
url = str(link.get("href")).strip()
name = link.get_text(strip=True)
description = cast(Tag, link.find_next_sibling("dd")).get_text(strip=True)
results.append(LinkData(name=name, description=description, url=url.strip()))
return results
async def _handle_booking(
self,
@ -453,7 +588,7 @@ instance's greatest bottleneck.
self._ensure_not_seen(element, element_path)
return self._download_booking(element, link_template_maybe, maybe_dl)
return self._download_booking(element, maybe_dl)
@anoncritical
@_iorepeat(1, "downloading description")
@ -474,36 +609,13 @@ instance's greatest bottleneck.
async def _download_booking(
self,
element: IliasPageElement,
link_template: str,
dl: DownloadToken,
) -> None:
async with dl as (bar, sink):
self._write_link_content(link_template, element.url, element.name, element.description, sink)
async def _resolve_link_target(self, export_url: str) -> str:
async def impl() -> Optional[str]:
async with self.session.get(export_url, allow_redirects=False) as resp:
# No redirect means we were authenticated
if hdrs.LOCATION not in resp.headers:
return soupify(await resp.read()).select_one("a").get("href").strip() # type: ignore
# We are either unauthenticated or the link is not active
new_url = resp.headers[hdrs.LOCATION].lower()
if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
return ""
return None
auth_id = await self._current_auth_id()
target = await impl()
if target is not None:
return target
await self.authenticate(auth_id)
target = await impl()
if target is not None:
return target
raise CrawlError("resolve_link_target failed even after authenticating")
links = [LinkData(name=element.name, description=element.description or "", url=element.url)]
rendered = self._links.interpolate(self._link_file_redirect_delay, element.name, links)
sink.file.write(rendered.encode("utf-8"))
sink.done()
async def _handle_opencast_video(
self,
@ -590,7 +702,7 @@ instance's greatest bottleneck.
)
async with dl as (bar, sink):
page = IliasPage(await self._get_page(element.url), element.url, element)
page = IliasPage(await self._get_page(element.url), element)
stream_elements = page.get_child_elements()
if len(stream_elements) > 1:
@ -600,7 +712,7 @@ instance's greatest bottleneck.
stream_element = stream_elements[0]
# We do not have a local cache yet
await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
await self._stream_from_url(stream_element, sink, bar, is_video=True)
add_to_report([str(self._transformer.transform(dl.path))])
return
@ -615,7 +727,7 @@ instance's greatest bottleneck.
async with maybe_dl as (bar, sink):
log.explain(f"Streaming video from real url {stream_element.url}")
contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
await self._stream_from_url(stream_element, sink, bar, is_video=True)
add_to_report(contained_video_paths)
@ -637,12 +749,19 @@ instance's greatest bottleneck.
async def _download_file(self, element: IliasPageElement, dl: DownloadToken, is_video: bool) -> None:
assert dl # The function is only reached when dl is not None
async with dl as (bar, sink):
await self._stream_from_url(element.url, sink, bar, is_video)
await self._stream_from_url(element, sink, bar, is_video)
async def _stream_from_url(
self,
element: IliasPageElement,
sink: FileSink,
bar: ProgressBar,
is_video: bool
) -> None:
url = element.url
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
async def try_stream() -> bool:
next_url = url
# Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
# we can not match on the content type here. Instead, we disallow redirects and inspect the
# new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
@ -690,7 +809,7 @@ instance's greatest bottleneck.
await self.authenticate(auth_id)
if not await try_stream():
raise CrawlError("File streaming failed after authenticate()")
raise CrawlError(f"File streaming failed after authenticate() {element!r}")
async def _handle_forum(
self,
@ -705,70 +824,23 @@ instance's greatest bottleneck.
@_iorepeat(3, "crawling forum")
@anoncritical
async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None:
elements: List[IliasForumThread] = []
async with cl:
next_stage_url = element.url
page = None
while next_stage_url:
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {next_stage_url}")
soup = await self._get_page(next_stage_url)
page = IliasPage(soup, next_stage_url, element)
if next := page.get_next_stage_element():
next_stage_url = next.url
else:
break
forum_threads: list[tuple[IliasPageElement, bool]] = []
for entry in cast(IliasPage, page).get_forum_entries():
path = cl.path / (_sanitize_path_name(entry.name) + ".html")
forum_threads.append((entry, self.should_try_download(path, mtime=entry.mtime)))
# Sort the ids. The forum download will *preserve* this ordering
forum_threads.sort(key=lambda elem: elem[0].id())
if not forum_threads:
log.explain("Forum had no threads")
inner = IliasPage(await self._get_page(element.url), element)
export_url = inner.get_forum_export_url()
if not export_url:
log.warn("Could not extract forum export url")
return
download_data = cast(IliasPage, page).get_download_forum_data(
[thread.id() for thread, download in forum_threads if download]
)
if not download_data:
raise CrawlWarning("Failed to extract forum data")
export = await self._post(export_url, {
"format": "html",
"cmd[createExportFile]": ""
})
if not download_data.empty:
html = await self._post_authenticated(download_data.url, download_data.form_data)
elements = parse_ilias_forum_export(soupify(html))
else:
elements = []
# Verify that ILIAS does not change the order, as we depend on it later. Otherwise, we could not call
# download in the correct order, potentially messing up duplication handling.
expected_element_titles = [thread.name for thread, download in forum_threads if download]
actual_element_titles = [_sanitize_path_name(thread.name) for thread in elements]
if expected_element_titles != actual_element_titles:
raise CrawlWarning(
f"Forum thread order mismatch: {expected_element_titles} != {actual_element_titles}"
)
elements = parse_ilias_forum_export(soupify(export))
tasks: List[Awaitable[None]] = []
for thread, download in forum_threads:
if download:
# This only works because ILIAS keeps the order in the export
elem = elements.pop(0)
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem, thread)))
else:
# We only downloaded the threads we "should_try_download"ed. This can be an
# over-approximation and all will be fine.
# If we selected too few, e.g. because there was a duplicate title and the mtime of the
# original is newer than the update of the duplicate.
# This causes stale data locally, but I consider this problem acceptable right now.
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, thread)))
for thread in elements:
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, element.url)))
# And execute them
await self.gather(tasks)
@ -779,7 +851,7 @@ instance's greatest bottleneck.
self,
parent_path: PurePath,
thread: Union[IliasForumThread, IliasPageElement],
element: IliasPageElement
forum_url: str
) -> None:
path = parent_path / (_sanitize_path_name(thread.name) + ".html")
maybe_dl = await self.download(path, mtime=thread.mtime)
@ -789,7 +861,7 @@ instance's greatest bottleneck.
async with maybe_dl as (bar, sink):
rendered = forum_thread_template(
thread.name,
element.url,
forum_url,
thread.name_tag,
await self.internalize_images(thread.content_tag)
)
@ -817,7 +889,7 @@ instance's greatest bottleneck.
log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {element.url}")
soup = await self._get_page(element.url)
page = IliasPage(soup, element.url, element)
page = IliasPage(soup, element)
if next := page.get_learning_module_data():
elements.extend(await self._crawl_learning_module_direction(
cl.path, next.previous_url, "left", element
@ -860,7 +932,7 @@ instance's greatest bottleneck.
log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
log.explain(f"URL: {next_element_url}")
soup = await self._get_page(next_element_url)
page = IliasPage(soup, next_element_url, parent_element)
page = IliasPage(soup, parent_element)
if next := page.get_learning_module_data():
elements.append(next)
if dir == "left":
@ -891,13 +963,13 @@ instance's greatest bottleneck.
if prev:
prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
if prev_p:
prev = os.path.relpath(prev_p, my_path.parent)
prev = cast(str, os.path.relpath(prev_p, my_path.parent))
else:
prev = None
if next:
next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
if next_p:
next = os.path.relpath(next_p, my_path.parent)
next = cast(str, os.path.relpath(next_p, my_path.parent))
else:
next = None
@ -937,10 +1009,10 @@ instance's greatest bottleneck.
)
self._visited_urls[element.url] = parent_path
async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
async def _get_page(self, url: str, root_page_allowed: bool = False) -> IliasSoup:
auth_id = await self._current_auth_id()
async with self.session.get(url) as request:
soup = soupify(await request.read())
soup = IliasSoup(soupify(await request.read()), str(request.url))
if IliasPage.is_logged_in(soup):
return self._verify_page(soup, url, root_page_allowed)
@ -949,13 +1021,13 @@ instance's greatest bottleneck.
# Retry once after authenticating. If this fails, we will die.
async with self.session.get(url) as request:
soup = soupify(await request.read())
soup = IliasSoup(soupify(await request.read()), str(request.url))
if IliasPage.is_logged_in(soup):
return self._verify_page(soup, url, root_page_allowed)
raise CrawlError(f"get_page failed even after authenticating on {url!r}")
@staticmethod
def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
def _verify_page(soup: IliasSoup, url: str, root_page_allowed: bool) -> IliasSoup:
if IliasPage.is_root_page(soup) and not root_page_allowed:
raise CrawlError(
"Unexpectedly encountered ILIAS root page. "
@ -967,29 +1039,19 @@ instance's greatest bottleneck.
)
return soup
async def _post_authenticated(
async def _post(
self,
url: str,
data: dict[str, Union[str, List[str]]]
) -> bytes:
auth_id = await self._current_auth_id()
form_data = aiohttp.FormData()
for key, val in data.items():
form_data.add_field(key, val)
async with self.session.post(url, data=form_data(), allow_redirects=False) as request:
async with self.session.post(url, data=form_data()) as request:
if request.status == 200:
return await request.read()
# We weren't authenticated, so try to do that
await self.authenticate(auth_id)
# Retry once after authenticating. If this fails, we will die.
async with self.session.post(url, data=data, allow_redirects=False) as request:
if request.status == 200:
return await request.read()
raise CrawlError("post_authenticated failed even after authenticating")
raise CrawlError(f"post failed with status {request.status}")
async def _get_authenticated(self, url: str) -> bytes:
auth_id = await self._current_auth_id()
@ -1019,7 +1081,7 @@ instance's greatest bottleneck.
async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request:
login_page = soupify(await request.read())
login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "formlogin"}))
login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "login_form"}))
if login_form is None:
raise CrawlError("Could not find the login form! Specified client id might be invalid.")
@ -1029,42 +1091,12 @@ instance's greatest bottleneck.
username, password = await self._auth.credentials()
login_data = {
"username": username,
"password": password,
"cmd[doStandardAuthentication]": "Login",
}
login_form_data = aiohttp.FormData()
login_form_data.add_field('login_form/input_3/input_4', username)
login_form_data.add_field('login_form/input_3/input_5', password)
# do the actual login
async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request:
soup = soupify(await request.read())
if not self._is_logged_in(soup):
async with self.session.post(urljoin(self._base_url, login_url), data=login_form_data) as request:
soup = IliasSoup(soupify(await request.read()), str(request.url))
if not IliasPage.is_logged_in(soup):
self._auth.invalidate_credentials()
@staticmethod
def _is_logged_in(soup: BeautifulSoup) -> bool:
# Normal ILIAS pages
mainbar = cast(Optional[Tag], soup.find(class_="il-maincontrols-metabar"))
if mainbar is not None:
login_button = mainbar.find(attrs={"href": lambda x: x is not None and "login.php" in x})
shib_login = soup.find(id="button_shib_login")
return not login_button and not shib_login
# Personal Desktop
if soup.find("a", attrs={"href": lambda x: x is not None and "block_type=pditems" in x}):
return True
# Video listing embeds do not have complete ILIAS html. Try to match them by
# their video listing table
video_table = soup.find(
recursive=True,
name="table",
attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
)
if video_table is not None:
return True
# The individual video player wrapper page has nothing of the above.
# Match it by its playerContainer.
if soup.select_one("#playerContainer") is not None:
return True
return False

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +1,8 @@
import asyncio
import sys
import traceback
from contextlib import asynccontextmanager, contextmanager
# TODO In Python 3.9 and above, ContextManager is deprecated
from typing import AsyncIterator, ContextManager, Iterator, List, Optional
from contextlib import AbstractContextManager, asynccontextmanager, contextmanager
from typing import AsyncIterator, Iterator, List, Optional
from rich.console import Console, Group
from rich.live import Live
@ -261,7 +260,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
action: str,
text: str,
total: Optional[float] = None,
) -> ContextManager[ProgressBar]:
) -> AbstractContextManager[ProgressBar]:
"""
Allows markup in the "style" argument which will be applied to the
"action" string.
@ -277,7 +276,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
action: str,
text: str,
total: Optional[float] = None,
) -> ContextManager[ProgressBar]:
) -> AbstractContextManager[ProgressBar]:
"""
Allows markup in the "style" argument which will be applied to the
"action" string.

View File

@ -34,15 +34,6 @@ class MarkConflictError(Exception):
self.collides_with = collides_with
# TODO Use PurePath.is_relative_to when updating to 3.9
def is_relative_to(a: PurePath, b: PurePath) -> bool:
try:
a.relative_to(b)
return True
except ValueError:
return False
class Report:
"""
A report of a synchronization. Includes all files found by the crawler, as
@ -173,7 +164,7 @@ class Report:
if path == other:
raise MarkDuplicateError(path)
if is_relative_to(path, other) or is_relative_to(other, path):
if path.is_relative_to(other) or other.is_relative_to(path):
raise MarkConflictError(path, other)
self.known_files.add(path)

View File

@ -1,2 +1,2 @@
NAME = "PFERD"
VERSION = "3.7.0"
VERSION = "3.8.3"

View File

@ -17,7 +17,7 @@ Binaries for Linux, Windows and Mac can be downloaded directly from the
### With pip
Ensure you have at least Python 3.9 installed. Run the following command to
Ensure you have at least Python 3.11 installed. Run the following command to
install PFERD or upgrade it to the latest version:
```

8
flake.lock generated
View File

@ -2,16 +2,16 @@
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1708979614,
"narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
"lastModified": 1744440957,
"narHash": "sha256-FHlSkNqFmPxPJvy+6fNLaNeWnF1lZSgqVCl/eWaJRc4=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
"rev": "26d499fc9f1d567283d5d56fcf367edd815dba1d",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-23.11",
"ref": "nixos-24.11",
"repo": "nixpkgs",
"type": "github"
}

View File

@ -2,7 +2,7 @@
description = "Tool for downloading course-related files from ILIAS";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11";
};
outputs = { self, nixpkgs }:

View File

@ -12,7 +12,7 @@ dependencies = [
"certifi>=2021.10.8"
]
dynamic = ["version"]
requires-python = ">=3.9"
requires-python = ">=3.11"
[project.scripts]
pferd = "PFERD.__main__:main"