Bump version to 3.8.0

Update minimum Python version to 3.11
Support ILIAS 9
2025-07-12 22:22:30 +02:00 · 2025-04-15 11:32:22 +02:00 · 2025-04-15 11:31:39 +02:00 · 2025-04-15 11:19:53 +02:00 · 2025-03-09 23:44:25 +01:00 · 2025-02-19 16:23:20 +01:00
20 changed files with 1055 additions and 567 deletions
--- a/.github/workflows/build-and-release.yml
+++ b/.github/workflows/build-and-release.yml
@ -14,7 +14,7 @@ jobs:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-13, macos-latest]
-        python: ["3.9"]
+        python: ["3.11"]
    steps:
      - uses: actions/checkout@v4

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -22,6 +22,20 @@ ambiguous situations.

 ## Unreleased

+## 3.8.0 - 2025-04-15
+
+### Added
+- Support for ILIAS 9
+
+### Changed
+- Added prettier CSS to forum threads
+- Increase minimum supported Python version to 3.11
+
+### Fixed
+- File links in report on Windows
+- TOTP authentication in KIT Shibboleth
+- Forum crawling only considering the first 20 entries
+
 ## 3.7.0 - 2024-11-13

 ### Added
--- a/PFERD/auth/keyring.py
+++ b/PFERD/auth/keyring.py
@ -1,4 +1,4 @@
-from typing import Optional, Tuple
+from typing import Optional, Tuple, cast

 import keyring

@ -13,7 +13,7 @@ class KeyringAuthSection(AuthSection):
        return self.s.get("username")

    def keyring_name(self) -> str:
-        return self.s.get("keyring_name", fallback=NAME)
+        return cast(str, self.s.get("keyring_name", fallback=NAME))


 class KeyringAuthenticator(Authenticator):
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@ -149,9 +149,7 @@ class CrawlerSection(Section):
        return self.s.getboolean("skip", fallback=False)

    def output_dir(self, name: str) -> Path:
-        # TODO Use removeprefix() after switching to 3.9
-        if name.startswith("crawl:"):
-            name = name[len("crawl:"):]
+        name = name.removeprefix("crawl:")
        return Path(self.s.get("output_dir", name)).expanduser()

    def redownload(self) -> Redownload:
@ -294,6 +292,35 @@ class Crawler(ABC):
        log.explain("Answer: Yes")
        return CrawlToken(self._limiter, path)

+    def should_try_download(
+            self,
+            path: PurePath,
+            *,
+            etag_differs: Optional[bool] = None,
+            mtime: Optional[datetime] = None,
+            redownload: Optional[Redownload] = None,
+            on_conflict: Optional[OnConflict] = None,
+    ) -> bool:
+        log.explain_topic(f"Decision: Should Download {fmt_path(path)}")
+
+        if self._transformer.transform(path) is None:
+            log.explain("Answer: No (ignored)")
+            return False
+
+        should_download = self._output_dir.should_try_download(
+            path,
+            etag_differs=etag_differs,
+            mtime=mtime,
+            redownload=redownload,
+            on_conflict=on_conflict
+        )
+        if should_download:
+            log.explain("Answer: Yes")
+            return True
+        else:
+            log.explain("Answer: No")
+            return False
+
    async def download(
            self,
            path: PurePath,
--- a/PFERD/crawl/http_crawler.py
+++ b/PFERD/crawl/http_crawler.py
@ -3,7 +3,7 @@ import http.cookies
 import ssl
 from datetime import datetime
 from pathlib import Path, PurePath
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, cast

 import aiohttp
 import certifi
@ -22,7 +22,7 @@ ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"

 class HttpCrawlerSection(CrawlerSection):
    def http_timeout(self) -> float:
-        return self.s.getfloat("http_timeout", fallback=20)
+        return self.s.getfloat("http_timeout", fallback=30)


 class HttpCrawler(Crawler):
@ -187,12 +187,12 @@ class HttpCrawler(Crawler):
            if level == 0 or (level == 1 and drop_h1):
                return PurePath()

-            level_heading = tag.find_previous(name=f"h{level}")
+            level_heading = cast(Optional[Tag], tag.find_previous(name=f"h{level}"))

            if level_heading is None:
                return find_associated_headings(tag, level - 1)

-            folder_name = level_heading.getText().strip()
+            folder_name = level_heading.get_text().strip()
            return find_associated_headings(level_heading, level - 1) / folder_name

        # start at level <h3> because paragraph-level headings are usually too granular for folder names
@ -231,6 +231,7 @@ class HttpCrawler(Crawler):

                etag_header = resp.headers.get("ETag")
                last_modified_header = resp.headers.get("Last-Modified")
+                last_modified = None

                if last_modified_header:
                    try:
--- a/PFERD/crawl/ilias/file_templates.py
+++ b/PFERD/crawl/ilias/file_templates.py
@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Optional
+from typing import Optional, cast

 import bs4

@ -126,6 +126,88 @@ _learning_module_template = """
 </html>
 """

+_forum_thread_template = """
+<!DOCTYPE html>
+<html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <title>ILIAS - Forum: {{name}}</title>
+        <style>
+            * {
+                box-sizing: border-box;
+            }
+            body {
+                font-family: 'Open Sans', Verdana, Arial, Helvetica, sans-serif;
+                padding: 8px;
+            }
+            ul, ol, p {
+                margin: 1.2em 0;
+            }
+            p {
+                margin-top: 8px;
+                margin-bottom: 8px;
+            }
+            a {
+                color: #00876c;
+                text-decoration: none;
+                cursor: pointer;
+            }
+            a:hover {
+                text-decoration: underline;
+            }
+            body > p:first-child > span:first-child {
+                font-size: 1.6em;
+            }
+            body > p:first-child > span:first-child ~ span.default {
+                display: inline-block;
+                font-size: 1.2em;
+                padding-bottom: 8px;
+            }
+            .ilFrmPostContent {
+                margin-top: 8px;
+                max-width: 64em;
+            }
+            .ilFrmPostContent > *:first-child {
+                margin-top: 0px;
+            }
+            .ilFrmPostTitle {
+                margin-top: 24px;
+                color: #00876c;
+                font-weight: bold;
+            }
+            #ilFrmPostList {
+                list-style: none;
+                padding-left: 0;
+            }
+            li.ilFrmPostRow {
+                padding: 3px 0 3px 3px;
+                margin-bottom: 24px;
+                border-left: 6px solid #dddddd;
+            }
+            .ilFrmPostRow > div {
+                display: flex;
+            }
+            .ilFrmPostImage img {
+                margin: 0 !important;
+                padding: 6px 9px 9px 6px;
+            }
+            .ilUserIcon {
+                width: 115px;
+            }
+            .small {
+                text-decoration: none;
+                font-size: 0.75rem;
+                color: #6f6f6f;
+            }
+        </style>
+    </head>
+    <body>
+    {{heading}}
+    {{content}}
+    </body>
+</html>
+""".strip()  # noqa: E501 line too long
+

 def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str:
    # Seems to be comments, ignore those.
@ -139,13 +221,13 @@ def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next
        </div>
    """
    if prev and body.select_one(".ilc_page_lnav_LeftNavigation"):
-        text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip()
+        text = cast(bs4.Tag, body.select_one(".ilc_page_lnav_LeftNavigation")).get_text().strip()
        left = f'<a href="{prev}">{text}</a>'
    else:
        left = "<span></span>"

    if next and body.select_one(".ilc_page_rnav_RightNavigation"):
-        text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip()
+        text = cast(bs4.Tag, body.select_one(".ilc_page_rnav_RightNavigation")).get_text().strip()
        right = f'<a href="{next}">{text}</a>'
    else:
        right = "<span></span>"
@ -160,8 +242,17 @@ def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next
            "{{left}}", left).replace("{{right}}", right).encode())
        )

-    body = body.prettify()
-    return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name)
+    body_str = cast(str, body.prettify())
+    return _learning_module_template.replace("{{body}}", body_str).replace("{{name}}", name)
+
+
+def forum_thread_template(name: str, url: str, heading: bs4.Tag, content: bs4.Tag) -> str:
+    if title := cast(Optional[bs4.Tag], heading.find(name="b")):
+        title.wrap(bs4.Tag(name="a", attrs={"href": url}))
+    return _forum_thread_template \
+        .replace("{{name}}", name) \
+        .replace("{{heading}}", cast(str, heading.prettify())) \
+        .replace("{{content}}", cast(str, content.prettify()))


 class Links(Enum):
--- a/PFERD/crawl/ilias/ilias_html_cleaner.py
+++ b/PFERD/crawl/ilias/ilias_html_cleaner.py
@ -1,3 +1,5 @@
+from typing import cast
+
 from bs4 import BeautifulSoup, Comment, Tag

 _STYLE_TAG_CONTENT = """
@ -70,18 +72,18 @@ def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:


 def clean(soup: BeautifulSoup) -> BeautifulSoup:
-    for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES):
+    for block in cast(list[Tag], soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES)):
        block.name = "article"

-    for block in soup.find_all("h3"):
+    for block in cast(list[Tag], soup.find_all("h3")):
        block.name = "div"

-    for block in soup.find_all("h1"):
+    for block in cast(list[Tag], soup.find_all("h1")):
        block.name = "h3"

-    for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"):
+    for block in cast(list[Tag], soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap")):
        block.name = "h3"
-        block["class"] += ["accordion-head"]
+        block["class"] += ["accordion-head"]  # type: ignore

    for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"):
        children = list(dummy.children)
@ -97,7 +99,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup:
        if figure := video.find_parent("figure"):
            figure.decompose()

-    for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
+    for hrule_imposter in cast(list[Tag], soup.find_all(class_="ilc_section_Separator")):
        hrule_imposter.insert(0, soup.new_tag("hr"))

    return soup
--- a/PFERD/crawl/ilias/ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/ilias_web_crawler.py
@ -19,10 +19,10 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .async_helper import _iorepeat
-from .file_templates import Links, learning_module_template
+from .file_templates import Links, forum_thread_template, learning_module_template
 from .ilias_html_cleaner import clean, insert_base_markup
 from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
-                             IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)
+                             IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
 from .shibboleth_login import ShibbolethLogin

 TargetType = Union[str, int]
@ -105,7 +105,6 @@ class IliasWebCrawlerSection(HttpCrawlerSection):


 _DIRECTORY_PAGES: Set[IliasElementType] = {
-    IliasElementType.COURSE,
    IliasElementType.EXERCISE,
    IliasElementType.EXERCISE_FILES,
    IliasElementType.FOLDER,
@ -257,6 +256,7 @@ instance's greatest bottleneck.
            async with cl:
                next_stage_url: Optional[str] = url
                current_parent = current_element
+                page = None

                while next_stage_url:
                    soup = await self._get_page(next_stage_url)
@ -266,18 +266,19 @@ instance's greatest bottleneck.
                    # If we expect to find a root course, enforce it
                    if current_parent is None and expected_course_id is not None:
                        perma_link = IliasPage.get_soup_permalink(soup)
-                        if not perma_link or "crs_" not in perma_link:
+                        if not perma_link or "crs/" not in perma_link:
                            raise CrawlError("Invalid course id? Didn't find anything looking like a course")
                        if str(expected_course_id) not in perma_link:
                            raise CrawlError(f"Expected course id {expected_course_id} but got {perma_link}")

-                    page = IliasPage(soup, next_stage_url, current_parent)
+                    page = IliasPage(soup, current_parent)
                    if next_element := page.get_next_stage_element():
                        current_parent = next_element
                        next_stage_url = next_element.url
                    else:
                        next_stage_url = None

+                page = cast(IliasPage, page)
                elements.extend(page.get_child_elements())
                if description_string := page.get_description():
                    description.append(description_string)
@ -360,6 +361,54 @@ instance's greatest bottleneck.
                "[bright_black](scorm learning modules are not supported)"
            )
            return None
+        elif element.type == IliasElementType.LITERATURE_LIST:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](literature lists are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.LEARNING_MODULE_HTML:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](HTML learning modules are not supported)"
+            )
+            return None
+        elif element.type == IliasElementType.BLOG:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](blogs are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.DCL_RECORD_LIST:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](dcl record lists are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.MEDIA_POOL:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](media pools are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.COURSE:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](not descending into linked course, download it separately)"
+            )
+            return None
        elif element.type == IliasElementType.LEARNING_MODULE:
            return await self._handle_learning_module(element, element_path)
        elif element.type == IliasElementType.LINK:
@ -461,10 +510,10 @@ instance's greatest bottleneck.
        if not dl:
            return

-        async with dl as (bar, sink):
+        async with dl as (_bar, sink):
            description = clean(insert_base_markup(description))
-            description = await self.internalize_images(description)
-            sink.file.write(description.prettify().encode("utf-8"))
+            description_tag = await self.internalize_images(description)
+            sink.file.write(cast(str, description_tag.prettify()).encode("utf-8"))
            sink.done()

    @anoncritical
@ -483,7 +532,7 @@ instance's greatest bottleneck.
            async with self.session.get(export_url, allow_redirects=False) as resp:
                # No redirect means we were authenticated
                if hdrs.LOCATION not in resp.headers:
-                    return soupify(await resp.read()).select_one("a").get("href").strip()
+                    return soupify(await resp.read()).select_one("a").get("href").strip()  # type: ignore
                # We are either unauthenticated or the link is not active
                new_url = resp.headers[hdrs.LOCATION].lower()
                if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
@ -588,7 +637,7 @@ instance's greatest bottleneck.
            )

        async with dl as (bar, sink):
-            page = IliasPage(await self._get_page(element.url), element.url, element)
+            page = IliasPage(await self._get_page(element.url), element)
            stream_elements = page.get_child_elements()

            if len(stream_elements) > 1:
@ -598,7 +647,7 @@ instance's greatest bottleneck.
                stream_element = stream_elements[0]

                # We do not have a local cache yet
-                await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
+                await self._stream_from_url(stream_element, sink, bar, is_video=True)
                add_to_report([str(self._transformer.transform(dl.path))])
                return

@ -613,7 +662,7 @@ instance's greatest bottleneck.
            async with maybe_dl as (bar, sink):
                log.explain(f"Streaming video from real url {stream_element.url}")
                contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
-                await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
+                await self._stream_from_url(stream_element, sink, bar, is_video=True)

        add_to_report(contained_video_paths)

@ -635,12 +684,19 @@ instance's greatest bottleneck.
    async def _download_file(self, element: IliasPageElement, dl: DownloadToken, is_video: bool) -> None:
        assert dl  # The function is only reached when dl is not None
        async with dl as (bar, sink):
-            await self._stream_from_url(element.url, sink, bar, is_video)
+            await self._stream_from_url(element, sink, bar, is_video)
+
+    async def _stream_from_url(
+        self,
+        element: IliasPageElement,
+        sink: FileSink,
+        bar: ProgressBar,
+        is_video: bool
+    ) -> None:
+        url = element.url

-    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
        async def try_stream() -> bool:
            next_url = url
-
            # Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
            # we can not match on the content type here. Instead, we disallow redirects and inspect the
            # new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
@ -688,7 +744,7 @@ instance's greatest bottleneck.
        await self.authenticate(auth_id)

        if not await try_stream():
-            raise CrawlError("File streaming failed after authenticate()")
+            raise CrawlError(f"File streaming failed after authenticate() {element!r}")

    async def _handle_forum(
        self,
@ -707,32 +763,66 @@ instance's greatest bottleneck.

        async with cl:
            next_stage_url = element.url
+            page = None
+
            while next_stage_url:
                log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
                log.explain(f"URL: {next_stage_url}")

                soup = await self._get_page(next_stage_url)
-                page = IliasPage(soup, next_stage_url, element)
+                page = IliasPage(soup, element)

                if next := page.get_next_stage_element():
                    next_stage_url = next.url
                else:
                    break

-            download_data = page.get_download_forum_data()
-            if not download_data:
-                raise CrawlWarning("Failed to extract forum data")
-            if download_data.empty:
+            forum_threads: list[tuple[IliasPageElement, bool]] = []
+            for entry in cast(IliasPage, page).get_forum_entries():
+                path = cl.path / (_sanitize_path_name(entry.name) + ".html")
+                forum_threads.append((entry, self.should_try_download(path, mtime=entry.mtime)))
+
+            # Sort the ids. The forum download will *preserve* this ordering
+            forum_threads.sort(key=lambda elem: elem[0].id())
+
+            if not forum_threads:
                log.explain("Forum had no threads")
                return
+
+            download_data = cast(IliasPage, page).get_download_forum_data(
+                [thread.id() for thread, download in forum_threads if download]
+            )
+            if not download_data:
+                raise CrawlWarning("Failed to extract forum data")
+
+            if not download_data.empty:
                html = await self._post_authenticated(download_data.url, download_data.form_data)
                elements = parse_ilias_forum_export(soupify(html))
+            else:
+                elements = []

-        elements.sort(key=lambda elem: elem.title)
+        # Verify that ILIAS does not change the order, as we depend on it later. Otherwise, we could not call
+        # download in the correct order, potentially messing up duplication handling.
+        expected_element_titles = [thread.name for thread, download in forum_threads if download]
+        actual_element_titles = [_sanitize_path_name(thread.name) for thread in elements]
+        if expected_element_titles != actual_element_titles:
+            raise CrawlWarning(
+                f"Forum thread order mismatch: {expected_element_titles} != {actual_element_titles}"
+            )

        tasks: List[Awaitable[None]] = []
-        for elem in elements:
-            tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
+        for thread, download in forum_threads:
+            if download:
+                # This only works because ILIAS keeps the order in the export
+                elem = elements.pop(0)
+                tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem, thread)))
+            else:
+                # We only downloaded the threads we "should_try_download"ed. This can be an
+                # over-approximation and all will be fine.
+                # We may also have selected too few, e.g. because there was a duplicate title and the
+                # mtime of the original is newer than the update of the duplicate.
+                # This causes stale data locally, but I consider this problem acceptable right now.
+                tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, thread)))

        # And execute them
        await self.gather(tasks)
@ -742,18 +832,22 @@ instance's greatest bottleneck.
    async def _download_forum_thread(
        self,
        parent_path: PurePath,
-        element: IliasForumThread,
+        thread: Union[IliasForumThread, IliasPageElement],
+        element: IliasPageElement
    ) -> None:
-        path = parent_path / (_sanitize_path_name(element.title) + ".html")
-        maybe_dl = await self.download(path, mtime=element.mtime)
-        if not maybe_dl:
+        path = parent_path / (_sanitize_path_name(thread.name) + ".html")
+        maybe_dl = await self.download(path, mtime=thread.mtime)
+        if not maybe_dl or not isinstance(thread, IliasForumThread):
            return

        async with maybe_dl as (bar, sink):
-            content = "<!DOCTYPE html>\n"
-            content += element.title_tag.prettify()
-            content += element.content_tag.prettify()
-            sink.file.write(content.encode("utf-8"))
+            rendered = forum_thread_template(
+                thread.name,
+                element.url,
+                thread.name_tag,
+                await self.internalize_images(thread.content_tag)
+            )
+            sink.file.write(rendered.encode("utf-8"))
            sink.done()

    async def _handle_learning_module(
@ -777,7 +871,7 @@ instance's greatest bottleneck.
            log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
            log.explain(f"URL: {element.url}")
            soup = await self._get_page(element.url)
-            page = IliasPage(soup, element.url, element)
+            page = IliasPage(soup, element)
            if next := page.get_learning_module_data():
                elements.extend(await self._crawl_learning_module_direction(
                    cl.path, next.previous_url, "left", element
@ -820,7 +914,7 @@ instance's greatest bottleneck.
            log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
            log.explain(f"URL: {next_element_url}")
            soup = await self._get_page(next_element_url)
-            page = IliasPage(soup, next_element_url, parent_element)
+            page = IliasPage(soup, parent_element)
            if next := page.get_learning_module_data():
                elements.append(next)
                if dir == "left":
@ -851,13 +945,13 @@ instance's greatest bottleneck.
        if prev:
            prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
            if prev_p:
-                prev = os.path.relpath(prev_p, my_path.parent)
+                prev = cast(str, os.path.relpath(prev_p, my_path.parent))
            else:
                prev = None
        if next:
            next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
            if next_p:
-                next = os.path.relpath(next_p, my_path.parent)
+                next = cast(str, os.path.relpath(next_p, my_path.parent))
            else:
                next = None

@ -877,15 +971,15 @@ instance's greatest bottleneck.
                continue
            if elem.name == "img":
                if src := elem.attrs.get("src", None):
-                    url = urljoin(self._base_url, src)
+                    url = urljoin(self._base_url, cast(str, src))
                    if not url.startswith(self._base_url):
                        continue
                    log.explain(f"Internalizing {url!r}")
                    img = await self._get_authenticated(url)
                    elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
-            if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"):
+            if elem.name == "iframe" and cast(str, elem.attrs.get("src", "")).startswith("//"):
                # For unknown reasons the protocol seems to be stripped.
-                elem.attrs["src"] = "https:" + elem.attrs["src"]
+                elem.attrs["src"] = "https:" + cast(str, elem.attrs["src"])
        return tag

    def _ensure_not_seen(self, element: IliasPageElement, parent_path: PurePath) -> None:
@ -897,10 +991,10 @@ instance's greatest bottleneck.
            )
        self._visited_urls[element.url] = parent_path

-    async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
+    async def _get_page(self, url: str, root_page_allowed: bool = False) -> IliasSoup:
        auth_id = await self._current_auth_id()
        async with self.session.get(url) as request:
-            soup = soupify(await request.read())
+            soup = IliasSoup(soupify(await request.read()), str(request.url))
            if IliasPage.is_logged_in(soup):
                return self._verify_page(soup, url, root_page_allowed)

@ -909,13 +1003,13 @@ instance's greatest bottleneck.

        # Retry once after authenticating. If this fails, we will die.
        async with self.session.get(url) as request:
-            soup = soupify(await request.read())
+            soup = IliasSoup(soupify(await request.read()), str(request.url))
            if IliasPage.is_logged_in(soup):
                return self._verify_page(soup, url, root_page_allowed)
        raise CrawlError(f"get_page failed even after authenticating on {url!r}")

    @staticmethod
-    def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
+    def _verify_page(soup: IliasSoup, url: str, root_page_allowed: bool) -> IliasSoup:
        if IliasPage.is_root_page(soup) and not root_page_allowed:
            raise CrawlError(
                "Unexpectedly encountered ILIAS root page. "
@ -979,11 +1073,11 @@ instance's greatest bottleneck.
            async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request:
                login_page = soupify(await request.read())

-            login_form = login_page.find("form", attrs={"name": "formlogin"})
+            login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "formlogin"}))
            if login_form is None:
                raise CrawlError("Could not find the login form! Specified client id might be invalid.")

-            login_url = login_form.attrs.get("action")
+            login_url = cast(Optional[str], login_form.attrs.get("action"))
            if login_url is None:
                raise CrawlError("Could not find the action URL in the login form!")

@ -997,34 +1091,6 @@ instance's greatest bottleneck.

            # do the actual login
            async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request:
-                soup = soupify(await request.read())
-                if not self._is_logged_in(soup):
+                soup = IliasSoup(soupify(await request.read()), str(request.url))
+                if not IliasPage.is_logged_in(soup):
                    self._auth.invalidate_credentials()
-
-    @staticmethod
-    def _is_logged_in(soup: BeautifulSoup) -> bool:
-        # Normal ILIAS pages
-        mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
-        if mainbar is not None:
-            login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x})
-            shib_login = soup.find(id="button_shib_login")
-            return not login_button and not shib_login
-
-        # Personal Desktop
-        if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
-            return True
-
-        # Video listing embeds do not have complete ILIAS html. Try to match them by
-        # their video listing table
-        video_table = soup.find(
-            recursive=True,
-            name="table",
-            attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
-        )
-        if video_table is not None:
-            return True
-        # The individual video player wrapper page has nothing of the above.
-        # Match it by its playerContainer.
-        if soup.select_one("#playerContainer") is not None:
-            return True
-        return False
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
--- a/PFERD/crawl/ilias/shibboleth_login.py
+++ b/PFERD/crawl/ilias/shibboleth_login.py
@ -1,8 +1,8 @@
-from typing import Any, Optional
+from typing import Any, Optional, cast

 import aiohttp
 import yarl
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag

 from ...auth import Authenticator, TfaAuthenticator
 from ...logging import log
@ -48,8 +48,8 @@ class ShibbolethLogin:
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
-            form = soup.find("form", {"method": "post"})
-            action = form["action"]
+            form = cast(Tag, soup.find("form", {"method": "post"}))
+            action = cast(str, form["action"])

            # Equivalent: Enter credentials in
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
@ -59,9 +59,10 @@ class ShibbolethLogin:
                "_eventId_proceed": "",
                "j_username": username,
                "j_password": password,
+                "fudis_web_authn_assertion_input": "",
            }
            if csrf_token_input := form.find("input", {"name": "csrf_token"}):
-                data["csrf_token"] = csrf_token_input["value"]
+                data["csrf_token"] = csrf_token_input["value"]  # type: ignore
            soup = await _post(sess, url, data)

            if soup.find(id="attributeRelease"):
@ -78,14 +79,14 @@ class ShibbolethLogin:

        # Equivalent: Being redirected via JS automatically
        # (or clicking "Continue" if you have JS disabled)
-        relay_state = soup.find("input", {"name": "RelayState"})
-        saml_response = soup.find("input", {"name": "SAMLResponse"})
-        url = form = soup.find("form", {"method": "post"})["action"]
+        relay_state = cast(Tag, soup.find("input", {"name": "RelayState"}))
+        saml_response = cast(Tag, soup.find("input", {"name": "SAMLResponse"}))
+        url = form = soup.find("form", {"method": "post"})["action"]  # type: ignore
        data = {  # using the info obtained in the while loop above
-            "RelayState": relay_state["value"],
-            "SAMLResponse": saml_response["value"],
+            "RelayState": cast(str, relay_state["value"]),
+            "SAMLResponse": cast(str, saml_response["value"]),
        }
-        await sess.post(url, data=data)
+        await sess.post(cast(str, url), data=data)

    async def _authenticate_tfa(
        self, session: aiohttp.ClientSession, soup: BeautifulSoup, shib_url: yarl.URL
@ -97,8 +98,8 @@ class ShibbolethLogin:

        # Searching the form here so that this fails before asking for
        # credentials rather than after asking.
-        form = soup.find("form", {"method": "post"})
-        action = form["action"]
+        form = cast(Tag, soup.find("form", {"method": "post"}))
+        action = cast(str, form["action"])

        # Equivalent: Enter token in
        # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
@ -106,10 +107,10 @@ class ShibbolethLogin:
        username, password = await self._auth.credentials()
        data = {
            "_eventId_proceed": "",
-            "j_tokenNumber": tfa_token,
+            "fudis_otp_input": tfa_token,
        }
        if csrf_token_input := form.find("input", {"name": "csrf_token"}):
-            data["csrf_token"] = csrf_token_input["value"]
+            data["csrf_token"] = csrf_token_input["value"]  # type: ignore
        return await _post(session, url, data)

    @staticmethod
@ -120,7 +121,7 @@ class ShibbolethLogin:

    @staticmethod
    def _tfa_required(soup: BeautifulSoup) -> bool:
-        return soup.find(id="j_tokenNumber") is not None
+        return soup.find(id="fudiscr-form") is not None


 async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@ -3,7 +3,7 @@ import re
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import PurePath
-from typing import Any, Awaitable, Generator, Iterable, List, Optional, Pattern, Tuple, Union
+from typing import Any, Awaitable, Generator, Iterable, List, Optional, Pattern, Tuple, Union, cast
 from urllib.parse import urljoin

 from bs4 import BeautifulSoup, Tag
@ -156,11 +156,11 @@ class KitIpdCrawler(HttpCrawler):
        name = os.path.basename(url)
        return KitIpdFile(name, url)

-    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
-        return tag.findAll(name="a", attrs={"href": self._file_regex})
+    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> list[Tag]:
+        return cast(list[Tag], tag.find_all(name="a", attrs={"href": self._file_regex}))

    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
-        return urljoin(url, link_tag.get("href"))
+        return urljoin(url, cast(str, link_tag.get("href")))

    async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
        async with self.session.get(url, allow_redirects=False) as resp:
--- a/PFERD/logging.py
+++ b/PFERD/logging.py
@ -1,9 +1,8 @@
 import asyncio
 import sys
 import traceback
-from contextlib import asynccontextmanager, contextmanager
-# TODO In Python 3.9 and above, ContextManager is deprecated
-from typing import AsyncIterator, ContextManager, Iterator, List, Optional
+from contextlib import AbstractContextManager, asynccontextmanager, contextmanager
+from typing import AsyncIterator, Iterator, List, Optional

 from rich.console import Console, Group
 from rich.live import Live
@ -261,7 +260,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
            action: str,
            text: str,
            total: Optional[float] = None,
-    ) -> ContextManager[ProgressBar]:
+    ) -> AbstractContextManager[ProgressBar]:
        """
        Allows markup in the "style" argument which will be applied to the
        "action" string.
@ -277,7 +276,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
            action: str,
            text: str,
            total: Optional[float] = None,
-    ) -> ContextManager[ProgressBar]:
+    ) -> AbstractContextManager[ProgressBar]:
        """
        Allows markup in the "style" argument which will be applied to the
        "action" string.
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@ -371,6 +371,22 @@ class OutputDirectory:

        raise OutputDirError("Failed to create temporary file")

+    def should_try_download(
+        self,
+        path: PurePath,
+        *,
+        etag_differs: Optional[bool] = None,
+        mtime: Optional[datetime] = None,
+        redownload: Optional[Redownload] = None,
+        on_conflict: Optional[OnConflict] = None,
+    ) -> bool:
+        heuristics = Heuristics(etag_differs, mtime)
+        redownload = self._redownload if redownload is None else redownload
+        on_conflict = self._on_conflict if on_conflict is None else on_conflict
+        local_path = self.resolve(path)
+
+        return self._should_download(local_path, heuristics, redownload, on_conflict)
+
    async def download(
            self,
            remote_path: PurePath,
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@ -1,6 +1,5 @@
 from pathlib import Path, PurePath
 from typing import Dict, List, Optional
-from urllib.parse import quote

 from rich.markup import escape

@ -171,9 +170,7 @@ class Pferd:

            def fmt_path_link(relative_path: PurePath) -> str:
                # We need to URL-encode the path because it might contain spaces or special characters
-                absolute_path = str(crawler.output_dir.resolve(relative_path).absolute())
-                absolute_path = absolute_path.replace("\\\\?\\", "")
-                link = f"file://{quote(absolute_path)}"
+                link = crawler.output_dir.resolve(relative_path).absolute().as_uri()
                return f"[link={link}]{fmt_path(relative_path)}[/link]"

            something_changed = False
--- a/PFERD/report.py
+++ b/PFERD/report.py
@ -34,15 +34,6 @@ class MarkConflictError(Exception):
        self.collides_with = collides_with


-# TODO Use PurePath.is_relative_to when updating to 3.9
-def is_relative_to(a: PurePath, b: PurePath) -> bool:
-    try:
-        a.relative_to(b)
-        return True
-    except ValueError:
-        return False
-
-
 class Report:
    """
    A report of a synchronization. Includes all files found by the crawler, as
@ -173,7 +164,7 @@ class Report:
            if path == other:
                raise MarkDuplicateError(path)

-            if is_relative_to(path, other) or is_relative_to(other, path):
+            if path.is_relative_to(other) or other.is_relative_to(path):
                raise MarkConflictError(path, other)

        self.known_files.add(path)
--- a/PFERD/version.py
+++ b/PFERD/version.py
@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.7.0"
+VERSION = "3.8.0"
--- a/README.md
+++ b/README.md
@ -17,7 +17,7 @@ Binaries for Linux, Windows and Mac can be downloaded directly from the

 ### With pip

-Ensure you have at least Python 3.9 installed. Run the following command to
+Ensure you have at least Python 3.11 installed. Run the following command to
 install PFERD or upgrade it to the latest version:

 ```
--- a/flake.lock
+++ b/flake.lock
@ -2,16 +2,16 @@
  "nodes": {
    "nixpkgs": {
      "locked": {
-        "lastModified": 1708979614,
-        "narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
+        "lastModified": 1744440957,
+        "narHash": "sha256-FHlSkNqFmPxPJvy+6fNLaNeWnF1lZSgqVCl/eWaJRc4=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
+        "rev": "26d499fc9f1d567283d5d56fcf367edd815dba1d",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
-        "ref": "nixos-23.11",
+        "ref": "nixos-24.11",
        "repo": "nixpkgs",
        "type": "github"
      }
--- a/flake.nix
+++ b/flake.nix
@ -2,7 +2,7 @@
  description = "Tool for downloading course-related files from ILIAS";

  inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
+    nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11";
  };

  outputs = { self, nixpkgs }:
--- a/pyproject.toml
+++ b/pyproject.toml
@ -12,7 +12,7 @@ dependencies = [
  "certifi>=2021.10.8"
 ]
 dynamic = ["version"]
-requires-python = ">=3.9"
+requires-python = ">=3.11"

 [project.scripts]
 pferd = "PFERD.__main__:main"
Author	SHA1	Message	Date
I-Al-Istannen	7291382430	Bump version to 3.8.0	2025-04-15 11:32:22 +02:00
I-Al-Istannen	1a430ad5d1	Update minimum Python version to 3.11	2025-04-15 11:31:39 +02:00
I-Al-Istannen	f6bdeb6b9d	Support ILIAS 9	2025-04-15 11:19:53 +02:00
I-Al-Istannen	63f25277b0	Fix crawling of empty forum threads	2025-03-09 23:44:25 +01:00
I-Al-Istannen	c8eff04ae0	Make thread titles link to original ILIAS thread	2025-02-19 16:23:20 +01:00
I-Al-Istannen	edc482cdf4	Internalize images in forum threads	2025-02-19 16:23:20 +01:00
I-Al-Istannen	72cd0f77e2	Prettify forum thread exports. Co-authored-by: Tim <me@scriptim.dev>	2025-02-19 16:23:20 +01:00
I-Al-Istannen	be175f9347	Download only new/updated forum threads	2025-02-19 16:16:37 +01:00
I-Al-Istannen	ba2833dba5	Crawl all threads in a forum. Before this patch the row count was unconditionally changed to 800. This patch tries to detect how many rows the forum has and then fetches this amount, if it is larger than 800.	2025-02-19 12:19:33 +01:00
I-Al-Istannen	2f0e792670	Increase default http timeout to 30. Otherwise larger forums will fail to download in time.	2025-02-19 12:19:13 +01:00
I-Al-Istannen	5f88539f7e	Fix page size increase for forum threads	2025-02-19 12:19:11 +01:00
I-Al-Istannen	bd9d7efe64	"Fix" mypy errors. Thank you mypy, very cool. These types make things so much better. They don't just complicate everything and don't really help because they can not detect that an element queried by a tag is no navigable string...	2025-02-19 12:15:41 +01:00
Aurelia	16a2dd5b15	fix: totp	2025-02-19 12:15:41 +01:00
I-Al-Istannen	678283d341	Use Python facilities to convert paths to file:// urls	2024-11-15 00:09:11 +01:00