Fix ruff errors

I-Al-Istannen
2025-10-19 15:25:40 +02:00
parent 2cf0e060ed
commit 6e563134b2
26 changed files with 194 additions and 209 deletions

View File

@@ -1,5 +1,6 @@
import asyncio
from typing import Any, Callable, Optional
from collections.abc import Callable
from typing import Any, Optional
import aiohttp
@@ -15,9 +16,9 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
try:
return await f(*args, **kwargs)
except aiohttp.ContentTypeError: # invalid content type
raise CrawlWarning("ILIAS returned an invalid content type")
raise CrawlWarning("ILIAS returned an invalid content type") from None
except aiohttp.TooManyRedirects:
raise CrawlWarning("Got stuck in a redirect loop")
raise CrawlWarning("Got stuck in a redirect loop") from None
except aiohttp.ClientPayloadError as e: # encoding or not enough bytes
last_exception = e
except aiohttp.ClientConnectionError as e: # e.g. timeout, disconnect, resolve failed, etc.
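
The hunks above are ruff's B904 fix: re-raising a new exception inside an "except" block without an explicit cause keeps the original exception attached as __context__, so users see a second "During handling of the above exception ..." traceback. Writing "raise ... from None" suppresses that chaining. A minimal sketch of the pattern, using a hypothetical function rather than code from this commit:

    def parse_port(raw: str) -> int:
        try:
            return int(raw)
        except ValueError:
            # "from None" drops the implicit __context__, so only this
            # high-level error is shown, not the underlying ValueError.
            raise RuntimeError(f"invalid port: {raw!r}") from None

Using "raise ... from err" instead would keep the original exception attached as the explicit cause where that context is still useful.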

View File

@@ -297,9 +297,7 @@ class Links(Enum):
raise ValueError("Missing switch case")
def collection_as_one(self) -> bool:
if self == Links.FANCY:
return True
return False
return self == Links.FANCY
def extension(self) -> Optional[str]:
if self == Links.FANCY:
@@ -355,4 +353,4 @@ class Links(Enum):
return Links(string)
except ValueError:
options = [f"'{option.value}'" for option in Links]
raise ValueError(f"must be one of {', '.join(options)}")
raise ValueError(f"must be one of {', '.join(options)}") from None

View File

@@ -4,7 +4,7 @@ import os
import re
from collections.abc import Awaitable, Coroutine
from pathlib import PurePath
from typing import Any, Dict, List, Literal, Optional, Set, Union, cast
from typing import Any, Literal, Optional, cast
from urllib.parse import urljoin
import aiohttp
@@ -33,7 +33,7 @@ from .kit_ilias_html import (
)
from .shibboleth_login import ShibbolethLogin
TargetType = Union[str, int]
TargetType = str | int
class LoginTypeLocal:
@@ -49,7 +49,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
return base_url
def login(self) -> Union[Literal["shibboleth"], LoginTypeLocal]:
def login(self) -> Literal["shibboleth"] | LoginTypeLocal:
login_type = self.s.get("login_type")
if not login_type:
self.missing_value("login_type")
@@ -63,7 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
self.invalid_value("login_type", login_type, "Should be <shibboleth | local>")
def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
def tfa_auth(self, authenticators: dict[str, Authenticator]) -> Optional[Authenticator]:
value: Optional[str] = self.s.get("tfa_auth")
if value is None:
return None
@@ -110,7 +110,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
return self.s.getboolean("forums", fallback=False)
_DIRECTORY_PAGES: Set[IliasElementType] = {
_DIRECTORY_PAGES: set[IliasElementType] = {
IliasElementType.EXERCISE,
IliasElementType.EXERCISE_FILES,
IliasElementType.EXERCISE_OVERVIEW,
@@ -122,7 +122,7 @@ _DIRECTORY_PAGES: Set[IliasElementType] = {
IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED,
}
_VIDEO_ELEMENTS: Set[IliasElementType] = {
_VIDEO_ELEMENTS: set[IliasElementType] = {
IliasElementType.MEDIACAST_VIDEO,
IliasElementType.MEDIACAST_VIDEO_FOLDER,
IliasElementType.OPENCAST_VIDEO,
@@ -172,7 +172,7 @@ class IliasWebCrawler(HttpCrawler):
name: str,
section: IliasWebCrawlerSection,
config: Config,
authenticators: Dict[str, Authenticator],
authenticators: dict[str, Authenticator],
):
# Setting a main authenticator for cookie sharing
auth = section.auth(authenticators)
@@ -201,7 +201,7 @@ instance's greatest bottleneck.
self._links = section.links()
self._videos = section.videos()
self._forums = section.forums()
self._visited_urls: Dict[str, PurePath] = dict()
self._visited_urls: dict[str, PurePath] = dict()
async def _run(self) -> None:
if isinstance(self._target, int):
@@ -264,9 +264,9 @@ instance's greatest bottleneck.
expected_course_id: Optional[int] = None,
crawl_nested_courses: bool = False,
) -> None:
elements: List[IliasPageElement] = []
elements: list[IliasPageElement] = []
# A list as variable redefinitions are not propagated to outer scopes
description: List[BeautifulSoup] = []
description: list[BeautifulSoup] = []
@_iorepeat(3, "crawling folder")
async def gather_elements() -> None:
@@ -309,7 +309,7 @@ instance's greatest bottleneck.
elements.sort(key=lambda e: e.id())
tasks: List[Awaitable[None]] = []
tasks: list[Awaitable[None]] = []
for element in elements:
if handle := await self._handle_ilias_element(cl.path, element, crawl_nested_courses):
tasks.append(asyncio.create_task(handle))
@@ -340,15 +340,14 @@ instance's greatest bottleneck.
)
return None
if element.type in _VIDEO_ELEMENTS:
if not self._videos:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](enable with option 'videos')",
)
return None
if element.type in _VIDEO_ELEMENTS and not self._videos:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](enable with option 'videos')",
)
return None
if element.type == IliasElementType.FILE:
return await self._handle_file(element, element_path)
@@ -522,8 +521,8 @@ instance's greatest bottleneck.
sink.file.write(rendered.encode("utf-8"))
sink.done()
async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal["none"]]:
async def impl() -> Optional[Union[BeautifulSoup, Literal["none"]]]:
async def _resolve_link_target(self, export_url: str) -> BeautifulSoup | Literal["none"]:
async def impl() -> Optional[BeautifulSoup | Literal["none"]]:
async with self.session.get(export_url, allow_redirects=False) as resp:
# No redirect means we were authenticated
if hdrs.LOCATION not in resp.headers:
@@ -658,7 +657,7 @@ instance's greatest bottleneck.
def _previous_contained_opencast_videos(
self, element: IliasPageElement, element_path: PurePath
) -> List[PurePath]:
) -> list[PurePath]:
if not self.prev_report:
return []
custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element))
@@ -714,7 +713,7 @@ instance's greatest bottleneck.
add_to_report([str(self._transformer.transform(dl.path))])
return
contained_video_paths: List[str] = []
contained_video_paths: list[str] = []
for stream_element in stream_elements:
video_path = dl.path.parent / stream_element.name
@@ -832,7 +831,7 @@ instance's greatest bottleneck.
elements = parse_ilias_forum_export(soupify(export))
tasks: List[Awaitable[None]] = []
tasks: list[Awaitable[None]] = []
for thread in elements:
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, element.url)))
@@ -842,7 +841,7 @@ instance's greatest bottleneck.
@anoncritical
@_iorepeat(3, "saving forum thread")
async def _download_forum_thread(
self, parent_path: PurePath, thread: Union[IliasForumThread, IliasPageElement], forum_url: str
self, parent_path: PurePath, thread: IliasForumThread | IliasPageElement, forum_url: str
) -> None:
path = parent_path / (_sanitize_path_name(thread.name) + ".html")
maybe_dl = await self.download(path, mtime=thread.mtime)
@@ -871,7 +870,7 @@ instance's greatest bottleneck.
@_iorepeat(3, "crawling learning module")
@anoncritical
async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None:
elements: List[IliasLearningModulePage] = []
elements: list[IliasLearningModulePage] = []
async with cl:
log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
@@ -891,7 +890,7 @@ instance's greatest bottleneck.
for index, lm_element in enumerate(elements):
lm_element.title = f"{index:02}_{lm_element.title}"
tasks: List[Awaitable[None]] = []
tasks: list[Awaitable[None]] = []
for index, elem in enumerate(elements):
prev_url = elements[index - 1].title if index > 0 else None
next_url = elements[index + 1].title if index < len(elements) - 1 else None
@@ -906,10 +905,10 @@ instance's greatest bottleneck.
self,
path: PurePath,
start_url: Optional[str],
dir: Union[Literal["left"], Literal["right"]],
dir: Literal["left"] | Literal["right"],
parent_element: IliasPageElement,
) -> List[IliasLearningModulePage]:
elements: List[IliasLearningModulePage] = []
) -> list[IliasLearningModulePage]:
elements: list[IliasLearningModulePage] = []
if not start_url:
return elements
@@ -923,10 +922,7 @@ instance's greatest bottleneck.
page = IliasPage(soup, parent_element)
if next := page.get_learning_module_data():
elements.append(next)
if dir == "left":
next_element_url = next.previous_url
else:
next_element_url = next.next_url
next_element_url = next.previous_url if dir == "left" else next.next_url
counter += 1
return elements
@@ -950,16 +946,10 @@ instance's greatest bottleneck.
if prev:
prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
if prev_p:
prev = cast(str, os.path.relpath(prev_p, my_path.parent))
else:
prev = None
prev = cast(str, os.path.relpath(prev_p, my_path.parent)) if prev_p else None
if next:
next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
if next_p:
next = cast(str, os.path.relpath(next_p, my_path.parent))
else:
next = None
next = cast(str, os.path.relpath(next_p, my_path.parent)) if next_p else None
async with maybe_dl as (bar, sink):
content = element.content
@@ -973,14 +963,13 @@ instance's greatest bottleneck.
"""
log.explain_topic("Internalizing images")
for elem in tag.find_all(recursive=True):
if elem.name == "img":
if src := elem.attrs.get("src", None):
url = urljoin(self._base_url, cast(str, src))
if not url.startswith(self._base_url):
continue
log.explain(f"Internalizing {url!r}")
img = await self._get_authenticated(url)
elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
if elem.name == "img" and (src := elem.attrs.get("src", None)):
url = urljoin(self._base_url, cast(str, src))
if not url.startswith(self._base_url):
continue
log.explain(f"Internalizing {url!r}")
img = await self._get_authenticated(url)
elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
if elem.name == "iframe" and cast(str, elem.attrs.get("src", "")).startswith("//"):
# For unknown reasons the protocol seems to be stripped.
elem.attrs["src"] = "https:" + cast(str, elem.attrs["src"])
@@ -1025,7 +1014,7 @@ instance's greatest bottleneck.
)
return soup
async def _post(self, url: str, data: dict[str, Union[str, List[str]]]) -> bytes:
async def _post(self, url: str, data: dict[str, str | list[str]]) -> bytes:
form_data = aiohttp.FormData()
for key, val in data.items():
form_data.add_field(key, val)
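
Most of the crawler hunks are mechanical typing and control-flow modernizations: typing.List/Dict/Set and Union[...] give way to builtin generics and PEP 604 "|" unions (ruff UP006/UP007), simple if/else assignments become conditional expressions (SIM108), and nested if statements are merged (SIM102). A minimal sketch of the resulting style, assuming Python 3.10+ and using hypothetical names rather than code from this commit:

    def pick_label(labels: dict[int, str], target: str | int) -> str | None:
        # "str | int" replaces Union[str, int]; dict[...] replaces typing.Dict[...].
        key = target if isinstance(target, int) else len(target)  # SIM108 conditional expression
        value = labels.get(key)
        # One merged condition instead of two nested ifs (SIM102).
        if value is not None and value.strip():
            return value.strip()
        return None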

View File

@@ -1,9 +1,10 @@
import json
import re
from collections.abc import Callable
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from enum import Enum
from typing import Callable, Dict, Optional, Union, cast
from typing import Optional, cast
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup, Tag
@@ -13,7 +14,7 @@ from PFERD.crawl.crawler import CrawlWarning
from PFERD.logging import log
from PFERD.utils import url_set_query_params
TargetType = Union[str, int]
TargetType = str | int
class TypeMatcher:
@@ -308,7 +309,7 @@ class IliasPageElement:
"""
# This checks whether we can reach a `:` without passing a `-`
if re.search(r"^[^-]+: ", meeting_name):
if re.search(r"^[^-]+: ", meeting_name): # noqa: SIM108
# Meeting name only contains date: "05. Jan 2000:"
split_delimiter = ":"
else:
@@ -331,7 +332,7 @@ class IliasPageElement:
@dataclass
class IliasDownloadForumData:
url: str
form_data: Dict[str, Union[str, list[str]]]
form_data: dict[str, str | list[str]]
empty: bool
@@ -433,21 +434,20 @@ class IliasPage:
for p in paragraphs:
if p.find_parent(class_=is_interesting_class):
continue
if "ilc_media_cont_MediaContainer" in p["class"]:
if "ilc_media_cont_MediaContainer" in p["class"] and (video := p.select_one("video")):
# We have an embedded video which should be downloaded by _find_mob_videos
if video := p.select_one("video"):
url, title = self._find_mob_video_url_title(video, p)
raw_html += '<div style="min-width: 100px; min-height: 100px; border: 1px solid black;'
raw_html += "display: flex; justify-content: center; align-items: center;"
raw_html += ' margin: 0.5rem;">'
if url is not None and urlparse(url).hostname != urlparse(self._page_url).hostname:
if url.startswith("//"):
url = "https:" + url
raw_html += f'<a href="{url}" target="_blank">External Video: {title}</a>'
else:
raw_html += f"Video elided. Filename: '{title}'."
raw_html += "</div>\n"
continue
url, title = self._find_mob_video_url_title(video, p)
raw_html += '<div style="min-width: 100px; min-height: 100px; border: 1px solid black;'
raw_html += "display: flex; justify-content: center; align-items: center;"
raw_html += ' margin: 0.5rem;">'
if url is not None and urlparse(url).hostname != urlparse(self._page_url).hostname:
if url.startswith("//"):
url = "https:" + url
raw_html += f'<a href="{url}" target="_blank">External Video: {title}</a>'
else:
raw_html += f"Video elided. Filename: '{title}'."
raw_html += "</div>\n"
continue
# Ignore special listings (like folder groupings)
if "ilc_section_Special" in p["class"]:
@@ -794,7 +794,7 @@ class IliasPage:
is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None
if is_paginated and not self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER:
if is_paginated and self._page_type != IliasElementType.OPENCAST_VIDEO_FOLDER:
# We are in stage 2 - try to break pagination
return self._find_opencast_video_entries_paginated()
@@ -1164,6 +1164,9 @@ class IliasPage:
"""
found_titles = []
if None == "hey":
pass
outer_accordion_content: Optional[Tag] = None
parents: list[Tag] = list(tag.parents)
@@ -1302,10 +1305,7 @@ class IliasPage:
),
)
caption_container = caption_parent.find_next_sibling("div")
if caption_container:
description = caption_container.get_text().strip()
else:
description = None
description = caption_container.get_text().strip() if caption_container else None
if not typ:
_unexpected_html_warning()
@@ -1444,9 +1444,7 @@ class IliasPage:
return True
# The individual video player wrapper page has nothing of the above.
# Match it by its playerContainer.
if soup.select_one("#playerContainer") is not None:
return True
return False
return soup.select_one("#playerContainer") is not None
@staticmethod
def _find_date_in_text(text: str) -> Optional[datetime]:
@@ -1505,11 +1503,11 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti
# Normalize whitespace because users
date_str = re.sub(r"\s+", " ", date_str)
date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I)
date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I)
date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, flags=re.I)
date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, flags=re.I)
date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, flags=re.I)
date_str = date_str.strip()
for german, english in zip(german_months, english_months):
for german, english in zip(german_months, english_months, strict=True):
date_str = date_str.replace(german, english)
# Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020"
date_str = date_str.replace(english + ".", english)
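
Two of the changes in this file are behavioral rather than cosmetic. re.sub's fourth positional parameter is count, not flags, so passing re.I positionally left the matching case-sensitive and capped substitutions at 2; the keyword form fixes that. And zip(..., strict=True) (ruff B905) raises instead of silently truncating if the German and English month lists ever differ in length. A small sketch with illustrative strings, not code from this commit (Python 3.10+ for strict=):

    import re

    text = "heute oder HEUTE"
    broken = re.sub("Heute", "Today", text, re.I)       # re.I (value 2) is taken as count=2, flags stay 0
    fixed = re.sub("Heute", "Today", text, flags=re.I)  # case-insensitive, as intended
    assert broken == "heute oder HEUTE"                 # pattern never matched case-sensitively
    assert fixed == "Today oder Today"

    # strict=True turns a silent length mismatch into a ValueError.
    months = list(zip(["Januar", "Februar"], ["January", "February"], strict=True))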

View File

@@ -1,4 +1,4 @@
from typing import Dict, Literal
from typing import Literal
from ...auth import Authenticator
from ...config import Config
@@ -26,7 +26,7 @@ class KitIliasWebCrawler(IliasWebCrawler):
name: str,
section: KitIliasWebCrawlerSection,
config: Config,
authenticators: Dict[str, Authenticator],
authenticators: dict[str, Authenticator],
):
super().__init__(name, section, config, authenticators)