Fix ruff errors

I-Al-Istannen
2025-10-19 15:25:40 +02:00
parent 2cf0e060ed
commit 6e563134b2
26 changed files with 194 additions and 209 deletions

View File

@@ -1,5 +1,6 @@
import asyncio
from typing import Any, Callable, Optional
from collections.abc import Callable
from typing import Any, Optional
import aiohttp
@@ -15,9 +16,9 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
try:
return await f(*args, **kwargs)
except aiohttp.ContentTypeError: # invalid content type
raise CrawlWarning("ILIAS returned an invalid content type")
raise CrawlWarning("ILIAS returned an invalid content type") from None
except aiohttp.TooManyRedirects:
raise CrawlWarning("Got stuck in a redirect loop")
raise CrawlWarning("Got stuck in a redirect loop") from None
except aiohttp.ClientPayloadError as e: # encoding or not enough bytes
last_exception = e
except aiohttp.ClientConnectionError as e: # e.g. timeout, disconnect, resolve failed, etc.
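
The hunks above are ruff's B904 fix: re-raising a new exception inside an "except" block without an explicit cause keeps the original exception attached as __context__, so users see a second "During handling of the above exception ..." traceback. Writing "raise ... from None" suppresses that chaining. A minimal sketch of the pattern, using a hypothetical function rather than code from this commit:

    def parse_port(raw: str) -> int:
        try:
            return int(raw)
        except ValueError:
            # "from None" drops the implicit __context__, so only this
            # high-level error is shown, not the underlying ValueError.
            raise RuntimeError(f"invalid port: {raw!r}") from None

Using "raise ... from err" instead would keep the original exception attached as the explicit cause where that context is still useful.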

View File

@@ -297,9 +297,7 @@ class Links(Enum):
raise ValueError("Missing switch case")
def collection_as_one(self) -> bool:
if self == Links.FANCY:
return True
return False
return self == Links.FANCY
def extension(self) -> Optional[str]:
if self == Links.FANCY:
@@ -355,4 +353,4 @@ class Links(Enum):
return Links(string)
except ValueError:
options = [f"'{option.value}'" for option in Links]
raise ValueError(f"must be one of {', '.join(options)}")
raise ValueError(f"must be one of {', '.join(options)}") from None

View File

@@ -4,7 +4,7 @@ import os
import re
from collections.abc import Awaitable, Coroutine
from pathlib import PurePath
from typing import Any, Dict, List, Literal, Optional, Set, Union, cast
from typing import Any, Literal, Optional, cast
from urllib.parse import urljoin
import aiohttp
@@ -33,7 +33,7 @@ from .kit_ilias_html import (
)
from .shibboleth_login import ShibbolethLogin
TargetType = Union[str, int]
TargetType = str | int
class LoginTypeLocal:
@@ -49,7 +49,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
return base_url
def login(self) -> Union[Literal["shibboleth"], LoginTypeLocal]:
def login(self) -> Literal["shibboleth"] | LoginTypeLocal:
login_type = self.s.get("login_type")
if not login_type:
self.missing_value("login_type")
@@ -63,7 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
self.invalid_value("login_type", login_type, "Should be <shibboleth | local>")
def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
def tfa_auth(self, authenticators: dict[str, Authenticator]) -> Optional[Authenticator]:
value: Optional[str] = self.s.get("tfa_auth")
if value is None:
return None
@@ -110,7 +110,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
return self.s.getboolean("forums", fallback=False)
_DIRECTORY_PAGES: Set[IliasElementType] = {
_DIRECTORY_PAGES: set[IliasElementType] = {
IliasElementType.EXERCISE,
IliasElementType.EXERCISE_FILES,
IliasElementType.EXERCISE_OVERVIEW,
@@ -122,7 +122,7 @@ _DIRECTORY_PAGES: Set[IliasElementType] = {
IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED,
}
_VIDEO_ELEMENTS: Set[IliasElementType] = {
_VIDEO_ELEMENTS: set[IliasElementType] = {
IliasElementType.MEDIACAST_VIDEO,
IliasElementType.MEDIACAST_VIDEO_FOLDER,
IliasElementType.OPENCAST_VIDEO,
@@ -172,7 +172,7 @@ class IliasWebCrawler(HttpCrawler):
name: str,
section: IliasWebCrawlerSection,
config: Config,
authenticators: Dict[str, Authenticator],
authenticators: dict[str, Authenticator],
):
# Setting a main authenticator for cookie sharing
auth = section.auth(authenticators)
@@ -201,7 +201,7 @@ instance's greatest bottleneck.
self._links = section.links()
self._videos = section.videos()
self._forums = section.forums()
self._visited_urls: Dict[str, PurePath] = dict()
self._visited_urls: dict[str, PurePath] = dict()
async def _run(self) -> None:
if isinstance(self._target, int):
@@ -264,9 +264,9 @@ instance's greatest bottleneck.
expected_course_id: Optional[int] = None,
crawl_nested_courses: bool = False,
) -> None:
elements: List[IliasPageElement] = []
elements: list[IliasPageElement] = []
# A list as variable redefinitions are not propagated to outer scopes
description: List[BeautifulSoup] = []
description: list[BeautifulSoup] = []
@_iorepeat(3, "crawling folder")
async def gather_elements() -> None:
@@ -309,7 +309,7 @@ instance's greatest bottleneck.
elements.sort(key=lambda e: e.id())
tasks: List[Awaitable[None]] = []
tasks: list[Awaitable[None]] = []
for element in elements:
if handle := await self._handle_ilias_element(cl.path, element, crawl_nested_courses):
tasks.append(asyncio.create_task(handle))
@@ -340,15 +340,14 @@ instance's greatest bottleneck.
)
return None
if element.type in _VIDEO_ELEMENTS:
if not self._videos:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](enable with option 'videos')",
)
return None
if element.type in _VIDEO_ELEMENTS and not self._videos:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](enable with option 'videos')",
)
return None
if element.type == IliasElementType.FILE:
return await self._handle_file(element, element_path)
@@ -522,8 +521,8 @@ instance's greatest bottleneck.
sink.file.write(rendered.encode("utf-8"))
sink.done()
async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal["none"]]:
async def impl() -> Optional[Union[BeautifulSoup, Literal["none"]]]:
async def _resolve_link_target(self, export_url: str) -> BeautifulSoup | Literal["none"]:
async def impl() -> Optional[BeautifulSoup | Literal["none"]]:
async with self.session.get(export_url, allow_redirects=False) as resp:
# No redirect means we were authenticated
if hdrs.LOCATION not in resp.headers:
@@ -658,7 +657,7 @@ instance's greatest bottleneck.
def _previous_contained_opencast_videos(
self, element: IliasPageElement, element_path: PurePath
) -> List[PurePath]:
) -> list[PurePath]:
if not self.prev_report:
return []
custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element))
@@ -714,7 +713,7 @@ instance's greatest bottleneck.
add_to_report([str(self._transformer.transform(dl.path))])
return
contained_video_paths: List[str] = []
contained_video_paths: list[str] = []
for stream_element in stream_elements:
video_path = dl.path.parent / stream_element.name
@@ -832,7 +831,7 @@ instance's greatest bottleneck.
elements = parse_ilias_forum_export(soupify(export))
tasks: List[Awaitable[None]] = []
tasks: list[Awaitable[None]] = []
for thread in elements:
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, element.url)))
@@ -842,7 +841,7 @@ instance's greatest bottleneck.
@anoncritical
@_iorepeat(3, "saving forum thread")
async def _download_forum_thread(
self, parent_path: PurePath, thread: Union[IliasForumThread, IliasPageElement], forum_url: str
self, parent_path: PurePath, thread: IliasForumThread | IliasPageElement, forum_url: str
) -> None:
path = parent_path / (_sanitize_path_name(thread.name) + ".html")
maybe_dl = await self.download(path, mtime=thread.mtime)
@@ -871,7 +870,7 @@ instance's greatest bottleneck.
@_iorepeat(3, "crawling learning module")
@anoncritical
async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None:
elements: List[IliasLearningModulePage] = []
elements: list[IliasLearningModulePage] = []
async with cl:
log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
@@ -891,7 +890,7 @@ instance's greatest bottleneck.
for index, lm_element in enumerate(elements):
lm_element.title = f"{index:02}_{lm_element.title}"
tasks: List[Awaitable[None]] = []
tasks: list[Awaitable[None]] = []
for index, elem in enumerate(elements):
prev_url = elements[index - 1].title if index > 0 else None
next_url = elements[index + 1].title if index < len(elements) - 1 else None
@@ -906,10 +905,10 @@ instance's greatest bottleneck.
self,
path: PurePath,
start_url: Optional[str],
dir: Union[Literal["left"], Literal["right"]],
dir: Literal["left"] | Literal["right"],
parent_element: IliasPageElement,
) -> List[IliasLearningModulePage]:
elements: List[IliasLearningModulePage] = []
) -> list[IliasLearningModulePage]:
elements: list[IliasLearningModulePage] = []
if not start_url:
return elements
@@ -923,10 +922,7 @@ instance's greatest bottleneck.
page = IliasPage(soup, parent_element)
if next := page.get_learning_module_data():
elements.append(next)
if dir == "left":
next_element_url = next.previous_url
else:
next_element_url = next.next_url
next_element_url = next.previous_url if dir == "left" else next.next_url
counter += 1
return elements
@@ -950,16 +946,10 @@ instance's greatest bottleneck.
if prev:
prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
if prev_p:
prev = cast(str, os.path.relpath(prev_p, my_path.parent))
else:
prev = None
prev = cast(str, os.path.relpath(prev_p, my_path.parent)) if prev_p else None
if next:
next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
if next_p:
next = cast(str, os.path.relpath(next_p, my_path.parent))
else:
next = None
next = cast(str, os.path.relpath(next_p, my_path.parent)) if next_p else None
async with maybe_dl as (bar, sink):
content = element.content
@@ -973,14 +963,13 @@ instance's greatest bottleneck.
"""
log.explain_topic("Internalizing images")
for elem in tag.find_all(recursive=True):
if elem.name == "img":
if src := elem.attrs.get("src", None):
url = urljoin(self._base_url, cast(str, src))
if not url.startswith(self._base_url):
continue
log.explain(f"Internalizing {url!r}")
img = await self._get_authenticated(url)
elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
if elem.name == "img" and (src := elem.attrs.get("src", None)):
url = urljoin(self._base_url, cast(str, src))
if not url.startswith(self._base_url):
continue
log.explain(f"Internalizing {url!r}")
img = await self._get_authenticated(url)
elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
if elem.name == "iframe" and cast(str, elem.attrs.get("src", "")).startswith("//"):
# For unknown reasons the protocol seems to be stripped.
elem.attrs["src"] = "https:" + cast(str, elem.attrs["src"])
@@ -1025,7 +1014,7 @@ instance's greatest bottleneck.
)
return soup
async def _post(self, url: str, data: dict[str, Union[str, List[str]]]) -> bytes:
async def _post(self, url: str, data: dict[str, str | list[str]]) -> bytes:
form_data = aiohttp.FormData()
for key, val in data.items():
form_data.add_field(key, val)
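
Most of the crawler hunks are mechanical typing and control-flow modernizations: typing.List/Dict/Set and Union[...] give way to builtin generics and PEP 604 "|" unions (ruff UP006/UP007), simple if/else assignments become conditional expressions (SIM108), and nested if statements are merged (SIM102). A minimal sketch of the resulting style, assuming Python 3.10+ and using hypothetical names rather than code from this commit:

    def pick_label(labels: dict[int, str], target: str | int) -> str | None:
        # "str | int" replaces Union[str, int]; dict[...] replaces typing.Dict[...].
        key = target if isinstance(target, int) else len(target)  # SIM108 conditional expression
        value = labels.get(key)
        # One merged condition instead of two nested ifs (SIM102).
        if value is not None and value.strip():
            return value.strip()
        return None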

View File

@@ -1,9 +1,10 @@
import json
import re
from collections.abc import Callable
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from enum import Enum
from typing import Callable, Dict, Optional, Union, cast
from typing import Optional, cast
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup, Tag
@@ -13,7 +14,7 @@ from PFERD.crawl.crawler import CrawlWarning
from PFERD.logging import log
from PFERD.utils import url_set_query_params
TargetType = Union[str, int]
TargetType = str | int
class TypeMatcher:
@@ -308,7 +309,7 @@ class IliasPageElement:
"""
# This checks whether we can reach a `:` without passing a `-`
if re.search(r"^[^-]+: ", meeting_name):
if re.search(r"^[^-]+: ", meeting_name): # noqa: SIM108
# Meeting name only contains date: "05. Jan 2000:"
split_delimiter = ":"
else:
@@ -331,7 +332,7 @@ class IliasPageElement:
@dataclass
class IliasDownloadForumData:
url: str
form_data: Dict[str, Union[str, list[str]]]
form_data: dict[str, str | list[str]]
empty: bool
@@ -433,21 +434,20 @@ class IliasPage:
for p in paragraphs:
if p.find_parent(class_=is_interesting_class):
continue
if "ilc_media_cont_MediaContainer" in p["class"]:
if "ilc_media_cont_MediaContainer" in p["class"] and (video := p.select_one("video")):
# We have an embedded video which should be downloaded by _find_mob_videos
if video := p.select_one("video"):
url, title = self._find_mob_video_url_title(video, p)
raw_html += '<div style="min-width: 100px; min-height: 100px; border: 1px solid black;'
raw_html += "display: flex; justify-content: center; align-items: center;"
raw_html += ' margin: 0.5rem;">'
if url is not None and urlparse(url).hostname != urlparse(self._page_url).hostname:
if url.startswith("//"):
url = "https:" + url
raw_html += f'<a href="{url}" target="_blank">External Video: {title}</a>'
else:
raw_html += f"Video elided. Filename: '{title}'."
raw_html += "</div>\n"
continue
url, title = self._find_mob_video_url_title(video, p)
raw_html += '<div style="min-width: 100px; min-height: 100px; border: 1px solid black;'
raw_html += "display: flex; justify-content: center; align-items: center;"
raw_html += ' margin: 0.5rem;">'
if url is not None and urlparse(url).hostname != urlparse(self._page_url).hostname:
if url.startswith("//"):
url = "https:" + url
raw_html += f'<a href="{url}" target="_blank">External Video: {title}</a>'
else:
raw_html += f"Video elided. Filename: '{title}'."
raw_html += "</div>\n"
continue
# Ignore special listings (like folder groupings)
if "ilc_section_Special" in p["class"]:
@@ -794,7 +794,7 @@ class IliasPage:
is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None
if is_paginated and not self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER:
if is_paginated and self._page_type != IliasElementType.OPENCAST_VIDEO_FOLDER:
# We are in stage 2 - try to break pagination
return self._find_opencast_video_entries_paginated()
@@ -1164,6 +1164,9 @@ class IliasPage:
"""
found_titles = []
if None == "hey":
pass
outer_accordion_content: Optional[Tag] = None
parents: list[Tag] = list(tag.parents)
@@ -1302,10 +1305,7 @@ class IliasPage:
),
)
caption_container = caption_parent.find_next_sibling("div")
if caption_container:
description = caption_container.get_text().strip()
else:
description = None
description = caption_container.get_text().strip() if caption_container else None
if not typ:
_unexpected_html_warning()
@@ -1444,9 +1444,7 @@ class IliasPage:
return True
# The individual video player wrapper page has nothing of the above.
# Match it by its playerContainer.
if soup.select_one("#playerContainer") is not None:
return True
return False
return soup.select_one("#playerContainer") is not None
@staticmethod
def _find_date_in_text(text: str) -> Optional[datetime]:
@@ -1505,11 +1503,11 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti
# Normalize whitespace because users
date_str = re.sub(r"\s+", " ", date_str)
date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I)
date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I)
date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, flags=re.I)
date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, flags=re.I)
date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, flags=re.I)
date_str = date_str.strip()
for german, english in zip(german_months, english_months):
for german, english in zip(german_months, english_months, strict=True):
date_str = date_str.replace(german, english)
# Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020"
date_str = date_str.replace(english + ".", english)
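
Two of the changes in this file are behavioral rather than cosmetic. re.sub's fourth positional parameter is count, not flags, so passing re.I positionally left the matching case-sensitive and capped substitutions at 2; the keyword form fixes that. And zip(..., strict=True) (ruff B905) raises instead of silently truncating if the German and English month lists ever differ in length. A small sketch with illustrative strings, not code from this commit (Python 3.10+ for strict=):

    import re

    text = "heute oder HEUTE"
    broken = re.sub("Heute", "Today", text, re.I)       # re.I (value 2) is taken as count=2, flags stay 0
    fixed = re.sub("Heute", "Today", text, flags=re.I)  # case-insensitive, as intended
    assert broken == "heute oder HEUTE"                 # pattern never matched case-sensitively
    assert fixed == "Today oder Today"

    # strict=True turns a silent length mismatch into a ValueError.
    months = list(zip(["Januar", "Februar"], ["January", "February"], strict=True))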

View File

@@ -1,4 +1,4 @@
from typing import Dict, Literal
from typing import Literal
from ...auth import Authenticator
from ...config import Config
@@ -26,7 +26,7 @@ class KitIliasWebCrawler(IliasWebCrawler):
name: str,
section: KitIliasWebCrawlerSection,
config: Config,
authenticators: Dict[str, Authenticator],
authenticators: dict[str, Authenticator],
):
super().__init__(name, section, config, authenticators)