Reformat and switch to ruff

I-Al-Istannen
2025-10-19 15:19:43 +02:00
parent ee4625be78
commit 2cf0e060ed
31 changed files with 1507 additions and 587 deletions
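The hunks shown below for this file are mechanical reformatting as produced by ruff's formatter: argument lists and signatures that fit on one line are collapsed (and those that stay multi-line gain a trailing comma), a trailing "magic" comma keeps a bracketed block exploded one item per line, and string quotes are normalized to double quotes. A minimal, self-contained Python sketch of the first two rules follows; the function and constant here are hypothetical illustrations, and the roughly 110-character line length is inferred from the diff rather than read from the ruff configuration, which is in other files of this commit and not shown here.

from typing import Dict, Optional


def pick_authenticator(available: Dict[str, str], preferred: Optional[str] = None) -> Optional[str]:
    # No trailing comma and the signature fits the assumed ~110 column limit,
    # so ruff keeps it on a single line instead of one parameter per line.
    if preferred is not None:
        return available.get(preferred)
    return None


# The trailing ("magic") comma after the last entry keeps the list exploded,
# one item per line, just like the kit_ilias_html import in the first hunk.
KNOWN_LOGIN_TYPES = [
    "shibboleth",
    "local",
]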


@@ -21,8 +21,16 @@ from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .async_helper import _iorepeat
 from .file_templates import LinkData, Links, forum_thread_template, learning_module_template
 from .ilias_html_cleaner import clean, insert_base_markup
-from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
-                             IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
+from .kit_ilias_html import (
+    IliasElementType,
+    IliasForumThread,
+    IliasLearningModulePage,
+    IliasPage,
+    IliasPageElement,
+    IliasSoup,
+    _sanitize_path_name,
+    parse_ilias_forum_export,
+)
 from .shibboleth_login import ShibbolethLogin
 
 TargetType = Union[str, int]
@@ -55,9 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
             self.invalid_value("login_type", login_type, "Should be <shibboleth | local>")
 
-    def tfa_auth(
-        self, authenticators: Dict[str, Authenticator]
-    ) -> Optional[Authenticator]:
+    def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
         value: Optional[str] = self.s.get("tfa_auth")
         if value is None:
             return None
@@ -166,17 +172,19 @@ class IliasWebCrawler(HttpCrawler):
         name: str,
         section: IliasWebCrawlerSection,
         config: Config,
-        authenticators: Dict[str, Authenticator]
+        authenticators: Dict[str, Authenticator],
     ):
         # Setting a main authenticator for cookie sharing
         auth = section.auth(authenticators)
         super().__init__(name, section, config, shared_auth=auth)
 
         if section.tasks() > 1:
-            log.warn("""
+            log.warn(
+                """
 Please avoid using too many parallel requests as these are the KIT ILIAS
 instance's greatest bottleneck.
-            """.strip())
+            """.strip()
+            )
 
         self._auth = auth
         self._base_url = section.base_url()
@@ -210,22 +218,19 @@ instance's greatest bottleneck.
         # Start crawling at the given course
         root_url = url_set_query_param(
             urljoin(self._base_url + "/", "goto.php"),
-            "target", f"crs_{course_id}",
+            "target",
+            f"crs_{course_id}",
         )
         await self._crawl_url(root_url, expected_id=course_id)
 
     async def _crawl_desktop(self) -> None:
         await self._crawl_url(
-            urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show"),
-            crawl_nested_courses=True
+            urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show"), crawl_nested_courses=True
         )
 
     async def _crawl_url(
-        self,
-        url: str,
-        expected_id: Optional[int] = None,
-        crawl_nested_courses: bool = False
+        self, url: str, expected_id: Optional[int] = None, crawl_nested_courses: bool = False
     ) -> None:
         if awaitable := await self._handle_ilias_page(
             url, None, PurePath("."), expected_id, crawl_nested_courses
@@ -238,7 +243,7 @@ instance's greatest bottleneck.
         current_element: Optional[IliasPageElement],
         path: PurePath,
         expected_course_id: Optional[int] = None,
-        crawl_nested_courses: bool = False
+        crawl_nested_courses: bool = False,
     ) -> Optional[Coroutine[Any, Any, None]]:
         maybe_cl = await self.crawl(path)
         if not maybe_cl:
@@ -319,10 +324,7 @@ instance's greatest bottleneck.
     # works correctly.
     @anoncritical
     async def _handle_ilias_element(
-        self,
-        parent_path: PurePath,
-        element: IliasPageElement,
-        crawl_nested_courses: bool = False
+        self, parent_path: PurePath, element: IliasPageElement, crawl_nested_courses: bool = False
     ) -> Optional[Coroutine[Any, Any, None]]:
         # element.name might contain `/` if the crawler created nested elements,
         # so we can not sanitize it here. We trust in the output dir to thwart worst-case
@@ -344,7 +346,7 @@ instance's greatest bottleneck.
                     "[bold bright_black]",
                     "Ignored",
                     fmt_path(element_path),
-                    "[bright_black](enable with option 'videos')"
+                    "[bright_black](enable with option 'videos')",
                 )
                 return None
@@ -356,7 +358,7 @@ instance's greatest bottleneck.
                     "[bold bright_black]",
                     "Ignored",
                     fmt_path(element_path),
-                    "[bright_black](enable with option 'forums')"
+                    "[bright_black](enable with option 'forums')",
                 )
                 return None
             return await self._handle_forum(element, element_path)
@@ -365,7 +367,7 @@ instance's greatest bottleneck.
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](tests contain no relevant data)"
+                "[bright_black](tests contain no relevant data)",
             )
             return None
         elif element.type == IliasElementType.SURVEY:
@@ -373,7 +375,7 @@ instance's greatest bottleneck.
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](surveys contain no relevant data)"
+                "[bright_black](surveys contain no relevant data)",
             )
             return None
         elif element.type == IliasElementType.SCORM_LEARNING_MODULE:
@@ -381,7 +383,7 @@ instance's greatest bottleneck.
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](scorm learning modules are not supported)"
+                "[bright_black](scorm learning modules are not supported)",
             )
             return None
         elif element.type == IliasElementType.LITERATURE_LIST:
@@ -389,7 +391,7 @@ instance's greatest bottleneck.
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](literature lists are not currently supported)"
+                "[bright_black](literature lists are not currently supported)",
             )
             return None
         elif element.type == IliasElementType.LEARNING_MODULE_HTML:
@@ -397,7 +399,7 @@ instance's greatest bottleneck.
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](HTML learning modules are not supported)"
+                "[bright_black](HTML learning modules are not supported)",
             )
             return None
         elif element.type == IliasElementType.BLOG:
@@ -405,7 +407,7 @@ instance's greatest bottleneck.
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](blogs are not currently supported)"
+                "[bright_black](blogs are not currently supported)",
             )
             return None
         elif element.type == IliasElementType.DCL_RECORD_LIST:
@@ -413,7 +415,7 @@ instance's greatest bottleneck.
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](dcl record lists are not currently supported)"
+                "[bright_black](dcl record lists are not currently supported)",
             )
             return None
         elif element.type == IliasElementType.MEDIA_POOL:
@@ -421,7 +423,7 @@ instance's greatest bottleneck.
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](media pools are not currently supported)"
+                "[bright_black](media pools are not currently supported)",
             )
             return None
         elif element.type == IliasElementType.COURSE:
@@ -431,7 +433,7 @@ instance's greatest bottleneck.
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](not descending into linked course)"
+                "[bright_black](not descending into linked course)",
             )
             return None
         elif element.type == IliasElementType.WIKI:
@@ -439,7 +441,7 @@ instance's greatest bottleneck.
                 "[bold bright_black]",
                 "Ignored",
                 fmt_path(element_path),
-                "[bright_black](wikis are not currently supported)"
+                "[bright_black](wikis are not currently supported)",
             )
             return None
         elif element.type == IliasElementType.LEARNING_MODULE:
@@ -513,19 +515,15 @@ instance's greatest bottleneck.
     @anoncritical
     @_iorepeat(3, "resolving link")
     async def _download_link(
-        self,
-        link_renderer: Links,
-        collection_name: str,
-        links: list[LinkData],
-        dl: DownloadToken
+        self, link_renderer: Links, collection_name: str, links: list[LinkData], dl: DownloadToken
     ) -> None:
         async with dl as (bar, sink):
             rendered = link_renderer.interpolate(self._link_file_redirect_delay, collection_name, links)
             sink.file.write(rendered.encode("utf-8"))
             sink.done()
 
-    async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal['none']]:
-        async def impl() -> Optional[Union[BeautifulSoup, Literal['none']]]:
+    async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal["none"]]:
+        async def impl() -> Optional[Union[BeautifulSoup, Literal["none"]]]:
             async with self.session.get(export_url, allow_redirects=False) as resp:
                 # No redirect means we were authenticated
                 if hdrs.LOCATION not in resp.headers:
@@ -626,7 +624,7 @@ instance's greatest bottleneck.
         if self.prev_report:
             self.report.add_custom_value(
                 _get_video_cache_key(element),
-                self.prev_report.get_custom_value(_get_video_cache_key(element))
+                self.prev_report.get_custom_value(_get_video_cache_key(element)),
             )
 
         # A video might contain other videos, so let's "crawl" the video first
@@ -698,7 +696,7 @@ instance's greatest bottleneck.
         def add_to_report(paths: list[str]) -> None:
             self.report.add_custom_value(
                 _get_video_cache_key(element),
-                {"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))}
+                {"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))},
             )
 
         async with dl as (bar, sink):
@@ -752,11 +750,7 @@ instance's greatest bottleneck.
             await self._stream_from_url(element, sink, bar, is_video)
 
     async def _stream_from_url(
-        self,
-        element: IliasPageElement,
-        sink: FileSink,
-        bar: ProgressBar,
-        is_video: bool
+        self, element: IliasPageElement, sink: FileSink, bar: ProgressBar, is_video: bool
     ) -> None:
         url = element.url
@@ -831,10 +825,10 @@ instance's greatest bottleneck.
             log.warn("Could not extract forum export url")
             return
 
-        export = await self._post(export_url, {
-            "format": "html",
-            "cmd[createExportFile]": ""
-        })
+        export = await self._post(
+            export_url,
+            {"format": "html", "cmd[createExportFile]": ""},
+        )
 
         elements = parse_ilias_forum_export(soupify(export))
@@ -848,10 +842,7 @@ instance's greatest bottleneck.
     @anoncritical
     @_iorepeat(3, "saving forum thread")
     async def _download_forum_thread(
-        self,
-        parent_path: PurePath,
-        thread: Union[IliasForumThread, IliasPageElement],
-        forum_url: str
+        self, parent_path: PurePath, thread: Union[IliasForumThread, IliasPageElement], forum_url: str
     ) -> None:
         path = parent_path / (_sanitize_path_name(thread.name) + ".html")
         maybe_dl = await self.download(path, mtime=thread.mtime)
@@ -860,10 +851,7 @@ instance's greatest bottleneck.
         async with maybe_dl as (bar, sink):
             rendered = forum_thread_template(
-                thread.name,
-                forum_url,
-                thread.name_tag,
-                await self.internalize_images(thread.content_tag)
+                thread.name, forum_url, thread.name_tag, await self.internalize_images(thread.content_tag)
             )
             sink.file.write(rendered.encode("utf-8"))
             sink.done()
@@ -891,13 +879,13 @@ instance's greatest bottleneck.
             soup = await self._get_page(element.url)
             page = IliasPage(soup, element)
             if next := page.get_learning_module_data():
-                elements.extend(await self._crawl_learning_module_direction(
-                    cl.path, next.previous_url, "left", element
-                ))
+                elements.extend(
+                    await self._crawl_learning_module_direction(cl.path, next.previous_url, "left", element)
+                )
                 elements.append(next)
-                elements.extend(await self._crawl_learning_module_direction(
-                    cl.path, next.next_url, "right", element
-                ))
+                elements.extend(
+                    await self._crawl_learning_module_direction(cl.path, next.next_url, "right", element)
+                )
 
         # Reflect their natural ordering in the file names
         for index, lm_element in enumerate(elements):
@@ -907,9 +895,9 @@ instance's greatest bottleneck.
         for index, elem in enumerate(elements):
             prev_url = elements[index - 1].title if index > 0 else None
             next_url = elements[index + 1].title if index < len(elements) - 1 else None
-            tasks.append(asyncio.create_task(
-                self._download_learning_module_page(cl.path, elem, prev_url, next_url)
-            ))
+            tasks.append(
+                asyncio.create_task(self._download_learning_module_page(cl.path, elem, prev_url, next_url))
+            )
 
         # And execute them
         await self.gather(tasks)
@@ -919,7 +907,7 @@ instance's greatest bottleneck.
         path: PurePath,
         start_url: Optional[str],
         dir: Union[Literal["left"], Literal["right"]],
-        parent_element: IliasPageElement
+        parent_element: IliasPageElement,
     ) -> List[IliasLearningModulePage]:
         elements: List[IliasLearningModulePage] = []
@@ -950,7 +938,7 @@ instance's greatest bottleneck.
         parent_path: PurePath,
         element: IliasLearningModulePage,
         prev: Optional[str],
-        next: Optional[str]
+        next: Optional[str],
     ) -> None:
         path = parent_path / (_sanitize_path_name(element.title) + ".html")
         maybe_dl = await self.download(path)
@@ -1037,11 +1025,7 @@ instance's greatest bottleneck.
             )
         return soup
 
-    async def _post(
-        self,
-        url: str,
-        data: dict[str, Union[str, List[str]]]
-    ) -> bytes:
+    async def _post(self, url: str, data: dict[str, Union[str, List[str]]]) -> bytes:
         form_data = aiohttp.FormData()
         for key, val in data.items():
             form_data.add_field(key, val)
@@ -1090,8 +1074,8 @@ instance's greatest bottleneck.
         username, password = await self._auth.credentials()
         login_form_data = aiohttp.FormData()
-        login_form_data.add_field('login_form/input_3/input_4', username)
-        login_form_data.add_field('login_form/input_3/input_5', password)
+        login_form_data.add_field("login_form/input_3/input_4", username)
+        login_form_data.add_field("login_form/input_3/input_5", password)
 
         # do the actual login
         async with self.session.post(urljoin(self._base_url, login_url), data=login_form_data) as request: