mirror of
https://github.com/Garmelon/PFERD.git
synced 2025-10-20 00:32:33 +02:00
Reformat and switch to ruff
This commit is contained in:
@@ -21,8 +21,16 @@ from ..http_crawler import HttpCrawler, HttpCrawlerSection
|
||||
from .async_helper import _iorepeat
|
||||
from .file_templates import LinkData, Links, forum_thread_template, learning_module_template
|
||||
from .ilias_html_cleaner import clean, insert_base_markup
|
||||
from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
|
||||
IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
|
||||
from .kit_ilias_html import (
|
||||
IliasElementType,
|
||||
IliasForumThread,
|
||||
IliasLearningModulePage,
|
||||
IliasPage,
|
||||
IliasPageElement,
|
||||
IliasSoup,
|
||||
_sanitize_path_name,
|
||||
parse_ilias_forum_export,
|
||||
)
|
||||
from .shibboleth_login import ShibbolethLogin
|
||||
|
||||
TargetType = Union[str, int]
|
||||
@@ -55,9 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
|
||||
|
||||
self.invalid_value("login_type", login_type, "Should be <shibboleth | local>")
|
||||
|
||||
def tfa_auth(
|
||||
self, authenticators: Dict[str, Authenticator]
|
||||
) -> Optional[Authenticator]:
|
||||
def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
|
||||
value: Optional[str] = self.s.get("tfa_auth")
|
||||
if value is None:
|
||||
return None
|
||||
@@ -166,17 +172,19 @@ class IliasWebCrawler(HttpCrawler):
|
||||
name: str,
|
||||
section: IliasWebCrawlerSection,
|
||||
config: Config,
|
||||
authenticators: Dict[str, Authenticator]
|
||||
authenticators: Dict[str, Authenticator],
|
||||
):
|
||||
# Setting a main authenticator for cookie sharing
|
||||
auth = section.auth(authenticators)
|
||||
super().__init__(name, section, config, shared_auth=auth)
|
||||
|
||||
if section.tasks() > 1:
|
||||
log.warn("""
|
||||
log.warn(
|
||||
"""
|
||||
Please avoid using too many parallel requests as these are the KIT ILIAS
|
||||
instance's greatest bottleneck.
|
||||
""".strip())
|
||||
""".strip()
|
||||
)
|
||||
|
||||
self._auth = auth
|
||||
self._base_url = section.base_url()
|
||||
@@ -210,22 +218,19 @@ instance's greatest bottleneck.
|
||||
# Start crawling at the given course
|
||||
root_url = url_set_query_param(
|
||||
urljoin(self._base_url + "/", "goto.php"),
|
||||
"target", f"crs_{course_id}",
|
||||
"target",
|
||||
f"crs_{course_id}",
|
||||
)
|
||||
|
||||
await self._crawl_url(root_url, expected_id=course_id)
|
||||
|
||||
async def _crawl_desktop(self) -> None:
|
||||
await self._crawl_url(
|
||||
urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show"),
|
||||
crawl_nested_courses=True
|
||||
urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show"), crawl_nested_courses=True
|
||||
)
|
||||
|
||||
async def _crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
expected_id: Optional[int] = None,
|
||||
crawl_nested_courses: bool = False
|
||||
self, url: str, expected_id: Optional[int] = None, crawl_nested_courses: bool = False
|
||||
) -> None:
|
||||
if awaitable := await self._handle_ilias_page(
|
||||
url, None, PurePath("."), expected_id, crawl_nested_courses
|
||||
@@ -238,7 +243,7 @@ instance's greatest bottleneck.
|
||||
current_element: Optional[IliasPageElement],
|
||||
path: PurePath,
|
||||
expected_course_id: Optional[int] = None,
|
||||
crawl_nested_courses: bool = False
|
||||
crawl_nested_courses: bool = False,
|
||||
) -> Optional[Coroutine[Any, Any, None]]:
|
||||
maybe_cl = await self.crawl(path)
|
||||
if not maybe_cl:
|
||||
@@ -319,10 +324,7 @@ instance's greatest bottleneck.
|
||||
# works correctly.
|
||||
@anoncritical
|
||||
async def _handle_ilias_element(
|
||||
self,
|
||||
parent_path: PurePath,
|
||||
element: IliasPageElement,
|
||||
crawl_nested_courses: bool = False
|
||||
self, parent_path: PurePath, element: IliasPageElement, crawl_nested_courses: bool = False
|
||||
) -> Optional[Coroutine[Any, Any, None]]:
|
||||
# element.name might contain `/` if the crawler created nested elements,
|
||||
# so we can not sanitize it here. We trust in the output dir to thwart worst-case
|
||||
@@ -344,7 +346,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](enable with option 'videos')"
|
||||
"[bright_black](enable with option 'videos')",
|
||||
)
|
||||
return None
|
||||
|
||||
@@ -356,7 +358,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](enable with option 'forums')"
|
||||
"[bright_black](enable with option 'forums')",
|
||||
)
|
||||
return None
|
||||
return await self._handle_forum(element, element_path)
|
||||
@@ -365,7 +367,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](tests contain no relevant data)"
|
||||
"[bright_black](tests contain no relevant data)",
|
||||
)
|
||||
return None
|
||||
elif element.type == IliasElementType.SURVEY:
|
||||
@@ -373,7 +375,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](surveys contain no relevant data)"
|
||||
"[bright_black](surveys contain no relevant data)",
|
||||
)
|
||||
return None
|
||||
elif element.type == IliasElementType.SCORM_LEARNING_MODULE:
|
||||
@@ -381,7 +383,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](scorm learning modules are not supported)"
|
||||
"[bright_black](scorm learning modules are not supported)",
|
||||
)
|
||||
return None
|
||||
elif element.type == IliasElementType.LITERATURE_LIST:
|
||||
@@ -389,7 +391,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](literature lists are not currently supported)"
|
||||
"[bright_black](literature lists are not currently supported)",
|
||||
)
|
||||
return None
|
||||
elif element.type == IliasElementType.LEARNING_MODULE_HTML:
|
||||
@@ -397,7 +399,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](HTML learning modules are not supported)"
|
||||
"[bright_black](HTML learning modules are not supported)",
|
||||
)
|
||||
return None
|
||||
elif element.type == IliasElementType.BLOG:
|
||||
@@ -405,7 +407,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](blogs are not currently supported)"
|
||||
"[bright_black](blogs are not currently supported)",
|
||||
)
|
||||
return None
|
||||
elif element.type == IliasElementType.DCL_RECORD_LIST:
|
||||
@@ -413,7 +415,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](dcl record lists are not currently supported)"
|
||||
"[bright_black](dcl record lists are not currently supported)",
|
||||
)
|
||||
return None
|
||||
elif element.type == IliasElementType.MEDIA_POOL:
|
||||
@@ -421,7 +423,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](media pools are not currently supported)"
|
||||
"[bright_black](media pools are not currently supported)",
|
||||
)
|
||||
return None
|
||||
elif element.type == IliasElementType.COURSE:
|
||||
@@ -431,7 +433,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](not descending into linked course)"
|
||||
"[bright_black](not descending into linked course)",
|
||||
)
|
||||
return None
|
||||
elif element.type == IliasElementType.WIKI:
|
||||
@@ -439,7 +441,7 @@ instance's greatest bottleneck.
|
||||
"[bold bright_black]",
|
||||
"Ignored",
|
||||
fmt_path(element_path),
|
||||
"[bright_black](wikis are not currently supported)"
|
||||
"[bright_black](wikis are not currently supported)",
|
||||
)
|
||||
return None
|
||||
elif element.type == IliasElementType.LEARNING_MODULE:
|
||||
@@ -513,19 +515,15 @@ instance's greatest bottleneck.
|
||||
@anoncritical
|
||||
@_iorepeat(3, "resolving link")
|
||||
async def _download_link(
|
||||
self,
|
||||
link_renderer: Links,
|
||||
collection_name: str,
|
||||
links: list[LinkData],
|
||||
dl: DownloadToken
|
||||
self, link_renderer: Links, collection_name: str, links: list[LinkData], dl: DownloadToken
|
||||
) -> None:
|
||||
async with dl as (bar, sink):
|
||||
rendered = link_renderer.interpolate(self._link_file_redirect_delay, collection_name, links)
|
||||
sink.file.write(rendered.encode("utf-8"))
|
||||
sink.done()
|
||||
|
||||
async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal['none']]:
|
||||
async def impl() -> Optional[Union[BeautifulSoup, Literal['none']]]:
|
||||
async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal["none"]]:
|
||||
async def impl() -> Optional[Union[BeautifulSoup, Literal["none"]]]:
|
||||
async with self.session.get(export_url, allow_redirects=False) as resp:
|
||||
# No redirect means we were authenticated
|
||||
if hdrs.LOCATION not in resp.headers:
|
||||
@@ -626,7 +624,7 @@ instance's greatest bottleneck.
|
||||
if self.prev_report:
|
||||
self.report.add_custom_value(
|
||||
_get_video_cache_key(element),
|
||||
self.prev_report.get_custom_value(_get_video_cache_key(element))
|
||||
self.prev_report.get_custom_value(_get_video_cache_key(element)),
|
||||
)
|
||||
|
||||
# A video might contain other videos, so let's "crawl" the video first
|
||||
@@ -698,7 +696,7 @@ instance's greatest bottleneck.
|
||||
def add_to_report(paths: list[str]) -> None:
|
||||
self.report.add_custom_value(
|
||||
_get_video_cache_key(element),
|
||||
{"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))}
|
||||
{"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))},
|
||||
)
|
||||
|
||||
async with dl as (bar, sink):
|
||||
@@ -752,11 +750,7 @@ instance's greatest bottleneck.
|
||||
await self._stream_from_url(element, sink, bar, is_video)
|
||||
|
||||
async def _stream_from_url(
|
||||
self,
|
||||
element: IliasPageElement,
|
||||
sink: FileSink,
|
||||
bar: ProgressBar,
|
||||
is_video: bool
|
||||
self, element: IliasPageElement, sink: FileSink, bar: ProgressBar, is_video: bool
|
||||
) -> None:
|
||||
url = element.url
|
||||
|
||||
@@ -831,10 +825,10 @@ instance's greatest bottleneck.
|
||||
log.warn("Could not extract forum export url")
|
||||
return
|
||||
|
||||
export = await self._post(export_url, {
|
||||
"format": "html",
|
||||
"cmd[createExportFile]": ""
|
||||
})
|
||||
export = await self._post(
|
||||
export_url,
|
||||
{"format": "html", "cmd[createExportFile]": ""},
|
||||
)
|
||||
|
||||
elements = parse_ilias_forum_export(soupify(export))
|
||||
|
||||
@@ -848,10 +842,7 @@ instance's greatest bottleneck.
|
||||
@anoncritical
|
||||
@_iorepeat(3, "saving forum thread")
|
||||
async def _download_forum_thread(
|
||||
self,
|
||||
parent_path: PurePath,
|
||||
thread: Union[IliasForumThread, IliasPageElement],
|
||||
forum_url: str
|
||||
self, parent_path: PurePath, thread: Union[IliasForumThread, IliasPageElement], forum_url: str
|
||||
) -> None:
|
||||
path = parent_path / (_sanitize_path_name(thread.name) + ".html")
|
||||
maybe_dl = await self.download(path, mtime=thread.mtime)
|
||||
@@ -860,10 +851,7 @@ instance's greatest bottleneck.
|
||||
|
||||
async with maybe_dl as (bar, sink):
|
||||
rendered = forum_thread_template(
|
||||
thread.name,
|
||||
forum_url,
|
||||
thread.name_tag,
|
||||
await self.internalize_images(thread.content_tag)
|
||||
thread.name, forum_url, thread.name_tag, await self.internalize_images(thread.content_tag)
|
||||
)
|
||||
sink.file.write(rendered.encode("utf-8"))
|
||||
sink.done()
|
||||
@@ -891,13 +879,13 @@ instance's greatest bottleneck.
|
||||
soup = await self._get_page(element.url)
|
||||
page = IliasPage(soup, element)
|
||||
if next := page.get_learning_module_data():
|
||||
elements.extend(await self._crawl_learning_module_direction(
|
||||
cl.path, next.previous_url, "left", element
|
||||
))
|
||||
elements.extend(
|
||||
await self._crawl_learning_module_direction(cl.path, next.previous_url, "left", element)
|
||||
)
|
||||
elements.append(next)
|
||||
elements.extend(await self._crawl_learning_module_direction(
|
||||
cl.path, next.next_url, "right", element
|
||||
))
|
||||
elements.extend(
|
||||
await self._crawl_learning_module_direction(cl.path, next.next_url, "right", element)
|
||||
)
|
||||
|
||||
# Reflect their natural ordering in the file names
|
||||
for index, lm_element in enumerate(elements):
|
||||
@@ -907,9 +895,9 @@ instance's greatest bottleneck.
|
||||
for index, elem in enumerate(elements):
|
||||
prev_url = elements[index - 1].title if index > 0 else None
|
||||
next_url = elements[index + 1].title if index < len(elements) - 1 else None
|
||||
tasks.append(asyncio.create_task(
|
||||
self._download_learning_module_page(cl.path, elem, prev_url, next_url)
|
||||
))
|
||||
tasks.append(
|
||||
asyncio.create_task(self._download_learning_module_page(cl.path, elem, prev_url, next_url))
|
||||
)
|
||||
|
||||
# And execute them
|
||||
await self.gather(tasks)
|
||||
@@ -919,7 +907,7 @@ instance's greatest bottleneck.
|
||||
path: PurePath,
|
||||
start_url: Optional[str],
|
||||
dir: Union[Literal["left"], Literal["right"]],
|
||||
parent_element: IliasPageElement
|
||||
parent_element: IliasPageElement,
|
||||
) -> List[IliasLearningModulePage]:
|
||||
elements: List[IliasLearningModulePage] = []
|
||||
|
||||
@@ -950,7 +938,7 @@ instance's greatest bottleneck.
|
||||
parent_path: PurePath,
|
||||
element: IliasLearningModulePage,
|
||||
prev: Optional[str],
|
||||
next: Optional[str]
|
||||
next: Optional[str],
|
||||
) -> None:
|
||||
path = parent_path / (_sanitize_path_name(element.title) + ".html")
|
||||
maybe_dl = await self.download(path)
|
||||
@@ -1037,11 +1025,7 @@ instance's greatest bottleneck.
|
||||
)
|
||||
return soup
|
||||
|
||||
async def _post(
|
||||
self,
|
||||
url: str,
|
||||
data: dict[str, Union[str, List[str]]]
|
||||
) -> bytes:
|
||||
async def _post(self, url: str, data: dict[str, Union[str, List[str]]]) -> bytes:
|
||||
form_data = aiohttp.FormData()
|
||||
for key, val in data.items():
|
||||
form_data.add_field(key, val)
|
||||
@@ -1090,8 +1074,8 @@ instance's greatest bottleneck.
|
||||
username, password = await self._auth.credentials()
|
||||
|
||||
login_form_data = aiohttp.FormData()
|
||||
login_form_data.add_field('login_form/input_3/input_4', username)
|
||||
login_form_data.add_field('login_form/input_3/input_5', password)
|
||||
login_form_data.add_field("login_form/input_3/input_4", username)
|
||||
login_form_data.add_field("login_form/input_3/input_5", password)
|
||||
|
||||
# do the actual login
|
||||
async with self.session.post(urljoin(self._base_url, login_url), data=login_form_data) as request:
|
||||
|
Reference in New Issue
Block a user