# Standalone HTML document wrapped around a learning module page. The charset is
# declared explicitly because the rendered page is written to disk as UTF-8.
_learning_module_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>{{name}}</title>
</head>
<body>
{{body}}
</body>
</html>
"""

# Replacement markup for the ILIAS top/bottom navigation bars.
_learning_module_nav_template = """
<div class="nav">
    {{left}}
    {{right}}
</div>
"""


def _learning_module_nav_link(body: bs4.Tag, selector: str, target: Optional[str]) -> str:
    """Return an anchor to ``target`` labelled with the text of ``selector``.

    Returns an empty string when there is no target or when ``body`` contains
    no element matching ``selector``.
    """
    import html

    if not target:
        return ""
    label = body.select_one(selector)
    if not label:
        return ""
    text = label.getText().strip()
    # Both values are plain text at this point and must be escaped before
    # being placed back into markup.
    return f'<a href="{html.escape(target, quote=True)}">{html.escape(text)}</a>'


def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str:
    """Render a learning module page as a standalone HTML document.

    ``body`` is modified in place: fullscreen modal elements are removed and
    the top and bottom navigation bars are replaced by links to ``prev`` and
    ``next``.

    :param body: the page content element
    :param name: text placed in the document title
    :param prev: link target for the previous page, if any
    :param next: link target for the next page, if any
    :return: the complete HTML document as a string
    """
    import html

    # Seems to be comments, ignore those.
    for elem in body.select(".il-copg-mob-fullscreen-modal"):
        elem.decompose()

    # The labels must be read before the navigation bars are replaced below.
    left = _learning_module_nav_link(body, ".ilc_page_lnav_LeftNavigation", prev)
    right = _learning_module_nav_link(body, ".ilc_page_rnav_RightNavigation", next)
    nav_html = _learning_module_nav_template.replace("{{left}}", left).replace("{{right}}", right)

    for selector in (".ilc_page_tnav_TopNavigation", ".ilc_page_bnav_BottomNavigation"):
        nav = body.select_one(selector)
        if nav is not None:
            # Parse a fresh fragment per bar; one node cannot be inserted twice.
            nav.replace_with(soupify(nav_html.encode()))

    # Fill in the name first so that a literal "{{name}}" occurring inside the
    # page content is left untouched.
    page = _learning_module_template.replace("{{name}}", html.escape(name))
    return page.replace("{{body}}", body.prettify())
@dataclass
class IliasLearningModulePage:
    """One page of an ILIAS learning module, as built by ``get_learning_module_data``."""
    # Stripped text of the page title element (empty if the page has none)
    title: str
    # The "#ilLMPageContent" element of the page
    content: Tag
    # URL of the following page as found in the right navigation link, if any
    next_url: Optional[str]
    # URL of the preceding page as found in the left navigation link, if any
    previous_url: Optional[str]


# The functions below are methods of IliasPage (kit_ilias_html.py); the class
# header is defined elsewhere.

def get_learning_module_data(self) -> Optional[IliasLearningModulePage]:
    """Extract title, content and neighbour links of a learning module page.

    :return: the page data, or None if this is not a learning module page
    """
    if not self._is_learning_module_page():
        return None
    content = self._soup.select_one("#ilLMPageContent")
    title_elem = self._soup.select_one(".ilc_page_title_PageTitle")
    # A page without a title element gets an empty title instead of failing
    # with an AttributeError on None.
    title = title_elem.getText().strip() if title_elem else ""
    return IliasLearningModulePage(
        title=title,
        content=content,
        next_url=self._find_learning_module_next(),
        previous_url=self._find_learning_module_prev()
    )


def _find_learning_module_nav_url(self, selector: str) -> Optional[str]:
    """Return the first learning module URL among the links matching ``selector``."""
    for link in self._soup.select(selector):
        url = self._abs_url_from_link(link)
        # Only links pointing back into the learning module presentation count.
        if "baseClass=ilLMPresentationGUI" in url:
            return url
    return None


def _find_learning_module_next(self) -> Optional[str]:
    """Return the URL of the next learning module page, or None if there is none."""
    return self._find_learning_module_nav_url("a.ilc_page_rnavlink_RightNavigationLink")


def _find_learning_module_prev(self) -> Optional[str]:
    """Return the URL of the previous learning module page, or None if there is none."""
    return self._find_learning_module_nav_url("a.ilc_page_lnavlink_LeftNavigationLink")


def _is_learning_module_page(self) -> bool:
    """Return whether the page's permalink marks it as a learning module page."""
    link = self._soup.find(id="current_perma_link")
    if not link:
        return False
    # The attribute may be absent; treat that like a non-matching permalink.
    return "target=pg_" in (link.get("value") or "")
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index ae49edc..f82d684 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,8 +1,11 @@ import asyncio +import base64 +import os import re from collections.abc import Awaitable, Coroutine from pathlib import PurePath -from typing import Any, Callable, Dict, List, Optional, Set, Union, cast +from typing import Any, Callable, Dict, List, Literal, Optional, Set, Union, cast +from urllib.parse import urljoin import aiohttp import yarl @@ -16,10 +19,10 @@ from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection -from .file_templates import Links +from .file_templates import Links, learning_module_template from .ilias_html_cleaner import clean, insert_base_markup -from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, - _sanitize_path_name, parse_ilias_forum_export) +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, + IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -394,6 +397,8 @@ instance's greatest bottleneck. "[bright_black](surveys contain no relevant data)" ) return None + elif element.type == IliasElementType.LEARNING_MODULE: + return await self._handle_learning_module(element, element_path) elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) elif element.type == IliasElementType.BOOKING: @@ -739,6 +744,135 @@ instance's greatest bottleneck. 
# Learning module support for the ILIAS web crawler. The functions below are
# methods of the crawler class in kit_ilias_web_crawler.py; the class header is
# defined elsewhere.

async def _handle_learning_module(
    self,
    element: IliasPageElement,
    element_path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]:
    """Return the coroutine crawling a learning module.

    :return: the crawl coroutine, or None if ``self.crawl`` yields no token for
        ``element_path``
    """
    maybe_cl = await self.crawl(element_path)
    if not maybe_cl:
        return None
    return self._crawl_learning_module(element, maybe_cl)


@_iorepeat(3, "crawling learning module")
@anoncritical
async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None:
    """Collect all pages of a learning module and download each as an HTML file.

    The page at ``element.url`` is the starting point; from there the module is
    walked to the left and to the right via the pages' navigation links.
    """
    pages: List[IliasLearningModulePage] = []

    async with cl:
        log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
        log.explain(f"URL: {element.url}")
        soup = await self._get_page(element.url)
        page = IliasPage(soup, element.url, None)
        if start := page.get_learning_module_data():
            before = await self._crawl_learning_module_direction(
                cl.path, start.previous_url, "left"
            )
            # The leftward walk yields the nearest page first. Reverse it so
            # the list is in reading order.
            pages.extend(reversed(before))
            pages.append(start)
            pages.extend(await self._crawl_learning_module_direction(
                cl.path, start.next_url, "right"
            ))

    # Reflect their natural ordering in the file names
    for index, lm_page in enumerate(pages):
        lm_page.title = f"{index:02}_{lm_page.title}"

    tasks: List[Awaitable[None]] = []
    for index, lm_page in enumerate(pages):
        # Neighbours are passed by (prefixed) title; the download step turns
        # them into relative file links.
        prev_title = pages[index - 1].title if index > 0 else None
        next_title = pages[index + 1].title if index < len(pages) - 1 else None
        tasks.append(asyncio.create_task(
            self._download_learning_module_page(cl.path, lm_page, prev_title, next_title)
        ))

    # And execute them
    await self.gather(tasks)


async def _crawl_learning_module_direction(
    self,
    path: PurePath,
    start_url: Optional[str],
    dir: Union[Literal["left"], Literal["right"]]
) -> List[IliasLearningModulePage]:
    """Follow the navigation links of a learning module in one direction.

    :param path: path of the learning module, used for log output only
    :param start_url: URL of the first page to fetch; nothing is fetched if falsy
    :param dir: "left" follows ``previous_url``, "right" follows ``next_url``
    :return: the pages in the order they were visited
    """
    pages: List[IliasLearningModulePage] = []

    next_element_url: Optional[str] = start_url
    counter = 0
    while next_element_url:
        log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
        log.explain(f"URL: {next_element_url}")
        soup = await self._get_page(next_element_url)
        data = IliasPage(soup, next_element_url, None).get_learning_module_data()
        if not data:
            # Not a learning module page. Without stopping here the URL would
            # never advance and the same page would be fetched forever.
            break
        pages.append(data)
        next_element_url = data.previous_url if dir == "left" else data.next_url
        counter += 1

    return pages


@anoncritical
@_iorepeat(3, "saving learning module page")
async def _download_learning_module_page(
    self,
    parent_path: PurePath,
    element: IliasLearningModulePage,
    prev: Optional[str],
    next: Optional[str]
) -> None:
    """Write one learning module page as an HTML file below ``parent_path``.

    :param parent_path: path of the learning module
    :param element: the page to save; its title determines the file name
    :param prev: title of the previous page, if any
    :param next: title of the next page, if any
    """
    path = parent_path / (_sanitize_path_name(element.title) + ".html")
    maybe_dl = await self.download(path)
    if not maybe_dl:
        return
    my_path = self._transformer.transform(maybe_dl.path)
    if not my_path:
        return

    def relative_link(title: Optional[str]) -> Optional[str]:
        # Resolve a neighbour's title to its transformed file path, expressed
        # relative to the directory of the page being written.
        if not title:
            return None
        target = self._transformer.transform(parent_path / (_sanitize_path_name(title) + ".html"))
        if not target:
            return None
        return os.path.relpath(target, my_path.parent)

    prev_link = relative_link(prev)
    next_link = relative_link(next)

    async with maybe_dl as (bar, sink):
        content = await self.internalize_images(element.content)
        sink.file.write(
            learning_module_template(content, maybe_dl.path.name, prev_link, next_link).encode("utf-8")
        )
        sink.done()


async def internalize_images(self, tag: Tag) -> Tag:
    """
    Tries to fetch ILIAS images and embed them as base64 data.

    ``tag`` is modified in place and returned. Images whose URL does not start
    with the ILIAS base URL are left untouched. Protocol-relative iframe sources
    are prefixed with "https:".
    """
    log.explain_topic("Internalizing images")
    for elem in tag.find_all(recursive=True):
        if not isinstance(elem, Tag):
            continue
        if elem.name == "img":
            src = elem.attrs.get("src", None)
            if not src:
                continue
            url = urljoin(_ILIAS_URL, src)
            if not url.startswith(_ILIAS_URL):
                continue
            log.explain(f"Internalizing {url!r}")
            img = await self._get_authenticated(url)
            elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
        elif elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"):
            # For unknown reasons the protocol seems to be stripped.
            elem.attrs["src"] = "https:" + elem.attrs["src"]
    return tag


async def _get_authenticated(self, url: str) -> bytes:
    """Fetch ``url`` and return the response body, authenticating once if needed.

    :raises CrawlError: if the request does not return status 200 even after
        authenticating
    """
    auth_id = await self._current_auth_id()

    # Redirects are not followed; any status other than 200 is treated as
    # "not authenticated".
    async with self.session.get(url, allow_redirects=False) as request:
        if request.status == 200:
            return await request.read()

    # We weren't authenticated, so try to do that
    await self.authenticate(auth_id)

    # Retry once after authenticating. If this fails, we will die.
    async with self.session.get(url, allow_redirects=False) as request:
        if request.status == 200:
            return await request.read()
    raise CrawlError("get_authenticated failed even after authenticating")