diff --git a/CHANGELOG.md b/CHANGELOG.md index de29b58..f9bf6d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,12 @@ ambiguous situations. ## Unreleased +## Added +- Support for link collections. + In "fancy" mode, a single HTML file with multiple links is generated. + In all other modes, PFERD creates a folder for the collection and a new file + for every link inside. + ## Fixed - Crawling of exercises with instructions diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index ae8bb1e..f959917 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -1,3 +1,5 @@ +import dataclasses +import re from enum import Enum from typing import Optional, cast @@ -12,7 +14,9 @@ _link_template_fancy = """ ILIAS - Link: {{name}} + + -
- -
-
- {{name}} +
+ +
+ -
{{description}}
+
+
+ {{name}} +
+
{{description}}
+
+
- +
@@ -255,6 +270,13 @@ def forum_thread_template(name: str, url: str, heading: bs4.Tag, content: bs4.Ta .replace("{{content}}", cast(str, content.prettify())) +@dataclasses.dataclass +class LinkData: + name: str + url: str + description: str + + class Links(Enum): IGNORE = "ignore" PLAINTEXT = "plaintext" @@ -272,6 +294,11 @@ class Links(Enum): return None raise ValueError("Missing switch case") + def collection_as_one(self) -> bool: + if self == Links.FANCY: + return True + return False + def extension(self) -> Optional[str]: if self == Links.FANCY: return ".html" @@ -283,10 +310,48 @@ class Links(Enum): return None raise ValueError("Missing switch case") + def interpolate(self, redirect_delay: int, collection_name: str, links: list[LinkData]) -> str: + template = self.template() + if template is None: + raise ValueError("Cannot interpolate ignored links") + + if len(links) == 1: + link = links[0] + content = template + content = content.replace("{{link}}", link.url) + content = content.replace("{{name}}", link.name) + content = content.replace("{{description}}", link.description) + content = content.replace("{{redirect_delay}}", str(redirect_delay)) + return content + if self == Links.PLAINTEXT or self == Links.INTERNET_SHORTCUT: + return "\n".join(f"{link.url}" for link in links) + + # All others get coerced to fancy + content = cast(str, Links.FANCY.template()) + repeated_content = cast( + re.Match[str], + re.search(r"([\s\S]+)", content) + ).group(1) + + parts = [] + for link in links: + instance = repeated_content + instance = instance.replace("{{link}}", link.url) + instance = instance.replace("{{name}}", link.name) + instance = instance.replace("{{description}}", link.description) + instance = instance.replace("{{redirect_delay}}", str(redirect_delay)) + parts.append(instance) + + content = content.replace(repeated_content, "\n".join(parts)) + content = content.replace("{{name}}", collection_name) + content = re.sub(r"[\s\S]+", "", content) + + return content + @staticmethod def from_string(string: str) -> "Links": try: return Links(string) except ValueError: - raise ValueError("must be one of 'ignore', 'plaintext'," - " 'html', 'internet-shortcut'") + options = [f"'{option.value}'" for option in Links] + raise ValueError(f"must be one of {', '.join(options)}") diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index 3b78e5d..b682c0a 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -19,7 +19,7 @@ from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection from .async_helper import _iorepeat -from .file_templates import Links, forum_thread_template, learning_module_template +from .file_templates import LinkData, Links, forum_thread_template, learning_module_template from .ilias_html_cleaner import clean, insert_base_markup from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export) @@ -437,6 +437,8 @@ instance's greatest bottleneck. return await self._handle_learning_module(element, element_path) elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) + elif element.type == IliasElementType.LINK_COLLECTION: + return await self._handle_link(element, element_path) elif element.type == IliasElementType.BOOKING: return await self._handle_booking(element, element_path) elif element.type == IliasElementType.OPENCAST_VIDEO: @@ -462,44 +464,97 @@ instance's greatest bottleneck. log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") log.explain(f"Links type is {self._links}") - link_template_maybe = self._links.template() - link_extension = self._links.extension() - if not link_template_maybe or not link_extension: + export_url = url_set_query_param(element.url, "cmd", "exportHTML") + resolved = await self._resolve_link_target(export_url) + if resolved == "none": + links = [LinkData(element.name, "", element.description or "")] + else: + links = self._parse_link_content(element, cast(BeautifulSoup, resolved)) + + maybe_extension = self._links.extension() + + if not maybe_extension: log.explain("Answer: No") return None else: log.explain("Answer: Yes") - element_path = element_path.with_name(element_path.name + link_extension) - maybe_dl = await self.download(element_path, mtime=element.mtime) - if not maybe_dl: + if len(links) <= 1 or self._links.collection_as_one(): + element_path = element_path.with_name(element_path.name + maybe_extension) + maybe_dl = await self.download(element_path, mtime=element.mtime) + if not maybe_dl: + return None + return self._download_link(self._links, element.name, links, maybe_dl) + + maybe_cl = await self.crawl(element_path) + if not maybe_cl: return None + # Required for download_all closure + cl = maybe_cl + extension = maybe_extension - return self._download_link(element, link_template_maybe, maybe_dl) + async def download_all() -> None: + for link in links: + path = cl.path / (_sanitize_path_name(link.name) + extension) + if dl := await self.download(path, mtime=element.mtime): + await self._download_link(self._links, element.name, [link], dl) + + return download_all() @anoncritical @_iorepeat(3, "resolving link") - async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: - async with dl as (bar, sink): - export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") - real_url = await self._resolve_link_target(export_url) - self._write_link_content(link_template, real_url, element.name, element.description, sink) - - def _write_link_content( + async def _download_link( self, - link_template: str, - url: str, - name: str, - description: Optional[str], - sink: FileSink, + link_renderer: Links, + collection_name: str, + links: list[LinkData], + dl: DownloadToken ) -> None: - content = link_template - content = content.replace("{{link}}", url) - content = content.replace("{{name}}", name) - content = content.replace("{{description}}", str(description)) - content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) - sink.file.write(content.encode("utf-8")) - sink.done() + async with dl as (bar, sink): + rendered = link_renderer.interpolate(self._link_file_redirect_delay, collection_name, links) + sink.file.write(rendered.encode("utf-8")) + sink.done() + + async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal['none']]: + async def impl() -> Optional[Union[BeautifulSoup, Literal['none']]]: + async with self.session.get(export_url, allow_redirects=False) as resp: + # No redirect means we were authenticated + if hdrs.LOCATION not in resp.headers: + return soupify(await resp.read()) # .select_one("a").get("href").strip() # type: ignore + # We are either unauthenticated or the link is not active + new_url = resp.headers[hdrs.LOCATION].lower() + if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url: + return "none" + return None + + auth_id = await self._current_auth_id() + target = await impl() + if target is not None: + return target + + await self.authenticate(auth_id) + + target = await impl() + if target is not None: + return target + + raise CrawlError("resolve_link_target failed even after authenticating") + + @staticmethod + def _parse_link_content(element: IliasPageElement, content: BeautifulSoup) -> list[LinkData]: + links = cast(list[Tag], list(content.select("a"))) + if len(links) == 1: + url = str(links[0].get("href")).strip() + return [LinkData(name=element.name, description=element.description or "", url=url)] + + results = [] + for link in links: + url = str(link.get("href")).strip() + name = link.get_text(strip=True) + description = cast(Tag, link.find_next_sibling("dd")).get_text(strip=True) + results.append(LinkData(name=name, description=description, url=url.strip())) + + return results async def _handle_booking( self, @@ -524,7 +579,7 @@ instance's greatest bottleneck. self._ensure_not_seen(element, element_path) - return self._download_booking(element, link_template_maybe, maybe_dl) + return self._download_booking(element, maybe_dl) @anoncritical @_iorepeat(1, "downloading description") @@ -545,36 +600,13 @@ instance's greatest bottleneck. async def _download_booking( self, element: IliasPageElement, - link_template: str, dl: DownloadToken, ) -> None: async with dl as (bar, sink): - self._write_link_content(link_template, element.url, element.name, element.description, sink) - - async def _resolve_link_target(self, export_url: str) -> str: - async def impl() -> Optional[str]: - async with self.session.get(export_url, allow_redirects=False) as resp: - # No redirect means we were authenticated - if hdrs.LOCATION not in resp.headers: - return soupify(await resp.read()).select_one("a").get("href").strip() # type: ignore - # We are either unauthenticated or the link is not active - new_url = resp.headers[hdrs.LOCATION].lower() - if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url: - return "" - return None - - auth_id = await self._current_auth_id() - target = await impl() - if target is not None: - return target - - await self.authenticate(auth_id) - - target = await impl() - if target is not None: - return target - - raise CrawlError("resolve_link_target failed even after authenticating") + links = [LinkData(name=element.name, description=element.description or "", url=element.url)] + rendered = self._links.interpolate(self._link_file_redirect_delay, element.name, links) + sink.file.write(rendered.encode("utf-8")) + sink.done() async def _handle_opencast_video( self, diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 105c606..70ec3d7 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -109,6 +109,7 @@ class IliasElementType(Enum): LEARNING_MODULE_HTML = "learning_module_html" LITERATURE_LIST = "literature_list" LINK = "link" + LINK_COLLECTION = "link_collection" MEDIA_POOL = "media_pool" MEDIACAST_VIDEO = "mediacast_video" MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" @@ -202,7 +203,12 @@ class IliasElementType(Enum): TypeMatcher.query("baseclass=illinkresourcehandlergui"), TypeMatcher.query("calldirectlink"), ), - TypeMatcher.img_src("_webr.svg") + TypeMatcher.img_src("_webr.svg") # duplicated :( + ) + case IliasElementType.LINK_COLLECTION: + return TypeMatcher.any( + TypeMatcher.query("baseclass=illinkresourcehandlergui"), + TypeMatcher.img_src("_webr.svg") # duplicated :( ) case IliasElementType.MEDIA_POOL: return TypeMatcher.any(