Download page descriptions

2025-07-19 01:12:38 +02:00 · 2022-05-11 21:16:09 +02:00
parent a5015fe9b1
commit 846c29aee1
4 changed files with 148 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,9 @@ ambiguous situations.

 ## Unreleased

+### Added
+- Download of page descriptions
+
 ### Changed
 - Add `cpp` extension to default `link_regex` of IPD crawler
 - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option
--- a/PFERD/crawl/ilias/ilias_html_cleaner.py
+++ b/PFERD/crawl/ilias/ilias_html_cleaner.py
@@ -0,0 +1,91 @@
+from bs4 import BeautifulSoup, Comment, Tag
+
+# Inline CSS that insert_base_markup() places into a <style> tag of the
+# generated <head>. The rules target the ilc_* classes found in the page
+# markup, the "accordion-head" class added by clean(), and a few generic
+# elements (h3, article, body).
+_STYLE_TAG_CONTENT = """
+    .ilc_text_block_Information {
+      background-color: #f5f7fa;
+    }
+    div.ilc_text_block_Standard {
+      margin-bottom: 10px;
+      margin-top: 10px;
+    }
+    span.ilc_text_inline_Strong {
+      font-weight: bold;
+    }
+
+    .accordion-head {
+      background-color: #f5f7fa;
+      padding: 0.5rem 0;
+    }
+
+    h3 {
+      margin-top: 0.5rem;
+      margin-bottom: 1rem;
+    }
+
+    br.visible-break {
+      margin-bottom: 1rem;
+    }
+
+    article {
+      margin: 0.5rem 0;
+    }
+
+    body {
+      padding: 1em;
+      grid-template-columns: 1fr min(60rem, 90%) 1fr;
+      line-height: 1.2;
+    }
+"""
+
+# Class names whose elements clean() renames to <article> tags.
+_ARTICLE_WORTHY_CLASSES = [
+    "ilc_text_block_Information",
+    "ilc_section_Attention",
+    "ilc_section_Link",
+]
+
+
+def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
+    """Prepend a <head> element with basic styling to the given soup.
+
+    The head links the simple.css stylesheet from its CDN and carries an
+    inline <style> tag holding _STYLE_TAG_CONTENT. The soup is modified in
+    place and returned.
+    """
+    # Equivalent to: <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
+    stylesheet_link: Tag = soup.new_tag("link")
+    stylesheet_link["rel"] = "stylesheet"
+    stylesheet_link["href"] = "https://cdn.simplecss.org/simple.css"
+
+    # Basic style tags for compat
+    inline_style: Tag = soup.new_tag("style")
+    inline_style.append(_STYLE_TAG_CONTENT)
+
+    # Assemble the head (link first, then style) and put it at the very top of the soup
+    document_head = soup.new_tag("head")
+    document_head.append(stylesheet_link)
+    document_head.append(inline_style)
+    soup.insert(0, document_head)
+
+    return soup
+
+
+def clean(soup: BeautifulSoup) -> BeautifulSoup:
+    """Restructure a description soup in place and return it.
+
+    Steps, in order:
+
+    - elements carrying one of _ARTICLE_WORTHY_CLASSES become <article> tags,
+    - existing <h3> tags become <div> tags, then <h1> tags become <h3> tags,
+    - elements with class "ilc_va_ihcap_VAccordIHeadCap" become <h3> tags and
+      additionally receive the "accordion-head" class,
+    - ".ilc_text_block_Standard.ilc_Paragraph" elements that have no children,
+      or whose only child is a comment, are removed,
+    - every "ilc_section_Separator" element gets an <hr> as its first child.
+    """
+    for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES):
+        block.name = "article"
+
+    # Runs before the h1 -> h3 step, so only former h1 tags (and the elements
+    # renamed further below) end up as h3
+    for block in soup.find_all("h3"):
+        block.name = "div"
+
+    for block in soup.find_all("h1"):
+        block.name = "h3"
+
+    for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"):
+        block.name = "h3"
+        block["class"] += ["accordion-head"]
+
+    for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"):
+        children = list(dummy.children)
+        if not children:
+            dummy.decompose()
+            # Nothing left to inspect; falling through would index into the empty list
+            continue
+        if len(children) > 1:
+            continue
+        if isinstance(children[0], Comment):
+            dummy.decompose()
+
+    for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
+        hrule_imposter.insert(0, soup.new_tag("hr"))
+
+    return soup
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -85,6 +85,31 @@ class IliasPage:
        log.explain("Page is a normal folder, searching for elements")
        return self._find_normal_entries()

+    def get_description(self) -> Optional[BeautifulSoup]:
+        """Collect the page's description markup into a standalone soup.
+
+        Returns None when the page has no element with one of the classes
+        accepted by is_interesting_class. Otherwise returns a freshly parsed
+        soup whose <body> holds the outermost matching elements, skipping
+        those that carry the "ilc_section_Special" class.
+        """
+
+        def is_interesting_class(name: str) -> bool:
+            return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
+
+        paragraphs: List[Tag] = self._soup.find_all(class_=is_interesting_class)
+        if not paragraphs:
+            return None
+
+        # Extract bits and pieces into a string and parse it again.
+        # This ensures we don't miss anything and weird structures are resolved
+        # somewhat gracefully.
+        snippets: List[str] = []
+        for p in paragraphs:
+            # Nested matches are already contained in their outermost matching ancestor
+            if p.find_parent(class_=is_interesting_class):
+                continue
+
+            # Ignore special listings (like folder groupings)
+            if "ilc_section_Special" in p["class"]:
+                continue
+
+            snippets.append(str(p) + "\n")
+        raw_html = "<body>\n" + "".join(snippets) + "\n</body>"
+
+        return BeautifulSoup(raw_html, "html.parser")
+
    def get_next_stage_element(self) -> Optional[IliasPageElement]:
        if self._is_ilias_opencast_embedding():
            return self.get_child_elements()[0]
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -17,6 +17,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
+from .ilias_html_cleaner import clean, insert_base_markup
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement

 TargetType = Union[str, int]
@@ -215,6 +216,8 @@ instance's greatest bottleneck.
        cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608

        elements: List[IliasPageElement] = []
+        # A list as variable redefinitions are not propagated to outer scopes
+        description: List[BeautifulSoup] = []

        @_iorepeat(3, "crawling url")
        async def gather_elements() -> None:
@@ -233,9 +236,15 @@ instance's greatest bottleneck.
                page = IliasPage(soup, url, None)
                elements.extend(page.get_child_elements())

+                if description_string := page.get_description():
+                    description.append(description_string)
+
        # Fill up our task list with the found elements
        await gather_elements()

+        if description:
+            await self._download_description(PurePath("."), description[0])
+
        elements.sort(key=lambda e: e.id())

        tasks: List[Awaitable[None]] = []
@@ -265,6 +274,8 @@ instance's greatest bottleneck.
        cl: CrawlToken,
    ) -> None:
        elements: List[IliasPageElement] = []
+        # A list as variable redefinitions are not propagated to outer scopes
+        description: List[BeautifulSoup] = []

        @_iorepeat(3, "crawling folder")
        async def gather_elements() -> None:
@@ -285,10 +296,15 @@ instance's greatest bottleneck.
                        next_stage_url = None

                elements.extend(page.get_child_elements())
+                if description_string := page.get_description():
+                    description.append(description_string)

        # Fill up our task list with the found elements
        await gather_elements()

+        if description:
+            await self._download_description(PurePath("."), description[0])
+
        elements.sort(key=lambda e: e.id())

        tasks: List[Awaitable[None]] = []
@@ -425,6 +441,19 @@ instance's greatest bottleneck.

        return self._download_booking(element, link_template_maybe, maybe_dl)

+    @anoncritical
+    @_iorepeat(1, "downloading description")
+    async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None:
+        """Write the given description soup to "Description.html" below parent_path.
+
+        The download is requested with Redownload.ALWAYS. If self.download
+        yields nothing, the method returns without writing. Otherwise the soup
+        is passed through insert_base_markup and clean, prettified and written
+        to the sink as UTF-8.
+        """
+        target = parent_path / "Description.html"
+        maybe_dl = await self.download(target, redownload=Redownload.ALWAYS)
+        if maybe_dl:
+            async with maybe_dl as (_bar, sink):
+                cleaned = clean(insert_base_markup(description))
+                sink.file.write(cleaned.prettify().encode("utf-8"))
+                sink.done()
+
    @anoncritical
    @_iorepeat(3, "resolving booking")
    async def _download_booking(