From 846c29aee1867f7f0b7efae802af47fee77a3ec6 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen <i-al-istannen@users.noreply.github.com>
Date: Wed, 11 May 2022 21:16:09 +0200
Subject: [PATCH] Download page descriptions

---
 CHANGELOG.md                               |  3 +
 PFERD/crawl/ilias/ilias_html_cleaner.py    | 91 ++++++++++++++++++++++
 PFERD/crawl/ilias/kit_ilias_html.py        | 25 ++++++
 PFERD/crawl/ilias/kit_ilias_web_crawler.py | 29 +++++++
 4 files changed, 148 insertions(+)
 create mode 100644 PFERD/crawl/ilias/ilias_html_cleaner.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e2d3840..b7cad13 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,9 @@ ambiguous situations.
 
 ## Unreleased
 
+### Added
+- Download of page descriptions
+
 ### Changed
 - Add `cpp` extension to default `link_regex` of IPD crawler
 - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option
diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py
new file mode 100644
index 0000000..5952309
--- /dev/null
+++ b/PFERD/crawl/ilias/ilias_html_cleaner.py
@@ -0,0 +1,91 @@
+from bs4 import BeautifulSoup, Comment, Tag
+
+_STYLE_TAG_CONTENT = """
+    .ilc_text_block_Information {
+      background-color: #f5f7fa;
+    }
+    div.ilc_text_block_Standard {
+      margin-bottom: 10px;
+      margin-top: 10px;
+    }
+    span.ilc_text_inline_Strong {
+      font-weight: bold;
+    }
+
+    .accordion-head {
+      background-color: #f5f7fa;
+      padding: 0.5rem 0;
+    }
+
+    h3 {
+      margin-top: 0.5rem;
+      margin-bottom: 1rem;
+    }
+
+    br.visible-break {
+      margin-bottom: 1rem;
+    }
+
+    article {
+      margin: 0.5rem 0;
+    }
+
+    body {
+      padding: 1em;
+      grid-template-columns: 1fr min(60rem, 90%) 1fr;
+      line-height: 1.2;
+    }
+"""  # CSS text that insert_base_markup() places inside a <style> tag
+
+_ARTICLE_WORTHY_CLASSES = [  # elements carrying one of these classes are renamed to <article> by clean()
+    "ilc_text_block_Information",
+    "ilc_section_Attention",
+    "ilc_section_Link",
+]
+
+
+def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
+    """Insert a ``<head>`` (simple.css link plus inline styles) at the start of ``soup``."""
+    stylesheet: Tag = soup.new_tag("link")
+    stylesheet["rel"] = "stylesheet"
+    stylesheet["href"] = "https://cdn.simplecss.org/simple.css"
+
+    # Basic style tags for compat
+    compat_style: Tag = soup.new_tag("style")
+    compat_style.append(_STYLE_TAG_CONTENT)
+
+    head = soup.new_tag("head")
+    head.append(stylesheet)
+    head.append(compat_style)
+    soup.insert(0, head)
+
+    return soup
+
+
+def clean(soup: BeautifulSoup) -> BeautifulSoup:
+    """Rewrite ILIAS-specific markup in ``soup`` in place and return the same object."""
+    for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES):
+        block.name = "article"
+
+    # Demote h3 before turning h1 into h3, otherwise the new h3 tags would be demoted too
+    for block in soup.find_all("h3"):
+        block.name = "div"
+    for block in soup.find_all("h1"):
+        block.name = "h3"
+
+    for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"):
+        block.name = "h3"
+        block["class"] += ["accordion-head"]
+
+    # Drop filler paragraphs: ones with no children or only a lone comment.
+    # An empty paragraph must never reach children[0] (that raised IndexError).
+    for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"):
+        children = list(dummy.children)
+        if len(children) > 1:
+            continue
+        if not children or isinstance(children[0], Comment):
+            dummy.decompose()
+
+    for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
+        hrule_imposter.insert(0, soup.new_tag("hr"))
+    return soup
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index 6d063b6..d58e5c8 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -85,6 +85,31 @@ class IliasPage:
         log.explain("Page is a normal folder, searching for elements")
         return self._find_normal_entries()
 
+    def get_description(self) -> Optional[BeautifulSoup]:
+        """Re-parse the page's outermost description blocks into a new soup; None if none match."""
+        def is_interesting_class(name: str) -> bool:
+            return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
+
+        paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class)
+        if not paragraphs:
+            return None
+
+        # Extract bits and pieces into a string and parse it again.
+        # This ensures we don't miss anything and weird structures are resolved
+        # somewhat gracefully.
+        snippets: List[str] = []
+        for candidate in paragraphs:
+            # Nested matches are already contained in their outermost matching ancestor
+            nested = candidate.find_parent(class_=is_interesting_class)
+            # Ignore special listings (like folder groupings)
+            if nested or "ilc_section_Special" in candidate["class"]:
+                continue
+            snippets.append(str(candidate) + "\n")
+        raw_html = "".join(snippets)
+        raw_html = f"<body>\n{raw_html}\n</body>"
+
+        return BeautifulSoup(raw_html, "html.parser")
+
     def get_next_stage_element(self) -> Optional[IliasPageElement]:
         if self._is_ilias_opencast_embedding():
             return self.get_child_elements()[0]
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index ae9ebd4..bbed986 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -17,6 +17,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
+from .ilias_html_cleaner import clean, insert_base_markup
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
 
 TargetType = Union[str, int]
@@ -215,6 +216,8 @@ instance's greatest bottleneck.
         cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
 
         elements: List[IliasPageElement] = []
+        # A list as variable redefinitions are not propagated to outer scopes
+        description: List[BeautifulSoup] = []
 
         @_iorepeat(3, "crawling url")
         async def gather_elements() -> None:
@@ -233,9 +236,15 @@ instance's greatest bottleneck.
                 page = IliasPage(soup, url, None)
                 elements.extend(page.get_child_elements())
 
+                if description_string := page.get_description():
+                    description.append(description_string)
+
         # Fill up our task list with the found elements
         await gather_elements()
 
+        if description:
+            await self._download_description(PurePath("."), description[0])
+
         elements.sort(key=lambda e: e.id())
 
         tasks: List[Awaitable[None]] = []
@@ -265,6 +274,8 @@ instance's greatest bottleneck.
         cl: CrawlToken,
     ) -> None:
         elements: List[IliasPageElement] = []
+        # A list as variable redefinitions are not propagated to outer scopes
+        description: List[BeautifulSoup] = []
 
         @_iorepeat(3, "crawling folder")
         async def gather_elements() -> None:
@@ -285,10 +296,15 @@ instance's greatest bottleneck.
                         next_stage_url = None
 
                 elements.extend(page.get_child_elements())
+                if description_string := page.get_description():
+                    description.append(description_string)
 
         # Fill up our task list with the found elements
         await gather_elements()
 
+        if description:
+            await self._download_description(PurePath("."), description[0])
+
         elements.sort(key=lambda e: e.id())
 
         tasks: List[Awaitable[None]] = []
@@ -425,6 +441,19 @@ instance's greatest bottleneck.
 
         return self._download_booking(element, link_template_maybe, maybe_dl)
 
+    @anoncritical
+    @_iorepeat(1, "downloading description")
+    async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None:
+        """Write the cleaned-up ``description`` to ``Description.html`` below ``parent_path``."""
+        maybe_dl = await self.download(parent_path / "Description.html", redownload=Redownload.ALWAYS)
+        if not maybe_dl:
+            return
+
+        async with maybe_dl as (bar, sink):
+            cleaned = clean(insert_base_markup(description))
+            sink.file.write(cleaned.prettify().encode("utf-8"))
+            sink.done()
+
     @anoncritical
     @_iorepeat(3, "resolving booking")
     async def _download_booking(