Mirror of https://github.com/Garmelon/PFERD.git, synced 2023-12-21 10:23:01 +01:00

Download page descriptions

commit 846c29aee1
parent a5015fe9b1
@@ -22,6 +22,9 @@ ambiguous situations.
 
 ## Unreleased
 
+### Added
+- Download of page descriptions
+
 ### Changed
 - Add `cpp` extension to default `link_regex` of IPD crawler
 - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option
PFERD/crawl/ilias/ilias_html_cleaner.py (new file)
@@ -0,0 +1,91 @@
from bs4 import BeautifulSoup, Comment, Tag

_STYLE_TAG_CONTENT = """
    .ilc_text_block_Information {
        background-color: #f5f7fa;
    }

    div.ilc_text_block_Standard {
        margin-bottom: 10px;
        margin-top: 10px;
    }

    span.ilc_text_inline_Strong {
        font-weight: bold;
    }

    .accordion-head {
        background-color: #f5f7fa;
        padding: 0.5rem 0;
    }

    h3 {
        margin-top: 0.5rem;
        margin-bottom: 1rem;
    }

    br.visible-break {
        margin-bottom: 1rem;
    }

    article {
        margin: 0.5rem 0;
    }

    body {
        padding: 1em;
        grid-template-columns: 1fr min(60rem, 90%) 1fr;
        line-height: 1.2;
    }
"""

_ARTICLE_WORTHY_CLASSES = [
    "ilc_text_block_Information",
    "ilc_section_Attention",
    "ilc_section_Link",
]


def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
    head = soup.new_tag("head")
    soup.insert(0, head)

    simplecss_link: Tag = soup.new_tag("link")
    # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
    simplecss_link["rel"] = "stylesheet"
    simplecss_link["href"] = "https://cdn.simplecss.org/simple.css"
    head.append(simplecss_link)

    # Basic style tags for compat
    style: Tag = soup.new_tag("style")
    style.append(_STYLE_TAG_CONTENT)
    head.append(style)

    return soup


def clean(soup: BeautifulSoup) -> BeautifulSoup:
    # Promote noteworthy ILIAS blocks to semantic <article> elements
    for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES):
        block.name = "article"

    # Shift headings down: existing h3s become divs so h1s can become h3s
    for block in soup.find_all("h3"):
        block.name = "div"

    for block in soup.find_all("h1"):
        block.name = "h3"

    for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"):
        block.name = "h3"
        block["class"] += ["accordion-head"]

    # Drop paragraphs that are empty or contain only an HTML comment
    for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"):
        children = list(dummy.children)
        if not children:
            dummy.decompose()
            continue
        if len(children) > 1:
            continue
        if isinstance(children[0], Comment):
            dummy.decompose()

    for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
        hrule_imposter.insert(0, soup.new_tag("hr"))

    return soup
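Taken together, the two helpers turn an ILIAS description fragment into a standalone HTML document: insert_base_markup adds a <head> with simple.css and compatibility styles, and clean normalizes the ILIAS-specific markup. A rough usage sketch; the input fragment is invented for illustration:

from bs4 import BeautifulSoup

from PFERD.crawl.ilias.ilias_html_cleaner import clean, insert_base_markup

# Made-up fragment in the shape ILIAS produces: an info block, an
# accordion heading cap and an empty standard paragraph.
fragment = """
<div class="ilc_text_block_Information">Exam dates are online.</div>
<div class="ilc_va_ihcap_VAccordIHeadCap">Organisation</div>
<div class="ilc_text_block_Standard ilc_Paragraph"></div>
"""

soup = BeautifulSoup(fragment, "html.parser")
soup = clean(insert_base_markup(soup))

# The info block is now an <article>, the accordion cap an
# <h3 class="accordion-head">, and the empty paragraph is gone.
print(soup.prettify())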
@@ -85,6 +85,31 @@ class IliasPage:
         log.explain("Page is a normal folder, searching for elements")
         return self._find_normal_entries()
 
+    def get_description(self) -> Optional[BeautifulSoup]:
+        def is_interesting_class(name: str) -> bool:
+            return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
+
+        paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class)
+        if not paragraphs:
+            return None
+
+        # Extract bits and pieces into a string and parse it again.
+        # This ensures we don't miss anything and weird structures are resolved
+        # somewhat gracefully.
+        raw_html = ""
+        for p in paragraphs:
+            if p.find_parent(class_=is_interesting_class):
+                continue
+
+            # Ignore special listings (like folder groupings)
+            if "ilc_section_Special" in p["class"]:
+                continue
+
+            raw_html += str(p) + "\n"
+        raw_html = f"<body>\n{raw_html}\n</body>"
+
+        return BeautifulSoup(raw_html, "html.parser")
+
     def get_next_stage_element(self) -> Optional[IliasPageElement]:
         if self._is_ilias_opencast_embedding():
             return self.get_child_elements()[0]
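The extract-and-reparse approach in get_description only copies the outermost interesting blocks: the find_parent check skips anything already contained in another match, so nested paragraphs are not duplicated in the reassembled description. A minimal standalone sketch of the same filtering; the HTML fragment is invented:

from bs4 import BeautifulSoup

INTERESTING = ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]


def is_interesting_class(name: str) -> bool:
    return name in INTERESTING


# Hypothetical page: a section that already contains a paragraph, plus
# a special listing (e.g. a folder grouping) that should be skipped.
soup = BeautifulSoup(
    '<div class="ilCOPageSection"><div class="ilc_Paragraph">Welcome!</div></div>'
    '<div class="ilc_Paragraph ilc_section_Special">folder grouping</div>',
    "html.parser",
)

raw_html = ""
for p in soup.find_all(class_=is_interesting_class):
    if p.find_parent(class_=is_interesting_class):
        continue  # nested inside another interesting block, already copied
    if "ilc_section_Special" in p["class"]:
        continue  # special listings are not part of the description
    raw_html += str(p) + "\n"

description = BeautifulSoup(f"<body>\n{raw_html}\n</body>", "html.parser")
print(description.prettify())  # only the outer ilCOPageSection survives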
@@ -17,6 +17,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
+from .ilias_html_cleaner import clean, insert_base_markup
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
 
 TargetType = Union[str, int]
@@ -215,6 +216,8 @@ instance's greatest bottleneck.
         cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
 
         elements: List[IliasPageElement] = []
+        # A list as variable redefinitions are not propagated to outer scopes
+        description: List[BeautifulSoup] = []
 
         @_iorepeat(3, "crawling url")
         async def gather_elements() -> None:
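The comment about variable redefinitions refers to Python's closure rules: rebinding a plain name inside the nested gather_elements coroutine would create a new local instead of updating the outer variable, while mutating a shared list is visible outside. A minimal standalone illustration; the names are invented:

def outer() -> None:
    description = None
    collected: list = []

    def inner() -> None:
        description = "value"      # rebinds a fresh local; outer name untouched
        collected.append("value")  # mutates the shared list; visible outside

    inner()
    print(description)  # None
    print(collected)    # ['value']

outer()

Declaring `nonlocal description` would be the alternative; the list keeps the nested function independent of the outer binding.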
@@ -233,9 +236,15 @@ instance's greatest bottleneck.
             page = IliasPage(soup, url, None)
             elements.extend(page.get_child_elements())
+
+            if description_string := page.get_description():
+                description.append(description_string)
 
         # Fill up our task list with the found elements
         await gather_elements()
 
+        if description:
+            await self._download_description(PurePath("."), description[0])
+
         elements.sort(key=lambda e: e.id())
 
         tasks: List[Awaitable[None]] = []
@@ -265,6 +274,8 @@ instance's greatest bottleneck.
         cl: CrawlToken,
     ) -> None:
         elements: List[IliasPageElement] = []
+        # A list as variable redefinitions are not propagated to outer scopes
+        description: List[BeautifulSoup] = []
 
         @_iorepeat(3, "crawling folder")
         async def gather_elements() -> None:
@@ -285,10 +296,15 @@ instance's greatest bottleneck.
                     next_stage_url = None
 
             elements.extend(page.get_child_elements())
+            if description_string := page.get_description():
+                description.append(description_string)
 
         # Fill up our task list with the found elements
         await gather_elements()
 
+        if description:
+            await self._download_description(PurePath("."), description[0])
+
         elements.sort(key=lambda e: e.id())
 
         tasks: List[Awaitable[None]] = []
@@ -425,6 +441,19 @@ instance's greatest bottleneck.
 
         return self._download_booking(element, link_template_maybe, maybe_dl)
 
+    @anoncritical
+    @_iorepeat(1, "downloading description")
+    async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None:
+        path = parent_path / "Description.html"
+        dl = await self.download(path, redownload=Redownload.ALWAYS)
+        if not dl:
+            return
+
+        async with dl as (bar, sink):
+            description = clean(insert_base_markup(description))
+            sink.file.write(description.prettify().encode("utf-8"))
+            sink.done()
+
     @anoncritical
     @_iorepeat(3, "resolving booking")
     async def _download_booking(
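The description is saved with redownload=Redownload.ALWAYS, presumably because a page description carries no timestamp or size the crawler could compare against, so the file is simply rewritten on every run. Stripped of PFERD's download-token and reporting machinery, the write path boils down to the following sketch; the write_description helper is hypothetical:

from pathlib import Path

from bs4 import BeautifulSoup

from PFERD.crawl.ilias.ilias_html_cleaner import clean, insert_base_markup


def write_description(parent_path: Path, description: BeautifulSoup) -> None:
    # Same steps as _download_description: add base markup, clean the
    # ILIAS-specific tags, then write prettified UTF-8 HTML to disk.
    description = clean(insert_base_markup(description))
    (parent_path / "Description.html").write_bytes(
        description.prettify().encode("utf-8")
    )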