From 846c29aee1867f7f0b7efae802af47fee77a3ec6 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Wed, 11 May 2022 21:16:09 +0200
Subject: [PATCH] Download page descriptions

---
 CHANGELOG.md                               |   3 +
 PFERD/crawl/ilias/ilias_html_cleaner.py    | 107 +++++++++++++++++++++
 PFERD/crawl/ilias/kit_ilias_html.py        |  31 ++++++
 PFERD/crawl/ilias/kit_ilias_web_crawler.py |  30 ++++++
 4 files changed, 171 insertions(+)
 create mode 100644 PFERD/crawl/ilias/ilias_html_cleaner.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e2d3840..b7cad13 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,9 @@ ambiguous situations.
 
 ## Unreleased
 
+### Added
+- Download of page descriptions
+
 ### Changed
 - Add `cpp` extension to default `link_regex` of IPD crawler
 - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option
diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py
new file mode 100644
index 0000000..5952309
--- /dev/null
+++ b/PFERD/crawl/ilias/ilias_html_cleaner.py
@@ -0,0 +1,107 @@
+from bs4 import BeautifulSoup, Comment, Tag
+
+# Inline CSS that insert_base_markup() embeds in the generated <style> tag.
+_STYLE_TAG_CONTENT = """
+    .ilc_text_block_Information {
+      background-color: #f5f7fa;
+    }
+    div.ilc_text_block_Standard {
+      margin-bottom: 10px;
+      margin-top: 10px;
+    }
+    span.ilc_text_inline_Strong {
+      font-weight: bold;
+    }
+
+    .accordion-head {
+      background-color: #f5f7fa;
+      padding: 0.5rem 0;
+    }
+
+    h3 {
+      margin-top: 0.5rem;
+      margin-bottom: 1rem;
+    }
+
+    br.visible-break {
+      margin-bottom: 1rem;
+    }
+
+    article {
+      margin: 0.5rem 0;
+    }
+
+    body {
+      padding: 1em;
+      grid-template-columns: 1fr min(60rem, 90%) 1fr;
+      line-height: 1.2;
+    }
+"""
+
+# Elements carrying one of these classes are renamed to <article> by clean().
+_ARTICLE_WORTHY_CLASSES = [
+    "ilc_text_block_Information",
+    "ilc_section_Attention",
+    "ilc_section_Link",
+]
+
+
+def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
+    """
+    Prepend a <head> that links simple.css and embeds the compat styles.
+
+    The soup is modified in place and returned.
+    """
+    head = soup.new_tag("head")
+    soup.insert(0, head)
+
+    simplecss_link: Tag = soup.new_tag("link")
+    # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
+    simplecss_link["rel"] = "stylesheet"
+    simplecss_link["href"] = "https://cdn.simplecss.org/simple.css"
+    head.append(simplecss_link)
+
+    # Basic style tags for compat
+    style: Tag = soup.new_tag("style")
+    style.append(_STYLE_TAG_CONTENT)
+    head.append(style)
+
+    return soup
+
+
+def clean(soup: BeautifulSoup) -> BeautifulSoup:
+    """
+    Rewrite the markup in place and return the soup.
+
+    Tags are renamed, paragraphs that are empty or contain only a comment are
+    removed and separator sections get a leading <hr>.
+    """
+    for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES):
+        block.name = "article"
+
+    # Demote the original h3s first so that the h3s created below are kept.
+    for block in soup.find_all("h3"):
+        block.name = "div"
+
+    for block in soup.find_all("h1"):
+        block.name = "h3"
+
+    for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"):
+        block.name = "h3"
+        block["class"] += ["accordion-head"]
+
+    for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"):
+        children = list(dummy.children)
+        if not children:
+            dummy.decompose()
+            # Nothing left to inspect; children[0] below would raise IndexError
+            continue
+        if len(children) > 1:
+            continue
+        if isinstance(children[0], Comment):
+            dummy.decompose()
+
+    for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
+        hrule_imposter.insert(0, soup.new_tag("hr"))
+
+    return soup
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index 6d063b6..d58e5c8 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -85,6 +85,37 @@ class IliasPage:
         log.explain("Page is a normal folder, searching for elements")
         return self._find_normal_entries()
 
+    def get_description(self) -> Optional[BeautifulSoup]:
+        """
+        Collect the page's description blocks into a freshly parsed soup.
+
+        Returns None if no element with an interesting class exists.
+        """
+        def is_interesting_class(name: str) -> bool:
+            return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
+
+        paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class)
+        if not paragraphs:
+            return None
+
+        # Extract bits and pieces into a string and parse it again.
+        # This ensures we don't miss anything and weird structures are resolved
+        # somewhat gracefully.
+        raw_html = ""
+        for p in paragraphs:
+            # Nested matches are already part of their ancestor's markup
+            if p.find_parent(class_=is_interesting_class):
+                continue
+
+            # Ignore special listings (like folder groupings)
+            if "ilc_section_Special" in p["class"]:
+                continue
+
+            raw_html += str(p) + "\n"
+        raw_html = f"<body>\n{raw_html}\n</body>"
+
+        return BeautifulSoup(raw_html, "html.parser")
+
     def get_next_stage_element(self) -> Optional[IliasPageElement]:
         if self._is_ilias_opencast_embedding():
             return self.get_child_elements()[0]
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index ae9ebd4..bbed986 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -17,6 +17,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
+from .ilias_html_cleaner import clean, insert_base_markup
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
 
 TargetType = Union[str, int]
@@ -215,6 +216,8 @@ instance's greatest bottleneck.
         cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
 
         elements: List[IliasPageElement] = []
+        # A list as variable redefinitions are not propagated to outer scopes
+        description: List[BeautifulSoup] = []
 
         @_iorepeat(3, "crawling url")
         async def gather_elements() -> None:
@@ -233,9 +236,15 @@ instance's greatest bottleneck.
                 page = IliasPage(soup, url, None)
                 elements.extend(page.get_child_elements())
 
+                if description_string := page.get_description():
+                    description.append(description_string)
+
         # Fill up our task list with the found elements
         await gather_elements()
 
+        if description:
+            await self._download_description(PurePath("."), description[0])
+
         elements.sort(key=lambda e: e.id())
 
         tasks: List[Awaitable[None]] = []
@@ -265,6 +274,8 @@ instance's greatest bottleneck.
         cl: CrawlToken,
     ) -> None:
         elements: List[IliasPageElement] = []
+        # A list as variable redefinitions are not propagated to outer scopes
+        description: List[BeautifulSoup] = []
 
         @_iorepeat(3, "crawling folder")
         async def gather_elements() -> None:
@@ -285,10 +296,15 @@ instance's greatest bottleneck.
                         next_stage_url = None
 
                 elements.extend(page.get_child_elements())
+                if description_string := page.get_description():
+                    description.append(description_string)
 
         # Fill up our task list with the found elements
         await gather_elements()
 
+        if description:
+            await self._download_description(PurePath("."), description[0])
+
         elements.sort(key=lambda e: e.id())
 
         tasks: List[Awaitable[None]] = []
@@ -425,6 +441,20 @@ instance's greatest bottleneck.
 
         return self._download_booking(element, link_template_maybe, maybe_dl)
 
+    @anoncritical
+    @_iorepeat(1, "downloading description")
+    async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None:
+        """Write the cleaned-up description as "Description.html" below parent_path."""
+        path = parent_path / "Description.html"
+        dl = await self.download(path, redownload=Redownload.ALWAYS)
+        if not dl:
+            return
+
+        async with dl as (bar, sink):
+            description = clean(insert_base_markup(description))
+            sink.file.write(description.prettify().encode("utf-8"))
+            sink.done()
+
     @anoncritical
     @_iorepeat(3, "resolving booking")
     async def _download_booking(