Unwrap future meetings when ILIAS hides them behind pagination

2025-07-20 01:42:37 +02:00 · 2022-10-24 14:33:58 +02:00
parent fb4631ba18
commit 5fdd40204b
2 changed files with 39 additions and 10 deletions
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -146,11 +146,17 @@ class IliasPage:
        if self._is_forum_page():
            if "trows=800" in self._page_url:
                return None
+            log.explain("Requesting *all* forum threads")
            return self._get_show_max_forum_entries_per_page_url()
        if self._is_ilias_opencast_embedding():
+            log.explain("Unwrapping opencast embedding")
            return self.get_child_elements()[0]
        if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED:
+            log.explain("Unwrapping video pagination")
            return self._find_video_entries_paginated()[0]
+        if self._contains_collapsed_future_meetings():
+            log.explain("Requesting *all* future meetings")
+            return self._uncollapse_future_meetings_url()
        return None

    def _is_forum_page(self) -> bool:
@@ -203,6 +209,16 @@ class IliasPage:
            return False
        return "target=copa_" in link.get("value")

+    def _contains_collapsed_future_meetings(self) -> bool:
+        return self._uncollapse_future_meetings_url() is not None
+
+    def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]:
+        element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x})
+        if not element:
+            return None
+        link = self._abs_url_from_link(element)
+        return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings")
+
    def _player_to_video(self) -> List[IliasPageElement]:
        # Fetch the actual video page. This is a small wrapper page initializing a javscript
        # player. Sadly we can not execute that JS. The actual video stream url is nowhere
@@ -793,6 +809,10 @@ class IliasPage:
        if img_tag is None:
            img_tag = found_parent.select_one("img.icon")

+        if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}):
+            log.explain("Found session expansion button, skipping it as it has no content")
+            return None
+
        if img_tag is None:
            _unexpected_html_warning()
            log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -234,19 +234,28 @@ instance's greatest bottleneck.
        async def gather_elements() -> None:
            elements.clear()
            async with cl:
-                soup = await self._get_page(url)
-
-                if expected_id is not None:
-                    perma_link_element: Tag = soup.find(id="current_perma_link")
-                    if not perma_link_element or "crs_" not in perma_link_element.get("value"):
-                        raise CrawlError("Invalid course id? Didn't find anything looking like a course")
+                next_stage_url: Optional[str] = url
+                current_parent = None

                # Duplicated code, but the root page is special - we want to avoid fetching it twice!
-                log.explain_topic("Parsing root HTML page")
-                log.explain(f"URL: {url}")
-                page = IliasPage(soup, url, None)
-                elements.extend(page.get_child_elements())
+                while next_stage_url:
+                    soup = await self._get_page(next_stage_url)

+                    if current_parent is None and expected_id is not None:
+                        perma_link_element: Tag = soup.find(id="current_perma_link")
+                        if not perma_link_element or "crs_" not in perma_link_element.get("value"):
+                            raise CrawlError("Invalid course id? Didn't find anything looking like a course")
+
+                    log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
+                    log.explain(f"URL: {next_stage_url}")
+                    page = IliasPage(soup, next_stage_url, current_parent)
+                    if next_element := page.get_next_stage_element():
+                        current_parent = next_element
+                        next_stage_url = next_element.url
+                    else:
+                        next_stage_url = None
+
+                elements.extend(page.get_child_elements())
                if description_string := page.get_description():
                    description.append(description_string)