diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 9ea6b9f..2f0011e 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -146,11 +146,17 @@ class IliasPage: if self._is_forum_page(): if "trows=800" in self._page_url: return None + log.explain("Requesting *all* forum threads") return self._get_show_max_forum_entries_per_page_url() if self._is_ilias_opencast_embedding(): + log.explain("Unwrapping opencast embedding") return self.get_child_elements()[0] if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + log.explain("Unwrapping video pagination") return self._find_video_entries_paginated()[0] + if self._contains_collapsed_future_meetings(): + log.explain("Requesting *all* future meetings") + return self._uncollapse_future_meetings_url() return None def _is_forum_page(self) -> bool: @@ -203,6 +209,16 @@ class IliasPage: return False return "target=copa_" in link.get("value") + def _contains_collapsed_future_meetings(self) -> bool: + return self._uncollapse_future_meetings_url() is not None + + def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: + element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x}) + if not element: + return None + link = self._abs_url_from_link(element) + return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. The actual video stream url is nowhere @@ -793,6 +809,10 @@ class IliasPage: if img_tag is None: img_tag = found_parent.select_one("img.icon") + if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}): + log.explain("Found session expansion button, skipping it as it has no content") + return None + if img_tag is None: _unexpected_html_warning() log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 10a270f..bc0d816 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -234,19 +234,28 @@ instance's greatest bottleneck. async def gather_elements() -> None: elements.clear() async with cl: - soup = await self._get_page(url) - - if expected_id is not None: - perma_link_element: Tag = soup.find(id="current_perma_link") - if not perma_link_element or "crs_" not in perma_link_element.get("value"): - raise CrawlError("Invalid course id? Didn't find anything looking like a course") + next_stage_url: Optional[str] = url + current_parent = None # Duplicated code, but the root page is special - we want to avoid fetching it twice! - log.explain_topic("Parsing root HTML page") - log.explain(f"URL: {url}") - page = IliasPage(soup, url, None) - elements.extend(page.get_child_elements()) + while next_stage_url: + soup = await self._get_page(next_stage_url) + if current_parent is None and expected_id is not None: + perma_link_element: Tag = soup.find(id="current_perma_link") + if not perma_link_element or "crs_" not in perma_link_element.get("value"): + raise CrawlError("Invalid course id? Didn't find anything looking like a course") + + log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {next_stage_url}") + page = IliasPage(soup, next_stage_url, current_parent) + if next_element := page.get_next_stage_element(): + current_parent = next_element + next_stage_url = next_element.url + else: + next_stage_url = None + + elements.extend(page.get_child_elements()) if description_string := page.get_description(): description.append(description_string)