diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index 9ea6b9f..2f0011e 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -146,11 +146,17 @@ class IliasPage:
if self._is_forum_page():
if "trows=800" in self._page_url:
return None
+ log.explain("Requesting *all* forum threads")
return self._get_show_max_forum_entries_per_page_url()
if self._is_ilias_opencast_embedding():
+ log.explain("Unwrapping opencast embedding")
return self.get_child_elements()[0]
if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED:
+ log.explain("Unwrapping video pagination")
return self._find_video_entries_paginated()[0]
+ if self._contains_collapsed_future_meetings():
+ log.explain("Requesting *all* future meetings")
+ return self._uncollapse_future_meetings_url()
return None
def _is_forum_page(self) -> bool:
@@ -203,6 +209,16 @@ class IliasPage:
return False
return "target=copa_" in link.get("value")
+ def _contains_collapsed_future_meetings(self) -> bool:  # True iff the page has a link whose href contains "crs_next_sess=1"
+ return self._uncollapse_future_meetings_url() is not None  # delegates the lookup; only checks whether an element was produced
+
+ def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]:  # NOTE(review): despite the `_url` suffix this returns an IliasPageElement (or None), not a string
+ element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x})  # first <a> whose href contains the marker; `x and` makes a missing href falsy
+ if not element:  # no matching link on this page
+ return None
+ link = self._abs_url_from_link(element)  # presumably resolves the href to an absolute URL - helper not visible here, confirm
+ return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings")  # wrapped as a FOLDER element with a fixed display name
+
def _player_to_video(self) -> List[IliasPageElement]:
# Fetch the actual video page. This is a small wrapper page initializing a javscript
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
@@ -793,6 +809,10 @@ class IliasPage:
if img_tag is None:
img_tag = found_parent.select_one("img.icon")
+ if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}):
+ log.explain("Found session expansion button, skipping it as it has no content")
+ return None
+
if img_tag is None:
_unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index 10a270f..bc0d816 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -234,19 +234,28 @@ instance's greatest bottleneck.
async def gather_elements() -> None:
elements.clear()
async with cl:
- soup = await self._get_page(url)
-
- if expected_id is not None:
- perma_link_element: Tag = soup.find(id="current_perma_link")
- if not perma_link_element or "crs_" not in perma_link_element.get("value"):
- raise CrawlError("Invalid course id? Didn't find anything looking like a course")
+ next_stage_url: Optional[str] = url
+ current_parent = None
# Duplicated code, but the root page is special - we want to avoid fetching it twice!
- log.explain_topic("Parsing root HTML page")
- log.explain(f"URL: {url}")
- page = IliasPage(soup, url, None)
- elements.extend(page.get_child_elements())
+ while next_stage_url:
+ soup = await self._get_page(next_stage_url)
+ if current_parent is None and expected_id is not None:
+ perma_link_element: Tag = soup.find(id="current_perma_link")
+ if not perma_link_element or "crs_" not in perma_link_element.get("value"):
+ raise CrawlError("Invalid course id? Didn't find anything looking like a course")
+
+ log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
+ log.explain(f"URL: {next_stage_url}")
+ page = IliasPage(soup, next_stage_url, current_parent)
+ if next_element := page.get_next_stage_element():
+ current_parent = next_element
+ next_stage_url = next_element.url
+ else:
+ next_stage_url = None
+
+ elements.extend(page.get_child_elements())
if description_string := page.get_description():
description.append(description_string)