diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index 4bc3161..afb7005 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -62,15 +62,19 @@ class IliasPage:
         log.explain("Page is a normal folder, searching for elements")
         return self._find_normal_entries()
+    def get_next_stage_url(self) -> Optional[str]:
+        # If this page only wraps the real content (ILIAS chrome around an
+        # Opencast listing), return the URL of the page to fetch next; None
+        # means this page already is the final stage.
+        if self._is_ilias_opencast_embedding():
+            return self.get_child_elements()[0].url
+        return None
+
     def _is_video_player(self) -> bool:
         return "paella_config_file" in str(self._soup)
     def _is_video_listing(self) -> bool:
-        # ILIAS fluff around it
-        if self._soup.find(id="headerimage"):
-            element: Tag = self._soup.find(id="headerimage")
-            if "opencast" in element.attrs["src"].lower():
-                return True
+        if self._is_ilias_opencast_embedding():
+            return True
         # Raw listing without ILIAS fluff
         video_element_table: Tag = self._soup.find(
@@ -78,6 +82,17 @@ class IliasPage:
         )
         return video_element_table is not None
+    def _is_ilias_opencast_embedding(self) -> bool:
+        # Heuristic: an Opencast listing embedded in ILIAS shows a banner
+        # image (id="headerimage") whose src points at an opencast host —
+        # TODO confirm this holds for all ILIAS themes.
+        # ILIAS fluff around the real opencast html
+        if self._soup.find(id="headerimage"):
+            element: Tag = self._soup.find(id="headerimage")
+            if "opencast" in element.attrs["src"].lower():
+                return True
+        return False
+
     def _is_exercise_file(self) -> bool:
         # we know it from before
         if self._page_type == IliasElementType.EXERCISE:
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index 0b20d1c..12a6e79 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -242,10 +242,16 @@ class KitIliasWebCrawler(HttpCrawler):
         async def gather_elements() -> None:
             elements.clear()
             async with cl:
-                soup = await self._get_page(url)
+                next_stage_url: Optional[str] = url
                 log.explain_topic(f"Parsing HTML page for {fmt_path(path)}")
-                log.explain(f"URL: {url}")
-                page = IliasPage(soup, url, parent)
+
+                # Follow intermediate wrapper pages (e.g. ILIAS Opencast
+                # embeddings) until the page with the real elements is reached.
+                while next_stage_url:
+                    soup = await self._get_page(next_stage_url)
+                    log.explain(f"URL: {next_stage_url}")
+                    page = IliasPage(soup, next_stage_url, parent)
+                    next_stage_url = page.get_next_stage_url()
                 elements.extend(page.get_child_elements())