diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index 4bc3161..afb7005 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -62,15 +62,17 @@ class IliasPage:
         log.explain("Page is a normal folder, searching for elements")
         return self._find_normal_entries()
 
+    def get_next_stage_url(self) -> Optional[str]:
+        if self._is_ilias_opencast_embedding():
+            return self.get_child_elements()[0].url
+        return None
+
     def _is_video_player(self) -> bool:
         return "paella_config_file" in str(self._soup)
 
     def _is_video_listing(self) -> bool:
-        # ILIAS fluff around it
-        if self._soup.find(id="headerimage"):
-            element: Tag = self._soup.find(id="headerimage")
-            if "opencast" in element.attrs["src"].lower():
-                return True
+        if self._is_ilias_opencast_embedding():
+            return True
 
         # Raw listing without ILIAS fluff
         video_element_table: Tag = self._soup.find(
@@ -78,6 +80,14 @@ class IliasPage:
         )
         return video_element_table is not None
 
+    def _is_ilias_opencast_embedding(self) -> bool:
+        # ILIAS fluff around the real opencast html
+        if self._soup.find(id="headerimage"):
+            element: Tag = self._soup.find(id="headerimage")
+            if "opencast" in element.attrs["src"].lower():
+                return True
+        return False
+
     def _is_exercise_file(self) -> bool:
         # we know it from before
         if self._page_type == IliasElementType.EXERCISE:
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index 0b20d1c..12a6e79 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -242,10 +242,15 @@ class KitIliasWebCrawler(HttpCrawler):
         async def gather_elements() -> None:
             elements.clear()
             async with cl:
-                soup = await self._get_page(url)
+                next_stage_url: Optional[str] = url
                 log.explain_topic(f"Parsing HTML page for {fmt_path(path)}")
-                log.explain(f"URL: {url}")
-                page = IliasPage(soup, url, parent)
+
+                while next_stage_url:
+                    # Log and parse against the URL fetched in this stage, not the initial one
+                    soup = await self._get_page(next_stage_url)
+                    log.explain(f"URL: {next_stage_url}")
+                    page = IliasPage(soup, next_stage_url, parent)
+                    next_stage_url = page.get_next_stage_url()
 
                 elements.extend(page.get_child_elements())
 