From 6e4d423c812c52aff95249ad992dc4889d971208 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 13 Jun 2021 16:50:29 +0200 Subject: [PATCH] Crawl all video stages in one crawl bar This ensures folders are no longer renamed as a result of being crawled twice --- PFERD/crawl/ilias/kit_ilias_html.py | 6 ++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 9 +++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 384f0de..41f45e2 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -62,9 +62,11 @@ class IliasPage: log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() - def get_next_stage_url(self) -> Optional[str]: + def get_next_stage_element(self) -> Optional[IliasPageElement]: if self._is_ilias_opencast_embedding(): - return self.get_child_elements()[0].url + return self.get_child_elements()[0] + if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + return self._find_video_entries_paginated()[0] return None def _is_video_player(self) -> bool: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 6495da9..41c301c 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -248,13 +248,18 @@ instance's greatest bottleneck.
elements.clear() async with cl: next_stage_url: Optional[str] = url + current_parent = parent while next_stage_url: soup = await self._get_page(next_stage_url) log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") log.explain(f"URL: {next_stage_url}") - page = IliasPage(soup, next_stage_url, parent) - next_stage_url = page.get_next_stage_url() + page = IliasPage(soup, next_stage_url, current_parent) + if next_element := page.get_next_stage_element(): + current_parent = next_element + next_stage_url = next_element.url + else: + next_stage_url = None elements.extend(page.get_child_elements())