From f5c4e828160cf408fcaffd1300ed5920976a8580 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 2 Nov 2024 22:17:26 +0100 Subject: [PATCH] Delay ilias loop detection after transform This allows users to filter out duplicated elements and suppress the warning. --- CHANGELOG.md | 2 ++ PFERD/crawl/ilias/ilias_web_crawler.py | 36 +++++++++++++++++--------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d9431bc..3926f7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ ambiguous situations. ### Changed - Remove videos from description pages +- Perform ILIAS cycle detection after processing the transform to allow + ignoring duplicated elements ### Fixed - Personal desktop/dashboard/favorites crawling diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index 14dde89..941b265 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -197,20 +197,23 @@ instance's greatest bottleneck. async def _handle_ilias_page( self, url: str, - parent: Optional[IliasPageElement], + current_element: Optional[IliasPageElement], path: PurePath, expected_course_id: Optional[int] = None, ) -> Optional[Coroutine[Any, Any, None]]: maybe_cl = await self.crawl(path) if not maybe_cl: return None - return self._crawl_ilias_page(url, parent, maybe_cl, expected_course_id) + if current_element: + self._ensure_not_seen(current_element, path) + + return self._crawl_ilias_page(url, current_element, maybe_cl, expected_course_id) @anoncritical async def _crawl_ilias_page( self, url: str, - parent: Optional[IliasPageElement], + current_element: Optional[IliasPageElement], cl: CrawlToken, expected_course_id: Optional[int] = None, ) -> None: @@ -223,7 +226,7 @@ instance's greatest bottleneck. elements.clear() async with cl: next_stage_url: Optional[str] = url - current_parent = parent + current_parent = current_element while next_stage_url: soup = await self._get_page(next_stage_url) @@ -276,14 +279,6 @@ instance's greatest bottleneck. parent_path: PurePath, element: IliasPageElement, ) -> Optional[Coroutine[Any, Any, None]]: - if element.url in self._visited_urls: - raise CrawlWarning( - f"Found second path to element {element.name!r} at {element.url!r}. " - + f"First path: {fmt_path(self._visited_urls[element.url])}. " - + f"Second path: {fmt_path(parent_path)}." - ) - self._visited_urls[element.url] = parent_path - # element.name might contain `/` if the crawler created nested elements, # so we can not sanitize it here. We trust in the output dir to thwart worst-case # directory escape attacks. @@ -424,6 +419,8 @@ instance's greatest bottleneck. if not maybe_dl: return None + self._ensure_not_seen(element, element_path) + return self._download_booking(element, link_template_maybe, maybe_dl) @anoncritical @@ -498,6 +495,8 @@ instance's greatest bottleneck. if not maybe_dl: return None + self._ensure_not_seen(element, element_path) + # If we have every file from the cached mapping already, we can ignore this and bail if self._all_opencast_videos_locally_present(element, maybe_dl.path): # Mark all existing videos as known to ensure they do not get deleted during cleanup. @@ -596,6 +595,8 @@ instance's greatest bottleneck. maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: return None + self._ensure_not_seen(element, element_path) + return self._download_file(element, maybe_dl, is_video) @_iorepeat(3, "downloading file") @@ -731,6 +732,8 @@ instance's greatest bottleneck. maybe_cl = await self.crawl(element_path) if not maybe_cl: return None + self._ensure_not_seen(element, element_path) + return self._crawl_learning_module(element, maybe_cl) @_iorepeat(3, "crawling learning module") @@ -853,6 +856,15 @@ instance's greatest bottleneck. elem.attrs["src"] = "https:" + elem.attrs["src"] return tag + def _ensure_not_seen(self, element: IliasPageElement, parent_path: PurePath) -> None: + if element.url in self._visited_urls: + raise CrawlWarning( + f"Found second path to element {element.name!r} at {element.url!r}. " + + f"First path: {fmt_path(self._visited_urls[element.url])}. " + + f"Second path: {fmt_path(parent_path)}." + ) + self._visited_urls[element.url] = parent_path + async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: auth_id = await self._current_auth_id() async with self.session.get(url) as request: