Delay ILIAS loop detection until after transform

This allows users to filter out duplicated elements and suppress the
warning.
This commit is contained in:
I-Al-Istannen
2024-11-02 22:17:26 +01:00
parent f5273f7ca0
commit f5c4e82816
2 changed files with 26 additions and 12 deletions

View File

@@ -27,6 +27,8 @@ ambiguous situations.
### Changed ### Changed
- Remove videos from description pages - Remove videos from description pages
- Perform ILIAS cycle detection after processing the transform to allow
ignoring duplicated elements
### Fixed ### Fixed
- Personal desktop/dashboard/favorites crawling - Personal desktop/dashboard/favorites crawling

View File

@@ -197,20 +197,23 @@ instance's greatest bottleneck.
async def _handle_ilias_page( async def _handle_ilias_page(
self, self,
url: str, url: str,
parent: Optional[IliasPageElement], current_element: Optional[IliasPageElement],
path: PurePath, path: PurePath,
expected_course_id: Optional[int] = None, expected_course_id: Optional[int] = None,
) -> Optional[Coroutine[Any, Any, None]]: ) -> Optional[Coroutine[Any, Any, None]]:
maybe_cl = await self.crawl(path) maybe_cl = await self.crawl(path)
if not maybe_cl: if not maybe_cl:
return None return None
return self._crawl_ilias_page(url, parent, maybe_cl, expected_course_id) if current_element:
self._ensure_not_seen(current_element, path)
return self._crawl_ilias_page(url, current_element, maybe_cl, expected_course_id)
@anoncritical @anoncritical
async def _crawl_ilias_page( async def _crawl_ilias_page(
self, self,
url: str, url: str,
parent: Optional[IliasPageElement], current_element: Optional[IliasPageElement],
cl: CrawlToken, cl: CrawlToken,
expected_course_id: Optional[int] = None, expected_course_id: Optional[int] = None,
) -> None: ) -> None:
@@ -223,7 +226,7 @@ instance's greatest bottleneck.
elements.clear() elements.clear()
async with cl: async with cl:
next_stage_url: Optional[str] = url next_stage_url: Optional[str] = url
current_parent = parent current_parent = current_element
while next_stage_url: while next_stage_url:
soup = await self._get_page(next_stage_url) soup = await self._get_page(next_stage_url)
@@ -276,14 +279,6 @@ instance's greatest bottleneck.
parent_path: PurePath, parent_path: PurePath,
element: IliasPageElement, element: IliasPageElement,
) -> Optional[Coroutine[Any, Any, None]]: ) -> Optional[Coroutine[Any, Any, None]]:
if element.url in self._visited_urls:
raise CrawlWarning(
f"Found second path to element {element.name!r} at {element.url!r}. "
+ f"First path: {fmt_path(self._visited_urls[element.url])}. "
+ f"Second path: {fmt_path(parent_path)}."
)
self._visited_urls[element.url] = parent_path
# element.name might contain `/` if the crawler created nested elements, # element.name might contain `/` if the crawler created nested elements,
# so we can not sanitize it here. We trust in the output dir to thwart worst-case # so we can not sanitize it here. We trust in the output dir to thwart worst-case
# directory escape attacks. # directory escape attacks.
@@ -424,6 +419,8 @@ instance's greatest bottleneck.
if not maybe_dl: if not maybe_dl:
return None return None
self._ensure_not_seen(element, element_path)
return self._download_booking(element, link_template_maybe, maybe_dl) return self._download_booking(element, link_template_maybe, maybe_dl)
@anoncritical @anoncritical
@@ -498,6 +495,8 @@ instance's greatest bottleneck.
if not maybe_dl: if not maybe_dl:
return None return None
self._ensure_not_seen(element, element_path)
# If we have every file from the cached mapping already, we can ignore this and bail # If we have every file from the cached mapping already, we can ignore this and bail
if self._all_opencast_videos_locally_present(element, maybe_dl.path): if self._all_opencast_videos_locally_present(element, maybe_dl.path):
# Mark all existing videos as known to ensure they do not get deleted during cleanup. # Mark all existing videos as known to ensure they do not get deleted during cleanup.
@@ -596,6 +595,8 @@ instance's greatest bottleneck.
maybe_dl = await self.download(element_path, mtime=element.mtime) maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl: if not maybe_dl:
return None return None
self._ensure_not_seen(element, element_path)
return self._download_file(element, maybe_dl, is_video) return self._download_file(element, maybe_dl, is_video)
@_iorepeat(3, "downloading file") @_iorepeat(3, "downloading file")
@@ -731,6 +732,8 @@ instance's greatest bottleneck.
maybe_cl = await self.crawl(element_path) maybe_cl = await self.crawl(element_path)
if not maybe_cl: if not maybe_cl:
return None return None
self._ensure_not_seen(element, element_path)
return self._crawl_learning_module(element, maybe_cl) return self._crawl_learning_module(element, maybe_cl)
@_iorepeat(3, "crawling learning module") @_iorepeat(3, "crawling learning module")
@@ -853,6 +856,15 @@ instance's greatest bottleneck.
elem.attrs["src"] = "https:" + elem.attrs["src"] elem.attrs["src"] = "https:" + elem.attrs["src"]
return tag return tag
def _ensure_not_seen(self, element: IliasPageElement, parent_path: PurePath) -> None:
if element.url in self._visited_urls:
raise CrawlWarning(
f"Found second path to element {element.name!r} at {element.url!r}. "
+ f"First path: {fmt_path(self._visited_urls[element.url])}. "
+ f"Second path: {fmt_path(parent_path)}."
)
self._visited_urls[element.url] = parent_path
async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
auth_id = await self._current_auth_id() auth_id = await self._current_auth_id()
async with self.session.get(url) as request: async with self.session.get(url) as request: