mirror of https://github.com/Garmelon/PFERD.git

Delay ilias loop detection after transform
This allows users to filter out duplicated elements and suppress the warning.
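In practice this means the duplicate warning can now be silenced from the config: exclude (or rename) one of the duplicated paths via a transform, and the excluded element never reaches the cycle detector. A sketch of such an exclusion rule, assuming PFERD's transform syntax where a target of "!" drops the matched path (the section name and paths below are invented for illustration):

    [crawl:my-course]
    type = kit-ilias-web
    target = MyCourse
    transform =
      Duplicated Folder --> !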
@@ -27,6 +27,8 @@ ambiguous situations.

 ### Changed
 - Remove videos from description pages
+- Perform ILIAS cycle detection after processing the transform to allow
+  ignoring duplicated elements

 ### Fixed
 - Personal desktop/dashboard/favorites crawling
@@ -197,20 +197,23 @@ instance's greatest bottleneck.
     async def _handle_ilias_page(
         self,
         url: str,
-        parent: Optional[IliasPageElement],
+        current_element: Optional[IliasPageElement],
         path: PurePath,
         expected_course_id: Optional[int] = None,
     ) -> Optional[Coroutine[Any, Any, None]]:
         maybe_cl = await self.crawl(path)
         if not maybe_cl:
             return None
-        return self._crawl_ilias_page(url, parent, maybe_cl, expected_course_id)
+        if current_element:
+            self._ensure_not_seen(current_element, path)
+
+        return self._crawl_ilias_page(url, current_element, maybe_cl, expected_course_id)

     @anoncritical
     async def _crawl_ilias_page(
         self,
         url: str,
-        parent: Optional[IliasPageElement],
+        current_element: Optional[IliasPageElement],
         cl: CrawlToken,
         expected_course_id: Optional[int] = None,
     ) -> None:
@@ -223,7 +226,7 @@ instance's greatest bottleneck.
             elements.clear()
             async with cl:
                 next_stage_url: Optional[str] = url
-                current_parent = parent
+                current_parent = current_element

                 while next_stage_url:
                     soup = await self._get_page(next_stage_url)
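Taken together, the hunks above move the duplicate check behind self.crawl(path), which applies the user's transforms and hands out no crawl token for excluded paths; only elements that survive the transform are registered for loop detection. A minimal, self-contained model of that ordering (MiniCrawler, handle_page, and the excluded set are invented stand-ins, not PFERD's actual API):

    from pathlib import PurePath
    from typing import Dict, Optional, Set

    class CrawlWarning(Exception):
        pass

    class MiniCrawler:
        def __init__(self, excluded: Set[str]) -> None:
            self._excluded = excluded
            self._visited_urls: Dict[str, PurePath] = {}

        def crawl(self, path: PurePath) -> Optional[PurePath]:
            # Stand-in for self.crawl(path): transforms run here and
            # produce no token for paths the user filtered out.
            return None if str(path) in self._excluded else path

        def handle_page(self, url: str, path: PurePath) -> None:
            if self.crawl(path) is None:
                return  # excluded by transform, never reaches loop detection
            if url in self._visited_urls:
                raise CrawlWarning(f"Found second path to element at {url!r}")
            self._visited_urls[url] = path

    crawler = MiniCrawler(excluded={"Course/Duplicated"})
    crawler.handle_page("goto.php?target=fold_1", PurePath("Course/Original"))
    # The second path to the same element is filtered out by the
    # transform, so no CrawlWarning is raised:
    crawler.handle_page("goto.php?target=fold_1", PurePath("Course/Duplicated"))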
@@ -276,14 +279,6 @@ instance's greatest bottleneck.
         parent_path: PurePath,
         element: IliasPageElement,
     ) -> Optional[Coroutine[Any, Any, None]]:
-        if element.url in self._visited_urls:
-            raise CrawlWarning(
-                f"Found second path to element {element.name!r} at {element.url!r}. "
-                + f"First path: {fmt_path(self._visited_urls[element.url])}. "
-                + f"Second path: {fmt_path(parent_path)}."
-            )
-        self._visited_urls[element.url] = parent_path
-
         # element.name might contain `/` if the crawler created nested elements,
         # so we can not sanitize it here. We trust in the output dir to thwart worst-case
         # directory escape attacks.
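Nothing is lost in this removal: the same check and warning message reappear below as the shared _ensure_not_seen helper, which each handler now calls only after its crawl or download token has been granted, i.e. after the transform has had its say.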
@@ -424,6 +419,8 @@ instance's greatest bottleneck.
         if not maybe_dl:
             return None

+        self._ensure_not_seen(element, element_path)
+
         return self._download_booking(element, link_template_maybe, maybe_dl)

     @anoncritical
@@ -498,6 +495,8 @@ instance's greatest bottleneck.
         if not maybe_dl:
             return None

+        self._ensure_not_seen(element, element_path)
+
         # If we have every file from the cached mapping already, we can ignore this and bail
         if self._all_opencast_videos_locally_present(element, maybe_dl.path):
             # Mark all existing videos as known to ensure they do not get deleted during cleanup.
@@ -596,6 +595,8 @@ instance's greatest bottleneck.
         maybe_dl = await self.download(element_path, mtime=element.mtime)
         if not maybe_dl:
             return None
+
+        self._ensure_not_seen(element, element_path)

         return self._download_file(element, maybe_dl, is_video)

     @_iorepeat(3, "downloading file")
@@ -731,6 +732,8 @@ instance's greatest bottleneck.
         maybe_cl = await self.crawl(element_path)
         if not maybe_cl:
             return None
+
+        self._ensure_not_seen(element, element_path)

         return self._crawl_learning_module(element, maybe_cl)

     @_iorepeat(3, "crawling learning module")
@@ -853,6 +856,15 @@ instance's greatest bottleneck.
             elem.attrs["src"] = "https:" + elem.attrs["src"]
         return tag

+    def _ensure_not_seen(self, element: IliasPageElement, parent_path: PurePath) -> None:
+        if element.url in self._visited_urls:
+            raise CrawlWarning(
+                f"Found second path to element {element.name!r} at {element.url!r}. "
+                + f"First path: {fmt_path(self._visited_urls[element.url])}. "
+                + f"Second path: {fmt_path(parent_path)}."
+            )
+        self._visited_urls[element.url] = parent_path
+
     async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
         auth_id = await self._current_auth_id()
         async with self.session.get(url) as request:
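The extracted helper is behavior-identical to the inline check removed above: the first sighting of a URL records its path, and a second sighting raises a CrawlWarning naming both paths. A standalone sketch of that behavior (a free function instead of a method, with fmt_path approximated by plain string formatting):

    from pathlib import PurePath
    from typing import Dict

    visited_urls: Dict[str, PurePath] = {}

    def ensure_not_seen(url: str, name: str, parent_path: PurePath) -> None:
        # First sighting registers the path; a repeat sighting raises.
        if url in visited_urls:
            raise Exception(
                f"Found second path to element {name!r} at {url!r}. "
                + f"First path: {visited_urls[url]}. "
                + f"Second path: {parent_path}."
            )
        visited_urls[url] = parent_path

    ensure_not_seen("goto.php?target=file_1", "slides.pdf", PurePath("Course/Week 1"))
    try:
        ensure_not_seen("goto.php?target=file_1", "slides.pdf", PurePath("Course/Week 2"))
    except Exception as warning:
        print(warning)
        # Found second path to element 'slides.pdf' at 'goto.php?target=file_1'.
        # First path: Course/Week 1. Second path: Course/Week 2.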