Do not stop crawling files when encountering a CrawlWarning

This commit is contained in:
I-Al-Istannen 2021-11-06 12:09:51 +01:00
parent 13b8c3d9c6
commit 6289938d7c

View File

@ -84,7 +84,7 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([
AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]])
def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]:
def decorator(f: AWrapped) -> AWrapped: def decorator(f: AWrapped) -> AWrapped:
async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
last_exception: Optional[BaseException] = None last_exception: Optional[BaseException] = None
@ -105,6 +105,9 @@ def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
if last_exception: if last_exception:
message = f"Error in I/O Operation: {last_exception}" message = f"Error in I/O Operation: {last_exception}"
if failure_is_error:
raise CrawlError(message) from last_exception
else:
raise CrawlWarning(message) from last_exception raise CrawlWarning(message) from last_exception
raise CrawlError("Impossible return in ilias _iorepeat") raise CrawlError("Impossible return in ilias _iorepeat")
@ -251,6 +254,7 @@ instance's greatest bottleneck.
return None return None
return self._crawl_ilias_page(url, parent, maybe_cl) return self._crawl_ilias_page(url, parent, maybe_cl)
@anoncritical
async def _crawl_ilias_page( async def _crawl_ilias_page(
self, self,
url: str, url: str,
@ -292,10 +296,12 @@ instance's greatest bottleneck.
# And execute them # And execute them
await self.gather(tasks) await self.gather(tasks)
# These decorators only apply *to this method* and *NOT* to the returned
# awaitables!
# This method does not await the handlers but returns them instead.
# This ensures one level is handled at a time and name deduplication
# works correctly.
@anoncritical @anoncritical
# Shouldn't happen but we also really don't want to let I/O errors bubble up to anoncritical.
# If that happens we will be terminated as anoncritical doesn't treat them as non-critical.
@_wrap_io_in_warning("handling ilias element")
async def _handle_ilias_element( async def _handle_ilias_element(
self, self,
parent_path: PurePath, parent_path: PurePath,
@ -363,6 +369,7 @@ instance's greatest bottleneck.
return self._download_link(element, link_template_maybe, maybe_dl) return self._download_link(element, link_template_maybe, maybe_dl)
@anoncritical
@_iorepeat(3, "resolving link") @_iorepeat(3, "resolving link")
async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
async with dl as (bar, sink): async with dl as (bar, sink):
@ -409,6 +416,7 @@ instance's greatest bottleneck.
return self._download_booking(element, link_template_maybe, maybe_dl) return self._download_booking(element, link_template_maybe, maybe_dl)
@anoncritical
@_iorepeat(3, "resolving booking") @_iorepeat(3, "resolving booking")
async def _download_booking( async def _download_booking(
self, self,
@ -488,6 +496,7 @@ instance's greatest bottleneck.
log.explain("Missing at least one video, continuing with requests!") log.explain("Missing at least one video, continuing with requests!")
return False return False
@anoncritical
@_iorepeat(3, "downloading video") @_iorepeat(3, "downloading video")
async def _download_video( async def _download_video(
self, self,
@ -534,6 +543,7 @@ instance's greatest bottleneck.
return None return None
return self._download_file(element, maybe_dl) return self._download_file(element, maybe_dl)
@anoncritical
@_iorepeat(3, "downloading file") @_iorepeat(3, "downloading file")
async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None:
assert dl # The function is only reached when dl is not None assert dl # The function is only reached when dl is not None
@ -589,7 +599,7 @@ instance's greatest bottleneck.
# We repeat this as the login method in shibboleth doesn't handle I/O errors. # We repeat this as the login method in shibboleth doesn't handle I/O errors.
# Shibboleth is quite reliable as well, the repeat is likely not critical here. # Shibboleth is quite reliable as well, the repeat is likely not critical here.
@_iorepeat(3, "Login") @_iorepeat(3, "Login", failure_is_error=True)
async def _authenticate(self) -> None: async def _authenticate(self) -> None:
await self._shibboleth_login.login(self.session) await self._shibboleth_login.login(self.session)