From 83d12fcf2d75650033154c77926728798a4bb541 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Thu, 20 May 2021 14:58:54 +0200
Subject: [PATCH] Add some explains to ilias crawler and use crawler exceptions

---
 PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 46 ++++++++++++-------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
index a025127..88732c0 100644
--- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
@@ -6,10 +6,12 @@ from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
 
 import aiohttp
 from bs4 import BeautifulSoup, Tag
+from rich.markup import escape
 
 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
-from PFERD.crawler import CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
+from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
+from PFERD.logging import log
 from PFERD.output_dir import Redownload
 from PFERD.utils import soupify, url_set_query_param
 
@@ -66,10 +68,11 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([
 AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])
 
 
-def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
+def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
     def decorator(f: AWrapped) -> AWrapped:
         async def wrapper(self: "HttpCrawler", *args: Any, **kwargs: Any) -> None:
-            for _ in range(attempts - 1):
+            last_exception: Optional[BaseException] = None
+            for round in range(attempts):
                 try:
                     await f(self, *args, **kwargs)
                     return
@@ -77,12 +80,17 @@ def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
                     raise CrawlWarning("ILIAS returned an invalid content type")
                 except aiohttp.TooManyRedirects:
                     raise CrawlWarning("Got stuck in a redirect loop")
-                except aiohttp.ClientPayloadError:  # encoding or not enough bytes
-                    pass
-                except aiohttp.ClientConnectionError:  # e.g. timeout, disconnect, resolve failed, etc.
-                    pass
+                except aiohttp.ClientPayloadError as e:  # encoding or not enough bytes
+                    last_exception = e
+                except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc.
+                    last_exception = e
+                log.explain_topic(f"Retrying operation {escape(name)}. Retries left: {attempts - 1 - round}")
+
+            if last_exception:
+                message = f"Error in I/O Operation: {escape(str(last_exception))}"
+                raise CrawlWarning(message) from last_exception
+            raise CrawlError("Impossible return in ilias _iorepeat")
 
-            await f(self, *args, **kwargs)
         return wrapper  # type: ignore
     return decorator
 
@@ -109,14 +117,19 @@ class KitIliasWebCrawler(HttpCrawler):
 
     async def crawl(self) -> None:
         if isinstance(self._target, int):
+            log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
             await self._crawl_course(self._target)
         elif self._target == "desktop":
+            log.explain_topic("Inferred crawl target: Personal desktop")
             await self._crawl_desktop()
         else:
+            log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}")
             await self._crawl_url(self._target)
 
         if self.error_free:
             await self.cleanup()
+        else:
+            log.explain_topic("Skipping file cleanup as errors occurred earlier")
 
     async def _crawl_course(self, course_id: int) -> None:
         # Start crawling at the given course
@@ -132,15 +145,16 @@ class KitIliasWebCrawler(HttpCrawler):
 
     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
         tasks = []
+        # TODO: Retry this when the crawl and download bar are reworked
         async with self.crawl_bar(PurePath("Root element")):
            soup = await self._get_page(url)
 
            if expected_id is not None:
                perma_link_element: Tag = soup.find(id="current_perma_link")
                if not perma_link_element or "crs_" not in perma_link_element.get("value"):
-                    # TODO: Properly handle error
-                    raise RuntimeError(
-                        "Invalid course id? I didn't find anything looking like a course!")
+                    raise CrawlError(
+                        "Invalid course id? I didn't find anything looking like a course"
+                    )
 
            # Duplicated code, but the root page is special - we want to avoid fetching it twice!
            page = IliasPage(soup, url, None)
@@ -167,15 +181,14 @@ class KitIliasWebCrawler(HttpCrawler):
         await asyncio.gather(*tasks)
 
     @anoncritical
-    @_iorepeat(3)
+    @_iorepeat(3, "ILIAS element crawling")
     async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
         element_path = PurePath(parent_path, element.name)
 
         if element.type == IliasElementType.FILE:
             await self._download_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
-            # TODO: Delete
-            print(f"Skipping forum [green]{element_path}[/]")
+            log.explain_topic(f"Skipping forum at {escape(str(element_path))}")
         elif element.type == IliasElementType.LINK:
             await self._download_link(element, element_path)
         elif element.type == IliasElementType.VIDEO:
             await self._download_video(element, element_path)
         elif element.type in _DIRECTORY_PAGES:
             await self._handle_ilias_page(element.url, element, element_path)
         else:
-            # TODO: Proper exception
-            raise RuntimeError(f"Unknown type: {element.type!r}")
+            # This will retry it a few times, failing every time. It doesn't make any network
+            # requests, so that's fine.
+            raise CrawlWarning(f"Unknown element type: {element.type!r}")
 
     async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
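
Reviewer note, not part of the patch: the snippet below is a minimal, self-contained sketch of the retry behaviour the reworked _iorepeat introduces, handy for checking the control flow in isolation. All names in it are stand-ins: iorepeat and flaky are hypothetical, OSError replaces the aiohttp exception types, print() replaces log.explain_topic(), the local CrawlWarning replaces PFERD's, and the self parameter is dropped because the toy decorates a free function rather than a crawler method.

import asyncio
from typing import Any, Awaitable, Callable, Optional, TypeVar

AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])


class CrawlWarning(Exception):
    """Stand-in for PFERD.crawler.CrawlWarning."""


def iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
    def decorator(f: AWrapped) -> AWrapped:
        async def wrapper(*args: Any, **kwargs: Any) -> None:
            last_exception: Optional[BaseException] = None
            for attempt in range(attempts):
                try:
                    await f(*args, **kwargs)
                    return  # success: no retry, no warning
                except OSError as e:  # transient I/O failure: remember it and retry
                    last_exception = e
                    print(f"Retrying operation {name}. Retries left: {attempts - 1 - attempt}")
            # Every attempt failed: escalate the last I/O error as a warning
            raise CrawlWarning(f"Error in I/O operation: {last_exception}") from last_exception
        return wrapper  # type: ignore
    return decorator


@iorepeat(3, "flaky download")
async def flaky() -> None:
    raise OSError("connection reset")


try:
    asyncio.run(flaky())  # prints three retry lines, then the warning below
except CrawlWarning as warning:
    print(f"Escalated after all attempts: {warning}")

With attempts = 3 the coroutine runs three times in total before escalating. The old code looped attempts - 1 times and then called the function once more outside the loop, so a payload or connection error on that final call propagated raw instead of becoming a CrawlWarning; the patched loop closes exactly that gap.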