mirror of https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Add some explains to ilias crawler and use crawler exceptions
parent e4f9560655
commit 83d12fcf2d
@@ -6,10 +6,12 @@ from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
 
 import aiohttp
 from bs4 import BeautifulSoup, Tag
+from rich.markup import escape
 
 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
-from PFERD.crawler import CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
+from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
+from PFERD.logging import log
 from PFERD.output_dir import Redownload
 from PFERD.utils import soupify, url_set_query_param
 
@@ -66,10 +68,11 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([
 AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])
 
 
-def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
+def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
     def decorator(f: AWrapped) -> AWrapped:
         async def wrapper(self: "HttpCrawler", *args: Any, **kwargs: Any) -> None:
-            for _ in range(attempts - 1):
+            last_exception: Optional[BaseException] = None
+            for round in range(attempts):
                 try:
                     await f(self, *args, **kwargs)
                     return
@@ -77,12 +80,17 @@ def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
                     raise CrawlWarning("ILIAS returned an invalid content type")
                 except aiohttp.TooManyRedirects:
                     raise CrawlWarning("Got stuck in a redirect loop")
-                except aiohttp.ClientPayloadError:  # encoding or not enough bytes
-                    pass
-                except aiohttp.ClientConnectionError:  # e.g. timeout, disconnect, resolve failed, etc.
-                    pass
+                except aiohttp.ClientPayloadError as e:  # encoding or not enough bytes
+                    last_exception = e
+                except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc.
+                    last_exception = e
+                log.explain_topic(f"Retrying operation {escape(name)}. Retries left: {attempts - 1 - round}")
 
-            await f(self, *args, **kwargs)
+            if last_exception:
+                message = f"Error in I/O Operation: {escape(str(last_exception))}"
+                raise CrawlWarning(message) from last_exception
+            raise CrawlError("Impossible return in ilias _iorepeat")
 
         return wrapper  # type: ignore
     return decorator
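For context on the two hunks above: the reworked _iorepeat now remembers the last transient I/O error, logs an explain line before each retry, and, once all attempts are exhausted, re-raises the failure as a CrawlWarning instead of running the operation one final time outside the loop. The standalone sketch below reproduces that pattern under assumed names — retry, OperationWarning, and OSError as the retried error class are placeholders for illustration, not PFERD's API.

    import asyncio
    from typing import Any, Awaitable, Callable, Optional, TypeVar

    AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])


    class OperationWarning(Exception):
        """Raised once every attempt failed; stands in for PFERD's CrawlWarning."""


    def retry(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
        def decorator(f: AWrapped) -> AWrapped:
            async def wrapper(*args: Any, **kwargs: Any) -> None:
                last_exception: Optional[BaseException] = None
                for round in range(attempts):
                    try:
                        await f(*args, **kwargs)
                        return  # success, no further attempts needed
                    except OSError as e:  # only transient I/O errors are retried
                        last_exception = e
                    print(f"Retrying {name}. Retries left: {attempts - 1 - round}")
                # every attempt failed: surface the last error as a warning
                raise OperationWarning(str(last_exception)) from last_exception
            return wrapper  # type: ignore
        return decorator


    @retry(3, "demo operation")
    async def flaky() -> None:
        raise OSError("simulated transient failure")


    if __name__ == "__main__":
        try:
            asyncio.run(flaky())
        except OperationWarning as e:
            print(f"Gave up: {e}")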
@@ -109,14 +117,19 @@ class KitIliasWebCrawler(HttpCrawler):
 
     async def crawl(self) -> None:
         if isinstance(self._target, int):
+            log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
             await self._crawl_course(self._target)
         elif self._target == "desktop":
+            log.explain_topic("Inferred crawl target: Personal desktop")
             await self._crawl_desktop()
         else:
+            log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}")
             await self._crawl_url(self._target)
 
         if self.error_free:
             await self.cleanup()
+        else:
+            log.explain_topic("Skipping file cleanup as errors occurred earlier")
 
     async def _crawl_course(self, course_id: int) -> None:
         # Start crawling at the given course
@@ -132,15 +145,16 @@ class KitIliasWebCrawler(HttpCrawler):
     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
         tasks = []
 
+        # TODO: Retry this when the crawl and download bar are reworked
         async with self.crawl_bar(PurePath("Root element")):
             soup = await self._get_page(url)
 
             if expected_id is not None:
                 perma_link_element: Tag = soup.find(id="current_perma_link")
                 if not perma_link_element or "crs_" not in perma_link_element.get("value"):
-                    # TODO: Properly handle error
-                    raise RuntimeError(
-                        "Invalid course id? I didn't find anything looking like a course!")
+                    raise CrawlError(
+                        "Invalid course id? I didn't find anything looking like a course"
+                    )
 
             # Duplicated code, but the root page is special - we want to void fetching it twice!
             page = IliasPage(soup, url, None)
@@ -167,15 +181,14 @@ class KitIliasWebCrawler(HttpCrawler):
         await asyncio.gather(*tasks)
 
     @anoncritical
-    @_iorepeat(3)
+    @_iorepeat(3, "ILIAS element crawling")
     async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
         element_path = PurePath(parent_path, element.name)
 
         if element.type == IliasElementType.FILE:
             await self._download_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
-            # TODO: Delete
-            print(f"Skipping forum [green]{element_path}[/]")
+            log.explain_topic(f"Skipping forum at {escape(str(element_path))}")
         elif element.type == IliasElementType.LINK:
             await self._download_link(element, element_path)
         elif element.type == IliasElementType.VIDEO:
@@ -185,8 +198,9 @@ class KitIliasWebCrawler(HttpCrawler):
         elif element.type in _DIRECTORY_PAGES:
             await self._handle_ilias_page(element.url, element, element_path)
         else:
-            # TODO: Proper exception
-            raise RuntimeError(f"Unknown type: {element.type!r}")
+            # This will retry it a few times, failing everytime. It doesn't make any network
+            # requests, so that's fine.
+            raise CrawlWarning(f"Unknown element type: {element.type!r}")
 
     async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
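Why the last hunk raises CrawlWarning rather than RuntimeError: a warning can be absorbed by the crawler's own error handling (the @anoncritical decorator and the error_free check in crawl()), so one unrecognized element degrades the run instead of aborting it. The sketch below illustrates that idea under assumptions: noncritical, ElementWarning, and Crawler are invented names, and the behaviour of PFERD's real @anoncritical may differ from this simplified version.

    import asyncio
    from functools import wraps
    from typing import Any, Awaitable, Callable


    class ElementWarning(Exception):
        """Non-fatal problem with one element; stands in for PFERD's CrawlWarning."""


    def noncritical(f: Callable[..., Awaitable[None]]) -> Callable[..., Awaitable[None]]:
        @wraps(f)
        async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
            try:
                await f(self, *args, **kwargs)
            except ElementWarning as e:
                self.error_free = False  # remember that cleanup must be skipped later
                print(f"Warning: {e}")
        return wrapper


    class Crawler:
        def __init__(self) -> None:
            self.error_free = True

        @noncritical
        async def handle_element(self, kind: str) -> None:
            if kind not in {"file", "link"}:
                raise ElementWarning(f"Unknown element type: {kind!r}")
            print(f"Handled {kind}")


    async def main() -> None:
        crawler = Crawler()
        for kind in ["file", "forum?", "link"]:
            await crawler.handle_element(kind)
        # Mirrors crawl(): only clean up if every element was handled without errors.
        print("Cleaning up" if crawler.error_free else "Skipping cleanup, errors occurred")


    if __name__ == "__main__":
        asyncio.run(main())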