Mirror of https://github.com/Garmelon/PFERD.git
Only enclose get_page in iorepeat in ILIAS crawler
We previously also gathered in there, which could lead to duplicated tasks and other surprises when the method was retried.
commit 05ad06fbc1 (parent 29d5a40c57)
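For context, the retry machinery works roughly like the following sketch (a hypothetical simplification; _iorepeat's actual implementation is not part of this diff). The key property is that the entire decorated function body re-runs on every retry:

import functools
from typing import Any, Awaitable, Callable

def iorepeat(attempts: int, name: str) -> Callable[[Callable[..., Awaitable[None]]], Callable[..., Awaitable[None]]]:
    # Simplified stand-in for PFERD's _iorepeat decorator; the real one is not
    # part of this diff. `name` would be used for log output in practice.
    def decorator(f: Callable[..., Awaitable[None]]) -> Callable[..., Awaitable[None]]:
        @functools.wraps(f)
        async def wrapper(*args: Any, **kwargs: Any) -> None:
            for attempt in range(attempts):
                try:
                    await f(*args, **kwargs)  # the entire body re-runs on every retry
                    return
                except OSError:  # assumption: I/O failures surface as OSError
                    if attempt == attempts - 1:
                        raise  # out of attempts, let the error propagate
        return wrapper
    return decorator

If the decorated body also appends to a task list and gathers it, a retry after a partial failure would append and gather the same work again. The commit therefore narrows the retried region to the page fetch alone.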
@@ -1,6 +1,6 @@
 import re
 from pathlib import PurePath
-from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union
 
 import aiohttp
 from aiohttp import hdrs
@@ -192,10 +192,11 @@ class KitIliasWebCrawler(HttpCrawler):
             return
         cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
 
-        @_iorepeat(3, "crawling url")
-        async def impl() -> None:
-            tasks = []
-
+        elements: List[IliasPageElement] = []
+
+        @_iorepeat(3, "crawling url")
+        async def gather_elements() -> None:
+            elements.clear()
             async with cl:
                 soup = await self._get_page(url)
 
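An aside on the new elements: List[IliasPageElement] = [] declaration above: the inner function mutates the captured list instead of assigning to it, because a Python closure can mutate an object it closes over but cannot rebind the name without nonlocal. A toy illustration with hypothetical names:

def outer() -> list:
    results = []

    def inner() -> None:
        results.append(1)  # fine: mutates the captured list in place
        # results = [1]    # would instead create a new local variable;
        #                  # rebinding needs a `nonlocal results` declaration

    inner()
    return results  # -> [1]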
@@ -204,19 +205,16 @@ class KitIliasWebCrawler(HttpCrawler):
                 if not perma_link_element or "crs_" not in perma_link_element.get("value"):
                     raise CrawlError("Invalid course id? Didn't find anything looking like a course")
 
-                # Duplicated code, but the root page is special - we want to void fetching it twice!
+                # Duplicated code, but the root page is special - we want to avoid fetching it twice!
                 page = IliasPage(soup, url, None)
-                for child in page.get_child_elements():
-                    tasks.append(self._handle_ilias_element(PurePath("."), child))
-
-            # The only point an I/O exception can be thrown is in `get_page`.
-            # If that happens, no task was spawned yet. Therefore, we only retry
-            # this method without having spawned a single task. Due to this we do
-            # not need to cancel anything or worry about this gather call or the forks
-            # further up.
-            await self.gather(tasks)
-
-        await impl()
+                elements.extend(page.get_child_elements())
+
+        # Fill up our task list with the found elements
+        await gather_elements()
+        tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
+
+        # And execute them
+        await self.gather(tasks)
 
     async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
         maybe_cl = await self.crawl(path)
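The rewritten url crawl above now has a two-phase shape, and the _handle_ilias_page hunk below gets the same treatment. Shown standalone (a sketch with hypothetical fetch and handle callables, reusing the iorepeat sketch from above): all I/O happens inside the retried closure, which resets its output list so a retried attempt cannot duplicate results, and tasks are spawned exactly once afterwards:

import asyncio
from typing import Awaitable, Callable, List

async def crawl_listing(
    fetch: Callable[[], Awaitable[List[str]]],
    handle: Callable[[str], Awaitable[None]],
) -> None:
    elements: List[str] = []

    @iorepeat(3, "crawling listing")
    async def gather_elements() -> None:
        elements.clear()  # a retried attempt must not keep results of a failed one
        elements.extend(await fetch())  # the only I/O, hence the only retried part

    await gather_elements()                # may internally run up to three times
    tasks = [handle(e) for e in elements]  # coroutines are created exactly once
    await asyncio.gather(*tasks)           # stands in for the crawler's own gather

Note the elements.clear() at the top of the closure: without it, an attempt that failed halfway through would leave stale entries behind and later attempts would accumulate duplicates.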
@@ -224,28 +222,27 @@ class KitIliasWebCrawler(HttpCrawler):
             return
         cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
 
+        elements: List[IliasPageElement] = []
+
         @_iorepeat(3, "crawling folder")
-        async def impl() -> None:
-            tasks = []
+        async def gather_elements() -> None:
+            elements.clear()
             async with cl:
                 soup = await self._get_page(url)
                 page = IliasPage(soup, url, parent)
 
-                for child in page.get_child_elements():
-                    tasks.append(self._handle_ilias_element(path, child))
-
-            # The only point an I/O exception can be thrown is in `get_page`.
-            # If that happens, no task was spawned yet. Therefore, we only retry
-            # this method without having spawned a single task. Due to this we do
-            # not need to cancel anything or worry about this gather call or the forks
-            # further up.
-            await self.gather(tasks)
-
-        await impl()
+                elements.extend(page.get_child_elements())
+
+        # Fill up our task list with the found elements
+        await gather_elements()
+        tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
+
+        # And execute them
+        await self.gather(tasks)
 
     @anoncritical
-    # Shouldn't happen but this method must never raise an I/O error as that might interfere with
-    # handle_ilias_page or crawl_url
+    # Shouldn't happen but we also really don't want to let I/O errors bubble up to anoncritical.
+    # If that happens, we will be terminated, as anoncritical doesn't treat them as non-critical.
     @_wrap_io_in_warning("handling ilias element")
     async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
         element_path = PurePath(parent_path, element.name)
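The comment change on _handle_ilias_element refers to _wrap_io_in_warning, whose implementation is not part of this diff either. A hypothetical sketch of such a decorator, downgrading I/O errors to warnings so they never bubble up to @anoncritical:

import functools
import logging
from typing import Any, Awaitable, Callable

log = logging.getLogger(__name__)

def wrap_io_in_warning(name: str) -> Callable[[Callable[..., Awaitable[None]]], Callable[..., Awaitable[None]]]:
    # Hypothetical sketch: the real _wrap_io_in_warning is not shown in this diff.
    def decorator(f: Callable[..., Awaitable[None]]) -> Callable[..., Awaitable[None]]:
        @functools.wraps(f)
        async def wrapper(*args: Any, **kwargs: Any) -> None:
            try:
                await f(*args, **kwargs)
            except OSError as e:  # assumption: I/O failures surface as OSError
                log.warning(f"I/O error while {name}: {e}")  # downgraded, not raised
        return wrapper
    return decorator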