De-Async ilias crawler

2026-01-04 20:12:30 +01:00 · 2022-11-22 20:56:35 +01:00
parent a2ffce4702
commit 10e1a5e871
1 changed files with 45 additions and 118 deletions
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -126,13 +126,6 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
    return decorator


-def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]:
-    """
-    Wraps any I/O exception in a CrawlWarning.
-    """
-    return _iorepeat(1, name)
-
-
 # Crawler control flow:
 #
 #     crawl_desktop -+
@@ -226,80 +219,22 @@ instance's greatest bottleneck.
            return
        cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608

-        elements: List[IliasPageElement] = []
-        # A list as variable redefinitions are not propagated to outer scopes
-        description: List[BeautifulSoup] = []
-
-        @_iorepeat(3, "crawling url")
-        async def gather_elements() -> None:
-            elements.clear()
-            async with cl:
-                next_stage_url: Optional[str] = url
-                current_parent = None
-
-                # Duplicated code, but the root page is special - we want to avoid fetching it twice!
-                while next_stage_url:
-                    soup = await self._get_page(next_stage_url)
-
-                    if current_parent is None and expected_id is not None:
+        def ensure_is_valid_course_id(parent: Optional[IliasPageElement], soup: BeautifulSoup) -> None:
+            if parent is None and expected_id is not None:
                perma_link_element: Tag = soup.find(id="current_perma_link")
                if not perma_link_element or "crs_" not in perma_link_element.get("value"):
                    raise CrawlError("Invalid course id? Didn't find anything looking like a course")

-                    log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
-                    log.explain(f"URL: {next_stage_url}")
-                    page = IliasPage(soup, next_stage_url, current_parent)
-                    if next_element := page.get_next_stage_element():
-                        current_parent = next_element
-                        next_stage_url = next_element.url
-                    else:
-                        next_stage_url = None
-
-                elements.extend(page.get_child_elements())
-                if description_string := page.get_description():
-                    description.append(description_string)
-
-        # Fill up our task list with the found elements
-        await gather_elements()
-
-        if description:
-            await self._download_description(PurePath("."), description[0])
-
-        elements.sort(key=lambda e: e.id())
-
-        tasks: List[Awaitable[None]] = []
-        for element in elements:
-            if handle := await self._handle_ilias_element(PurePath("."), element):
-                tasks.append(asyncio.create_task(handle))
-
-        # And execute them
-        await self.gather(tasks)
-
-    async def _handle_ilias_page(
-        self,
-        url: str,
-        parent: IliasPageElement,
-        path: PurePath,
-    ) -> Optional[Coroutine[Any, Any, None]]:
-        maybe_cl = await self.crawl(path)
-        if not maybe_cl:
-            return None
-        return self._crawl_ilias_page(url, parent, maybe_cl)
+        await self._crawl_ilias_page(url, None, cl, ensure_is_valid_course_id)

    @anoncritical
    async def _crawl_ilias_page(
        self,
        url: str,
-        parent: IliasPageElement,
+        parent: Optional[IliasPageElement],
        cl: CrawlToken,
+        next_stage_hook: Callable[[Optional[IliasPageElement], BeautifulSoup], None] = lambda a, b: None
    ) -> None:
-        elements: List[IliasPageElement] = []
-        # A list as variable redefinitions are not propagated to outer scopes
-        description: List[BeautifulSoup] = []
-
-        @_iorepeat(3, "crawling folder")
-        async def gather_elements() -> None:
-            elements.clear()
        async with cl:
            next_stage_url: Optional[str] = url
            current_parent = parent
@@ -308,6 +243,9 @@ instance's greatest bottleneck.
                soup = await self._get_page(next_stage_url)
                log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
                log.explain(f"URL: {next_stage_url}")
+
+                next_stage_hook(current_parent, soup)
+
                page = IliasPage(soup, next_stage_url, current_parent)
                if next_element := page.get_next_stage_element():
                    current_parent = next_element
@@ -315,25 +253,11 @@ instance's greatest bottleneck.
                else:
                    next_stage_url = None

-                elements.extend(page.get_child_elements())
+        for element in sorted(page.get_child_elements(), key=lambda e: e.id()):
+            await self._handle_ilias_element(cl.path, element)
+
        if description_string := page.get_description():
-                    description.append(description_string)
-
-        # Fill up our task list with the found elements
-        await gather_elements()
-
-        if description:
-            await self._download_description(cl.path, description[0])
-
-        elements.sort(key=lambda e: e.id())
-
-        tasks: List[Awaitable[None]] = []
-        for element in elements:
-            if handle := await self._handle_ilias_element(cl.path, element):
-                tasks.append(asyncio.create_task(handle))
-
-        # And execute them
-        await self.gather(tasks)
+            await self._download_description(cl.path, description_string)

    # These decorators only apply *to this method* and *NOT* to the returned
    # awaitables!
@@ -345,7 +269,7 @@ instance's greatest bottleneck.
        self,
        parent_path: PurePath,
        element: IliasPageElement,
-    ) -> Optional[Coroutine[Any, Any, None]]:
+    ) -> None:
        if element.url in self._visited_urls:
            raise CrawlWarning(
                f"Found second path to element {element.name!r} at {element.url!r}. "
@@ -367,7 +291,7 @@ instance's greatest bottleneck.
                return None

        if element.type == IliasElementType.FILE:
-            return await self._handle_file(element, element_path)
+            await self._handle_file(element, element_path)
        elif element.type == IliasElementType.FORUM:
            if not self._forums:
                log.status(
@@ -377,7 +301,7 @@ instance's greatest bottleneck.
                    "[bright_black](enable with option 'forums')"
                )
                return None
-            return await self._handle_forum(element, element_path)
+            await self._handle_forum(element, element_path)
        elif element.type == IliasElementType.TEST:
            log.status(
                "[bold bright_black]",
@@ -395,15 +319,18 @@ instance's greatest bottleneck.
            )
            return None
        elif element.type == IliasElementType.LINK:
-            return await self._handle_link(element, element_path)
+            await self._handle_link(element, element_path)
        elif element.type == IliasElementType.BOOKING:
-            return await self._handle_booking(element, element_path)
+            await self._handle_booking(element, element_path)
        elif element.type == IliasElementType.VIDEO:
-            return await self._handle_file(element, element_path)
+            await self._handle_file(element, element_path)
        elif element.type == IliasElementType.VIDEO_PLAYER:
-            return await self._handle_video(element, element_path)
+            await self._handle_video(element, element_path)
        elif element.type in _DIRECTORY_PAGES:
-            return await self._handle_ilias_page(element.url, element, element_path)
+            maybe_cl = await self.crawl(element_path)
+            if not maybe_cl:
+                return None
+            await self._crawl_ilias_page(element.url, element, maybe_cl)
        else:
            # This will retry it a few times, failing everytime. It doesn't make any network
            # requests, so that's fine.
@@ -413,7 +340,7 @@ instance's greatest bottleneck.
        self,
        element: IliasPageElement,
        element_path: PurePath,
-    ) -> Optional[Coroutine[Any, Any, None]]:
+    ) -> None:
        log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
        log.explain(f"Links type is {self._links}")

@@ -430,7 +357,7 @@ instance's greatest bottleneck.
        if not maybe_dl:
            return None

-        return self._download_link(element, link_template_maybe, maybe_dl)
+        await self._download_link(element, link_template_maybe, maybe_dl)

    @anoncritical
    @_iorepeat(3, "resolving link")
@@ -522,7 +449,7 @@ instance's greatest bottleneck.
        self,
        element: IliasPageElement,
        element_path: PurePath,
-    ) -> Optional[Coroutine[Any, Any, None]]:
+    ) -> None:
        # Copy old mapping as it is likely still relevant
        if self.prev_report:
            self.report.add_custom_value(
@@ -548,7 +475,7 @@ instance's greatest bottleneck.

            return None

-        return self._download_video(element_path, element, maybe_dl)
+        await self._download_video(element_path, element, maybe_dl)

    def _previous_contained_videos(self, video_path: PurePath) -> List[PurePath]:
        if not self.prev_report:
@@ -630,11 +557,11 @@ instance's greatest bottleneck.
        self,
        element: IliasPageElement,
        element_path: PurePath,
-    ) -> Optional[Coroutine[Any, Any, None]]:
+    ) -> None:
        maybe_dl = await self.download(element_path, mtime=element.mtime)
        if not maybe_dl:
            return None
-        return self._download_file(element, maybe_dl)
+        await self._download_file(element, maybe_dl)

    @anoncritical
    @_iorepeat(3, "downloading file")
@@ -677,11 +604,11 @@ instance's greatest bottleneck.
        self,
        element: IliasPageElement,
        element_path: PurePath,
-    ) -> Optional[Coroutine[Any, Any, None]]:
+    ) -> None:
        maybe_cl = await self.crawl(element_path)
        if not maybe_cl:
            return None
-        return self._crawl_forum(element, maybe_cl)
+        await self._crawl_forum(element, maybe_cl)

    @_iorepeat(3, "crawling forum")
    @anoncritical