Mirror of https://github.com/Garmelon/PFERD.git
Collapse ilias url crawling into normal page crawling
The first hunk collapses the bespoke `_crawl_url` body into a delegation to the regular page handling:

```diff
@@ -191,79 +191,28 @@ instance's greatest bottleneck.
         )

     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
-        maybe_cl = await self.crawl(PurePath("."))
-        if not maybe_cl:
-            return
-        cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
-
-        elements: List[IliasPageElement] = []
-        # A list as variable redefinitions are not propagated to outer scopes
-        description: List[BeautifulSoup] = []
-
-        @_iorepeat(3, "crawling url")
-        async def gather_elements() -> None:
-            elements.clear()
-            async with cl:
-                next_stage_url: Optional[str] = url
-                current_parent = None
-
-                # Duplicated code, but the root page is special - we want to avoid fetching it twice!
-                while next_stage_url:
-                    soup = await self._get_page(next_stage_url, root_page_allowed=True)
-
-                    if current_parent is None and expected_id is not None:
-                        perma_link = IliasPage.get_soup_permalink(soup)
-                        if not perma_link or "crs_" not in perma_link:
-                            raise CrawlError("Invalid course id? Didn't find anything looking like a course")
-
-                    log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
-                    log.explain(f"URL: {next_stage_url}")
-                    page = IliasPage(soup, next_stage_url, current_parent)
-                    if next_element := page.get_next_stage_element():
-                        current_parent = next_element
-                        next_stage_url = next_element.url
-                    else:
-                        next_stage_url = None
-
-                    elements.extend(page.get_child_elements())
-                    if info_tab := page.get_info_tab():
-                        elements.append(info_tab)
-                    if description_string := page.get_description():
-                        description.append(description_string)
-
-        # Fill up our task list with the found elements
-        await gather_elements()
-
-        if description:
-            await self._download_description(PurePath("."), description[0])
-
-        elements.sort(key=lambda e: e.id())
-
-        tasks: List[Awaitable[None]] = []
-        for element in elements:
-            if handle := await self._handle_ilias_element(PurePath("."), element):
-                tasks.append(asyncio.create_task(handle))
-
-        # And execute them
-        await self.gather(tasks)
+        if awaitable := await self._handle_ilias_page(url, None, PurePath("."), expected_id):
+            await awaitable
```
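The delegation works because `_handle_ilias_page` returns either `None` (the crawler declined the path) or an un-awaited coroutine that the caller runs at its leisure. A minimal standalone sketch of that pattern; the names `prepare` and `process` are hypothetical, only the `if awaitable := await ...` shape comes from the diff:

```python
import asyncio
from typing import Any, Coroutine, Optional


async def prepare(path: str) -> Optional[Coroutine[Any, Any, None]]:
    """Return None to skip, or an un-awaited coroutine for the caller to run."""
    if path.startswith("ignored/"):
        return None
    # Calling an async function without awaiting it just builds the coroutine.
    return process(path)


async def process(path: str) -> None:
    print(f"crawling {path}")


async def main() -> None:
    # Same shape as the diff's
    #   if awaitable := await self._handle_ilias_page(...): await awaitable
    if awaitable := await prepare("courses/math"):
        await awaitable


asyncio.run(main())
```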
Further down in the same hunk, `_handle_ilias_page` and `_crawl_ilias_page` accept an `Optional` parent and an optional `expected_course_id`:

```diff
     async def _handle_ilias_page(
         self,
         url: str,
-        parent: IliasPageElement,
+        parent: Optional[IliasPageElement],
         path: PurePath,
+        expected_course_id: Optional[int] = None,
     ) -> Optional[Coroutine[Any, Any, None]]:
         maybe_cl = await self.crawl(path)
         if not maybe_cl:
             return None
-        return self._crawl_ilias_page(url, parent, maybe_cl)
+        return self._crawl_ilias_page(url, parent, maybe_cl, expected_course_id)

     @anoncritical
     async def _crawl_ilias_page(
         self,
         url: str,
-        parent: IliasPageElement,
+        parent: Optional[IliasPageElement],
         cl: CrawlToken,
+        expected_course_id: Optional[int] = None,
     ) -> None:
         elements: List[IliasPageElement] = []
         # A list as variable redefinitions are not propagated to outer scopes
```
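The comment carried over here ("A list as variable redefinitions are not propagated to outer scopes") refers to Python closure semantics: an inner function may mutate an enclosing list in place, but rebinding a bare name only creates a local unless it is declared `nonlocal`. A quick standalone illustration (not PFERD code):

```python
from typing import List


def outer() -> None:
    elements: List[str] = []

    def inner() -> None:
        # In-place mutation of the enclosing list is visible outside ...
        elements.append("found")
        # ... but rebinding a plain name would only create a local here;
        # `elements = [...]` would need `nonlocal elements` to escape.

    inner()
    print(elements)  # ['found']


outer()
```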
The second hunk adds the course-id enforcement to the page loop in `_crawl_ilias_page`:

```diff
@@ -280,6 +229,15 @@ instance's greatest bottleneck.
                     soup = await self._get_page(next_stage_url)
                     log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
                     log.explain(f"URL: {next_stage_url}")
+
+                    # If we expect to find a root course, enforce it
+                    if current_parent is None and expected_course_id is not None:
+                        perma_link = IliasPage.get_soup_permalink(soup)
+                        if not perma_link or "crs_" not in perma_link:
+                            raise CrawlError("Invalid course id? Didn't find anything looking like a course")
+                        if str(expected_course_id) not in perma_link:
+                            raise CrawlError(f"Expected course id {expected_course_id} but got {perma_link}")
+
                     page = IliasPage(soup, next_stage_url, current_parent)
                     if next_element := page.get_next_stage_element():
                         current_parent = next_element
```
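The new check boils down to two substring tests on the course permalink. A hedged standalone version; the function name and the example permalink are assumptions, the checks themselves mirror the diff:

```python
from typing import Optional


class CrawlError(Exception):
    pass


def enforce_course_id(perma_link: Optional[str], expected_course_id: int) -> None:
    # An ILIAS course permalink is expected to contain "crs_<numeric id>".
    if not perma_link or "crs_" not in perma_link:
        raise CrawlError("Invalid course id? Didn't find anything looking like a course")
    if str(expected_course_id) not in perma_link:
        raise CrawlError(f"Expected course id {expected_course_id} but got {perma_link}")


# Hypothetical permalink, for illustration only:
enforce_course_id("https://ilias.example.org/goto.php?target=crs_12345", 12345)
```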