Mirror of https://github.com/Garmelon/PFERD.git
Commit 953a1bba93 (parent e724ff7c93): Adjust to new crawl / download names
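In short: the public crawl() entry point becomes _run(), the error-aware cleanup logic leaves this class, self.crawl(path) now returns a token that is falsy for skipped paths and replaces crawl_bar as the async context, and downloads enter a single dl as (bar, sink) context instead of combining download_bar with a separate dl as sink block.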
@@ -116,7 +116,7 @@ class KitIliasWebCrawler(HttpCrawler):
         self._link_file_redirect_delay = section.link_file_redirect_delay()
         self._link_file_use_plaintext = section.link_file_use_plaintext()
 
-    async def crawl(self) -> None:
+    async def _run(self) -> None:
         if isinstance(self._target, int):
             log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
             await self._crawl_course(self._target)
@@ -127,11 +127,6 @@ class KitIliasWebCrawler(HttpCrawler):
             log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}")
             await self._crawl_url(self._target)
 
-        if self.error_free:
-            await self.cleanup()
-        else:
-            log.explain_topic("Skipping file cleanup as errors occurred earlier")
-
     async def _crawl_course(self, course_id: int) -> None:
         # Start crawling at the given course
         root_url = url_set_query_param(
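The cleanup block leaves this method at the same time crawl() is renamed to _run(), which suggests the public entry point now lives in the Crawler base class and owns cleanup. A minimal sketch of that assumed structure; run(), cleanup(), and the placement of error_free are assumptions, not PFERD's verified API:

    # Hedged sketch: how a base class could own the cleanup logic removed above.
    class Crawler:
        def __init__(self) -> None:
            self.error_free = True  # cleared when a crawl/download step fails

        async def cleanup(self) -> None:
            ...  # e.g. prune local files that vanished on the remote side

        async def run(self) -> None:
            await self._run()  # subclass-specific work (e.g. KitIliasWebCrawler)
            if self.error_free:
                await self.cleanup()
            # on errors, skip cleanup: missing elements may just have failed to load

        async def _run(self) -> None:
            raise NotImplementedError  # overridden by each concrete crawler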
@@ -144,10 +139,14 @@ class KitIliasWebCrawler(HttpCrawler):
         await self._crawl_url(self._base_url)
 
     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
+        cl = await self.crawl(PurePath("."))
+        if not cl:
+            return
+
         tasks = []
 
         # TODO: Retry this when the crawl and download bar are reworked
-        async with self.crawl_bar(PurePath("Root element")):
+        async with cl:
             soup = await self._get_page(url)
 
             if expected_id is not None:
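In the new form, self.crawl(path) is awaited for a token that is falsy when the path should be skipped and is otherwise entered as an async context manager in place of the old crawl_bar. A minimal sketch of such a token; the class name and the bar wiring are assumptions, not PFERD's actual implementation:

    from pathlib import PurePath
    from types import TracebackType
    from typing import Optional, Type

    class CrawlToken:
        """Assumed shape of the value returned by Crawler.crawl()."""

        def __init__(self, path: PurePath) -> None:
            self.path = path

        async def __aenter__(self) -> "CrawlToken":
            # e.g. start the progress bar / acquire a concurrency slot
            return self

        async def __aexit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc: Optional[BaseException],
            tb: Optional[TracebackType],
        ) -> None:
            # e.g. finish the bar / release the slot
            return None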
@@ -165,14 +164,12 @@ class KitIliasWebCrawler(HttpCrawler):
         await asyncio.gather(*tasks)
 
     async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
-        # We might not want to crawl this directory-ish page.
-        # This is not in #handle_element, as the download methods check it themselves and therefore
-        # would perform this check twice - messing with the explain output
-        if not self.should_crawl(path):
+        cl = await self.crawl(path)
+        if not cl:
             return
 
         tasks = []
-        async with self.crawl_bar(path):
+        async with cl:
             soup = await self._get_page(url)
             page = IliasPage(soup, url, parent)
 
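The deleted comments and the explicit should_crawl(path) guard imply the check moved into crawl() itself: a falsy return value now means "skip this path", so callers only test if not cl. A sketch of that assumed behaviour, reusing the hypothetical CrawlToken from above:

    class Crawler:
        async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
            # Assumed: the exclusion check now lives here, so every caller
            # (and the explain output) sees it exactly once.
            if not self.should_crawl(path):
                return None
            return CrawlToken(path)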
@@ -189,7 +186,9 @@ class KitIliasWebCrawler(HttpCrawler):
         if element.type == IliasElementType.FILE:
             await self._download_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
-            log.explain_topic(f"Skipping forum at {escape(str(element_path))}")
+            log.explain_topic(f"Decision: Crawl {escape(str(element_path))}")
+            log.explain("Is a forum")
+            log.explain("Answer: No")
         elif element.type == IliasElementType.LINK:
             await self._download_link(element, element_path)
         elif element.type == IliasElementType.VIDEO:
@@ -208,13 +207,12 @@ class KitIliasWebCrawler(HttpCrawler):
         if not dl:
             return
 
-        async with self.download_bar(element_path):
+        async with dl as (bar, sink):
             export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
             async with self.session.get(export_url) as response:
                 html_page: BeautifulSoup = soupify(await response.read())
                 real_url: str = html_page.select_one("a").get("href").strip()
 
-        async with dl as sink:
             content = link_template_plain if self._link_file_use_plaintext else link_template_rich
             content = content.replace("{{link}}", real_url)
             content = content.replace("{{name}}", element.name)
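All three download methods now open a single async with dl as (bar, sink): block, so __aenter__ on the download token must yield a progress bar and a file sink together. A hedged sketch of that shape; ProgressBar and FileSink are PFERD's types as seen in _stream_from_url's signature, but this wiring is an assumption:

    from types import TracebackType
    from typing import Optional, Tuple, Type

    class DownloadToken:
        """Assumed shape of the value returned by Crawler.download()."""

        def __init__(self, bar: "ProgressBar", sink: "FileSink") -> None:
            self._bar = bar
            self._sink = sink

        async def __aenter__(self) -> Tuple["ProgressBar", "FileSink"]:
            return self._bar, self._sink  # unpacked as `dl as (bar, sink)`

        async def __aexit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc: Optional[BaseException],
            tb: Optional[TracebackType],
        ) -> None:
            pass  # e.g. close the sink, finish the bar, record errors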
@@ -229,11 +227,10 @@ class KitIliasWebCrawler(HttpCrawler):
         if not dl:
             return
 
-        async with self.download_bar(element_path) as bar:
+        async with dl as (bar, sink):
             page = IliasPage(await self._get_page(element.url), element.url, element)
             real_element = page.get_child_elements()[0]
 
-        async with dl as sink:
             await self._stream_from_url(real_element.url, sink, bar)
 
     async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
@@ -241,7 +238,7 @@ class KitIliasWebCrawler(HttpCrawler):
         if not dl:
             return
 
-        async with self.download_bar(element_path) as bar, dl as sink:
+        async with dl as (bar, sink):
             await self._stream_from_url(element.url, sink, bar)
 
     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
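_stream_from_url itself is only context here, but its (sink, bar) parameters suggest a chunked copy loop with progress reporting. A sketch using aiohttp (self.session is real, as self.session.get appears above; the bar and sink method names are assumptions):

    class Crawler:
        async def _stream_from_url(self, url: str, sink: "FileSink", bar: "ProgressBar") -> None:
            async with self.session.get(url) as response:
                if response.content_length is not None:
                    bar.set_total(response.content_length)  # assumed ProgressBar API
                async for chunk in response.content.iter_chunked(64 * 1024):
                    sink.file.write(chunk)   # assumed FileSink API
                    bar.advance(len(chunk))  # assumed ProgressBar API
                sink.done()  # assumed: marks the download as successful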