Move HTTP crawler to own file

2026-02-18 23:02:23 +01:00 · 2021-05-22 23:23:21 +02:00
parent 4d07de0d71
commit 3053278721
3 changed files with 77 additions and 69 deletions
--- a/PFERD/crawler.py
+++ b/PFERD/crawler.py
@@ -1,10 +1,8 @@
-import asyncio
 from abc import ABC, abstractmethod
 from datetime import datetime
 from pathlib import Path, PurePath
 from typing import Any, Awaitable, Callable, Dict, Optional, Tuple, TypeVar

-import aiohttp
 from rich.markup import escape

 from .authenticator import Authenticator
@@ -15,7 +13,6 @@ from .output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, Ou
 from .report import MarkConflictError, MarkDuplicateError
 from .transformer import Transformer
 from .utils import ReusableAsyncContextManager
-from .version import NAME, VERSION


 class CrawlWarning(Exception):
@@ -285,68 +282,3 @@ class Crawler(ABC):
        """

        pass
-
-
-class HttpCrawler(Crawler):
-    COOKIE_FILE = PurePath(".cookies")
-
-    def __init__(
-            self,
-            name: str,
-            section: CrawlerSection,
-            config: Config,
-    ) -> None:
-        super().__init__(name, section, config)
-
-        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
-        self._output_dir.register_reserved(self.COOKIE_FILE)
-        self._authentication_id = 0
-        self._authentication_lock = asyncio.Lock()
-
-    async def prepare_request(self) -> int:
-        # We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
-        # This should reduce the amount of requests we make: If an authentication is in progress
-        # all future requests wait for authentication to complete.
-        async with self._authentication_lock:
-            return self._authentication_id
-
-    async def authenticate(self, current_id: int) -> None:
-        async with self._authentication_lock:
-            # Another thread successfully called authenticate in between
-            # We do not want to perform auth again, so return here. We can
-            # assume auth suceeded as authenticate will throw an error if
-            # it failed.
-            if current_id != self._authentication_id:
-                return
-            await self._authenticate()
-            self._authentication_id += 1
-
-    async def _authenticate(self) -> None:
-        """
-        Performs authentication. This method must only return normally if authentication suceeded.
-        In all other cases it mus either retry internally or throw a terminal exception.
-        """
-        raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
-
-    async def run(self) -> None:
-        cookie_jar = aiohttp.CookieJar()
-
-        try:
-            cookie_jar.load(self._cookie_jar_path)
-        except Exception:
-            pass
-
-        async with aiohttp.ClientSession(
-                headers={"User-Agent": f"{NAME}/{VERSION}"},
-                cookie_jar=cookie_jar,
-        ) as session:
-            self.session = session
-            try:
-                await super().run()
-            finally:
-                del self.session
-
-        try:
-            cookie_jar.save(self._cookie_jar_path)
-        except Exception:
-            log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}")
--- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
@@ -11,7 +11,8 @@ from rich.markup import escape

 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
-from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
+from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, anoncritical
+from PFERD.http_crawler import HttpCrawler
 from PFERD.logging import ProgressBar, log
 from PFERD.output_dir import FileSink, Redownload
 from PFERD.utils import soupify, url_set_query_param
--- a/PFERD/http_crawler.py
+++ b/PFERD/http_crawler.py
@@ -0,0 +1,75 @@
+import asyncio
+from pathlib import PurePath
+
+import aiohttp
+from rich.markup import escape
+
+from .config import Config
+from .crawler import Crawler, CrawlerSection
+from .logging import log
+from .version import NAME, VERSION
+
+
+class HttpCrawler(Crawler):
+    COOKIE_FILE = PurePath(".cookies")
+
+    def __init__(
+            self,
+            name: str,
+            section: CrawlerSection,
+            config: Config,
+    ) -> None:
+        super().__init__(name, section, config)
+
+        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
+        self._output_dir.register_reserved(self.COOKIE_FILE)
+        self._authentication_id = 0
+        self._authentication_lock = asyncio.Lock()
+
+    async def prepare_request(self) -> int:
+        # We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
+        # This should reduce the number of requests we make: if an authentication is in progress,
+        # all future requests wait for the authentication to complete.
+        async with self._authentication_lock:
+            return self._authentication_id
+
+    async def authenticate(self, current_id: int) -> None:
+        async with self._authentication_lock:
+            # Another task successfully called authenticate in between.
+            # We do not want to perform auth again, so return here. We can
+            # assume auth succeeded as authenticate will throw an error if
+            # it failed.
+            if current_id != self._authentication_id:
+                return
+            await self._authenticate()
+            self._authentication_id += 1
+
+    async def _authenticate(self) -> None:
+        """
+        Performs authentication. This method must only return normally if authentication succeeded.
+        In all other cases it must either retry internally or throw a terminal exception.
+        """
+        raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
+
+    async def run(self) -> None:
+        cookie_jar = aiohttp.CookieJar()
+
+        try:
+            cookie_jar.load(self._cookie_jar_path)
+        except Exception:
+            pass
+
+        async with aiohttp.ClientSession(
+                headers={"User-Agent": f"{NAME}/{VERSION}"},
+                cookie_jar=cookie_jar,
+        ) as session:
+            self.session = session
+            try:
+                await super().run()
+            finally:
+                del self.session
+
+        try:
+            cookie_jar.save(self._cookie_jar_path)
+        except Exception:
+            log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}")