Move HTTP crawler to own file

2025-10-20 00:32:33 +02:00 · 2021-05-22 23:23:21 +02:00
parent 4d07de0d71
commit 3053278721
3 changed files with 77 additions and 69 deletions
--- a/PFERD/crawler.py
+++ b/PFERD/crawler.py
@@ -1,10 +1,8 @@
 import asyncio
 from abc import ABC, abstractmethod
 from datetime import datetime
 from pathlib import Path, PurePath
 from typing import Any, Awaitable, Callable, Dict, Optional, Tuple, TypeVar
 import aiohttp
 from rich.markup import escape
 from .authenticator import Authenticator
@@ -15,7 +13,6 @@ from .output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, Ou
 from .report import MarkConflictError, MarkDuplicateError
 from .transformer import Transformer
 from .utils import ReusableAsyncContextManager
 from .version import NAME, VERSION
 class CrawlWarning(Exception):
@@ -285,68 +282,3 @@ class Crawler(ABC):
        """
        pass
 class HttpCrawler(Crawler):
    """
    Crawler that runs with an aiohttp client session and a cookie jar that is
    loaded from and saved to COOKIE_FILE.
    Also provides a lock-protected authentication counter so that concurrent
    callers can avoid authenticating more than once for the same failure.
    """
    # Cookie file path; resolved through the output directory in __init__.
    COOKIE_FILE = PurePath(".cookies")
    def __init__(
            self,
            name: str,
            section: CrawlerSection,
            config: Config,
    ) -> None:
        """
        Set up the cookie file path and the authentication bookkeeping.
        The arguments are passed unchanged to the Crawler constructor.
        """
        super().__init__(name, section, config)
        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
        # NOTE(review): presumably keeps crawled files from claiming the cookie
        # file's path - confirm against OutputDirectory.register_reserved.
        self._output_dir.register_reserved(self.COOKIE_FILE)
        # Incremented by authenticate() after every successful _authenticate().
        self._authentication_id = 0
        self._authentication_lock = asyncio.Lock()
    async def prepare_request(self) -> int:
        """
        Return the current authentication id, waiting for any authentication
        that is in progress to finish first.
        The returned id is the value authenticate() compares against.
        """
        # We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
        # This should reduce the amount of requests we make: If an authentication is in progress
        # all future requests wait for authentication to complete.
        async with self._authentication_lock:
            return self._authentication_id
    async def authenticate(self, current_id: int) -> None:
        """
        Authenticate unless an authentication already completed since
        current_id was obtained.
        If current_id no longer matches the stored authentication id, this
        returns without doing anything. Otherwise it awaits _authenticate()
        and then increments the stored id. Exceptions from _authenticate()
        propagate and leave the id unchanged.
        """
        async with self._authentication_lock:
            # Another task successfully called authenticate in between.
            # We do not want to perform auth again, so return here. We can
            # assume auth succeeded as authenticate will throw an error if
            # it failed.
            if current_id != self._authentication_id:
                return
            await self._authenticate()
            self._authentication_id += 1
    async def _authenticate(self) -> None:
        """
        Performs authentication. This method must only return normally if authentication succeeded.
        In all other cases it must either retry internally or throw a terminal exception.
        This default implementation always raises a RuntimeError.
        """
        raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
    async def run(self) -> None:
        """
        Run the crawler inside an aiohttp client session.
        Cookies are loaded from the cookie file before the session is created
        and saved back after the session is closed. The session is available
        as self.session only while the parent run() is executing.
        """
        cookie_jar = aiohttp.CookieJar()
        # Loading is best-effort: any failure is ignored and the jar is used
        # in whatever state load() left it.
        try:
            cookie_jar.load(self._cookie_jar_path)
        except Exception:
            pass
        async with aiohttp.ClientSession(
                headers={"User-Agent": f"{NAME}/{VERSION}"},
                cookie_jar=cookie_jar,
        ) as session:
            self.session = session
            try:
                await super().run()
            finally:
                # Remove the attribute even if the crawl failed.
                del self.session
        # Only reached when the crawl returned normally; an exception from
        # super().run() propagates before the cookies are saved.
        try:
            cookie_jar.save(self._cookie_jar_path)
        except Exception:
            log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}")
--- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
@@ -11,7 +11,8 @@ from rich.markup import escape
 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
-from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
+from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, anoncritical
 from PFERD.http_crawler import HttpCrawler
 from PFERD.logging import ProgressBar, log
 from PFERD.output_dir import FileSink, Redownload
 from PFERD.utils import soupify, url_set_query_param
--- a/PFERD/http_crawler.py
+++ b/PFERD/http_crawler.py
@@ -0,0 +1,75 @@
 import asyncio
 from pathlib import PurePath
 import aiohttp
 from rich.markup import escape
 from .config import Config
 from .crawler import Crawler, CrawlerSection
 from .logging import log
 from .version import NAME, VERSION
 class HttpCrawler(Crawler):
    """
    Crawler base class for HTTP based crawlers.
    While run() is executing, an aiohttp client session is available as
    self.session. Its cookie jar is loaded from COOKIE_FILE before the crawl
    and written back afterwards. The class also keeps a lock-protected
    authentication counter that lets concurrent callers avoid authenticating
    repeatedly (see prepare_request() and authenticate()).
    """
    # Cookie file path; resolved through the output directory in __init__.
    COOKIE_FILE = PurePath(".cookies")
    def __init__(
            self,
            name: str,
            section: CrawlerSection,
            config: Config,
    ) -> None:
        """
        Initialise the crawler and its cookie/authentication state.
        All arguments are forwarded unchanged to Crawler.__init__().
        """
        super().__init__(name, section, config)
        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
        # NOTE(review): presumably prevents crawled files from using the
        # cookie file's path - confirm against OutputDirectory.
        self._output_dir.register_reserved(self.COOKIE_FILE)
        # Bumped once per successful _authenticate() call.
        self._authentication_id = 0
        self._authentication_lock = asyncio.Lock()
    async def prepare_request(self) -> int:
        """
        Wait for any running authentication, then return the current
        authentication id.
        The id is what authenticate() expects as its current_id argument.
        """
        # Taking the lock makes us wait for a concurrent authenticate() call.
        # This should reduce the number of requests we make: while an
        # authentication is in progress, all new requests wait for it.
        async with self._authentication_lock:
            return self._authentication_id
    async def authenticate(self, current_id: int) -> None:
        """
        Authenticate, unless somebody else already did so in the meantime.
        current_id is the id the caller got from prepare_request(). If it no
        longer equals the stored id, this method returns immediately.
        Otherwise _authenticate() is awaited and the stored id incremented.
        Exceptions raised by _authenticate() propagate; the id is then left
        unchanged.
        """
        async with self._authentication_lock:
            # Another task successfully authenticated in between. We do not
            # want to authenticate again, so return here. We can assume that
            # authentication succeeded, as authenticate() raises on failure.
            if current_id != self._authentication_id:
                return
            await self._authenticate()
            self._authentication_id += 1
    async def _authenticate(self) -> None:
        """
        Performs authentication. This method must only return normally if authentication succeeded.
        In all other cases it must either retry internally or throw a terminal exception.
        This default implementation always raises a RuntimeError.
        """
        raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
    async def run(self) -> None:
        """
        Run the crawler inside an aiohttp client session.
        Loading and saving cookies is best-effort: failures are reported as
        warnings and never abort the crawl. A missing cookie file is expected
        (e.g. on the first run) and is not reported.
        """
        cookie_jar = aiohttp.CookieJar()
        try:
            cookie_jar.load(self._cookie_jar_path)
        except FileNotFoundError:
            # No cookies have been saved yet, start with an empty jar.
            pass
        except Exception:
            # The cookie file exists but could not be read. Tell the user
            # instead of silently ignoring it, then continue with the jar
            # in whatever state load() left it.
            log.print(f"[bold red]Warning:[/] Failed to load cookies from {escape(str(self.COOKIE_FILE))}")
        async with aiohttp.ClientSession(
                headers={"User-Agent": f"{NAME}/{VERSION}"},
                cookie_jar=cookie_jar,
        ) as session:
            self.session = session
            try:
                await super().run()
            finally:
                # The session must not be used after it was closed.
                del self.session
        # Only reached when the crawl returned normally; an exception from
        # super().run() propagates before the cookies are saved.
        try:
            cookie_jar.save(self._cookie_jar_path)
        except Exception:
            log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}")