Move HTTP crawler to own file

I-Al-Istannen 2021-05-22 23:23:21 +02:00
parent 4d07de0d71
commit 3053278721
3 changed files with 77 additions and 69 deletions


@@ -1,10 +1,8 @@
import asyncio
from abc import ABC, abstractmethod
from datetime import datetime
from pathlib import Path, PurePath
from typing import Any, Awaitable, Callable, Dict, Optional, Tuple, TypeVar

import aiohttp
from rich.markup import escape

from .authenticator import Authenticator
@@ -15,7 +13,6 @@ from .output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, Ou
from .report import MarkConflictError, MarkDuplicateError
from .transformer import Transformer
from .utils import ReusableAsyncContextManager
from .version import NAME, VERSION


class CrawlWarning(Exception):
@@ -285,68 +282,3 @@ class Crawler(ABC):
""" """
pass pass
class HttpCrawler(Crawler):
    COOKIE_FILE = PurePath(".cookies")

    def __init__(
            self,
            name: str,
            section: CrawlerSection,
            config: Config,
    ) -> None:
        super().__init__(name, section, config)

        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
        self._output_dir.register_reserved(self.COOKIE_FILE)
        self._authentication_id = 0
        self._authentication_lock = asyncio.Lock()

    async def prepare_request(self) -> int:
        # We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
        # This should reduce the amount of requests we make: If an authentication is in progress
        # all future requests wait for authentication to complete.
        async with self._authentication_lock:
            return self._authentication_id

    async def authenticate(self, current_id: int) -> None:
        async with self._authentication_lock:
            # Another thread successfully called authenticate in between
            # We do not want to perform auth again, so return here. We can
            # assume auth succeeded as authenticate will throw an error if
            # it failed.
            if current_id != self._authentication_id:
                return
            await self._authenticate()
            self._authentication_id += 1

    async def _authenticate(self) -> None:
        """
        Performs authentication. This method must only return normally if authentication succeeded.
        In all other cases it must either retry internally or throw a terminal exception.
        """
        raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")

    async def run(self) -> None:
        cookie_jar = aiohttp.CookieJar()

        try:
            cookie_jar.load(self._cookie_jar_path)
        except Exception:
            pass

        async with aiohttp.ClientSession(
                headers={"User-Agent": f"{NAME}/{VERSION}"},
                cookie_jar=cookie_jar,
        ) as session:
            self.session = session
            try:
                await super().run()
            finally:
                del self.session

        try:
            cookie_jar.save(self._cookie_jar_path)
        except Exception:
            log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}")
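
As context for the cookie handling in run() above: aiohttp's CookieJar can be saved to and loaded from disk, which is what lets a crawler reuse its session across program runs. Below is a minimal stand-alone sketch of that behaviour; the demo file path and URL are placeholders and not part of this commit.

import asyncio
from pathlib import Path

import aiohttp


async def main() -> None:
    # Illustrative only: HttpCrawler keeps its jar at <output dir>/.cookies,
    # this sketch just uses a local demo path.
    jar_path = Path("cookies.demo")

    jar = aiohttp.CookieJar()
    if jar_path.exists():
        jar.load(jar_path)  # start with whatever the previous run saved

    async with aiohttp.ClientSession(cookie_jar=jar) as session:
        # Placeholder request; any cookies the server sets end up in the jar.
        async with session.get("https://example.com/") as resp:
            await resp.read()

    jar.save(jar_path)  # the next run will send these cookies again


asyncio.run(main())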


@@ -11,7 +11,8 @@ from rich.markup import escape
from PFERD.authenticators import Authenticator
from PFERD.config import Config
from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, anoncritical
from PFERD.http_crawler import HttpCrawler
from PFERD.logging import ProgressBar, log
from PFERD.output_dir import FileSink, Redownload
from PFERD.utils import soupify, url_set_query_param

PFERD/http_crawler.py (new file, 75 lines added)

@@ -0,0 +1,75 @@
import asyncio
from pathlib import PurePath

import aiohttp
from rich.markup import escape

from .config import Config
from .crawler import Crawler, CrawlerSection
from .logging import log
from .version import NAME, VERSION

class HttpCrawler(Crawler):
    COOKIE_FILE = PurePath(".cookies")

    def __init__(
            self,
            name: str,
            section: CrawlerSection,
            config: Config,
    ) -> None:
        super().__init__(name, section, config)

        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
        self._output_dir.register_reserved(self.COOKIE_FILE)
        self._authentication_id = 0
        self._authentication_lock = asyncio.Lock()

    async def prepare_request(self) -> int:
        # We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
        # This should reduce the amount of requests we make: If an authentication is in progress
        # all future requests wait for authentication to complete.
        async with self._authentication_lock:
            return self._authentication_id

    async def authenticate(self, current_id: int) -> None:
        async with self._authentication_lock:
            # Another thread successfully called authenticate in between
            # We do not want to perform auth again, so return here. We can
            # assume auth succeeded as authenticate will throw an error if
            # it failed.
            if current_id != self._authentication_id:
                return
            await self._authenticate()
            self._authentication_id += 1

    async def _authenticate(self) -> None:
        """
        Performs authentication. This method must only return normally if authentication succeeded.
        In all other cases it must either retry internally or throw a terminal exception.
        """
        raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")

    async def run(self) -> None:
        cookie_jar = aiohttp.CookieJar()

        try:
            cookie_jar.load(self._cookie_jar_path)
        except Exception:
            pass

        async with aiohttp.ClientSession(
                headers={"User-Agent": f"{NAME}/{VERSION}"},
                cookie_jar=cookie_jar,
        ) as session:
            self.session = session
            try:
                await super().run()
            finally:
                del self.session

        try:
            cookie_jar.save(self._cookie_jar_path)
        except Exception:
            log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}")
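
The prepare_request()/authenticate() pair is the part subclasses interact with: the id obtained before a request tells authenticate() whether another task has re-authenticated in the meantime, so a burst of concurrently failing requests triggers only one new login. Below is a hedged sketch of how a concrete crawler might use it; ExampleCrawler, _get() and the 401 check are illustrative assumptions, not part of this commit.

from PFERD.http_crawler import HttpCrawler


class ExampleCrawler(HttpCrawler):
    # Hypothetical helper, not part of this commit: fetch a URL and retry
    # once the session has been re-established.
    async def _get(self, url: str) -> bytes:
        while True:
            # Remember which authentication "generation" we are on before the request.
            auth_id = await self.prepare_request()

            async with self.session.get(url) as resp:
                # Treating 401 as "session expired" is an assumption of this sketch;
                # real crawlers detect expired sessions in their own way.
                if resp.status != 401:
                    return await resp.read()

            # Pass the id from *before* the request: if another task already
            # re-authenticated since then, authenticate() returns immediately
            # instead of logging in a second time.
            await self.authenticate(auth_id)

Note that self.session is only available while run() is executing, since the base class deletes it again in the finally block above.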