From d565df27b31f5a7e635edc6d069d80cf65b1c3ef Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 22:28:14 +0200 Subject: [PATCH] Add HttpCrawler --- PFERD/crawler.py | 37 +++++++++++++++++++++++++++++++++++++ PFERD/output_dir.py | 5 ++++- PFERD/report.py | 12 ++++++++---- setup.cfg | 2 ++ 4 files changed, 51 insertions(+), 5 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index da35801..feb3f25 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -6,6 +6,7 @@ from pathlib import Path, PurePath from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, Callable, Dict, Optional, TypeVar) +import aiohttp from rich.markup import escape from .authenticator import Authenticator @@ -263,3 +264,39 @@ class Crawler(ABC): """ pass + + +class HttpCrawler(Crawler): + COOKIE_FILE = PurePath(".cookies") + + def __init__( + self, + name: str, + section: CrawlerSection, + config: Config, + conductor: TerminalConductor, + ) -> None: + super().__init__(name, section, config, conductor) + + self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) + self._output_dir.register_reserved(self.COOKIE_FILE) + + async def run(self) -> None: + cookie_jar = aiohttp.CookieJar() + + try: + cookie_jar.load(self._cookie_jar_path) + except Exception: + pass + + async with aiohttp.ClientSession(cookie_jar=cookie_jar) as session: + self.session = session + try: + await super().run() + finally: + del self.session + + try: + cookie_jar.save(self._cookie_jar_path) + except Exception: + self.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 571d73d..1be9a16 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -86,6 +86,9 @@ class OutputDirectory: self._report = Report() + def register_reserved(self, path: PurePath) -> None: + self._report.mark_reserved(path) + def _mark(self, path: PurePath) -> None: """ May throw an OutputDirException @@ -100,7 +103,7 @@ 
class OutputDirectory: msg = f"Collides with other file: {e.collides_with}" raise OutputDirException(msg) - def _resolve(self, path: PurePath) -> Path: + def resolve(self, path: PurePath) -> Path: """ May throw an OutputDirException. """ diff --git a/PFERD/report.py b/PFERD/report.py index b98c90c..2c7d8af 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -44,12 +44,16 @@ class Report: """ def __init__(self) -> None: + self.reserved_files: Set[PurePath] = set() self.known_files: Set[PurePath] = set() self.new_files: Set[PurePath] = set() self.changed_files: Set[PurePath] = set() self.deleted_files: Set[PurePath] = set() + def mark_reserved(self, path: PurePath) -> None: + self.reserved_files.add(path) + def mark(self, path: PurePath) -> None: """ Mark a previously unknown file as known. @@ -58,12 +62,12 @@ class Report: detail, see the respective exception's docstring. """ - for known_path in self.known_files: - if path == known_path: + for other in self.known_files | self.reserved_files: + if path == other: raise MarkDuplicateException(path) - if is_relative_to(path, known_path) or is_relative_to(known_path, path): - raise MarkConflictException(path, known_path) + if is_relative_to(path, other) or is_relative_to(other, path): + raise MarkConflictException(path, other) self.known_files.add(path) diff --git a/setup.cfg b/setup.cfg index 1c6e764..9dcb111 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,6 +6,8 @@ version = 3.0.0 packages = PFERD python_requires = >=3.8 install_requires = + aiohttp>=3.7.4.post0 + beautifulsoup4>=4.9.3 rich>=10.1.0 [options.entry_points]