Add HttpCrawler

This commit is contained in:
Joscha 2021-05-13 22:28:14 +02:00
parent 961f40f9a1
commit d565df27b3
4 changed files with 51 additions and 5 deletions

View File

@ -6,6 +6,7 @@ from pathlib import Path, PurePath
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
Callable, Dict, Optional, TypeVar) Callable, Dict, Optional, TypeVar)
import aiohttp
from rich.markup import escape from rich.markup import escape
from .authenticator import Authenticator from .authenticator import Authenticator
@ -263,3 +264,39 @@ class Crawler(ABC):
""" """
pass pass
class HttpCrawler(Crawler):
    """
    A crawler that exposes an aiohttp client session while it runs.

    Cookies are loaded from and written back to COOKIE_FILE, resolved inside
    the crawler's output directory. That path is registered as reserved with
    the output directory.
    """

    COOKIE_FILE = PurePath(".cookies")

    def __init__(
            self,
            name: str,
            section: CrawlerSection,
            config: Config,
            conductor: TerminalConductor,
    ) -> None:
        """
        Set up the base crawler, then resolve and reserve the cookie file.
        """

        super().__init__(name, section, config, conductor)

        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
        self._output_dir.register_reserved(self.COOKIE_FILE)

    async def run(self) -> None:
        """
        Run the crawler with self.session set to a fresh aiohttp session.

        Loading and saving cookies is best-effort: a failure never aborts the
        run, but anything other than a missing cookie file produces a warning.
        """

        cookie_jar = aiohttp.CookieJar()

        # NOTE: aiohttp's CookieJar.load() unpickles the file, so the cookie
        # file must only ever be one that this program wrote itself.
        try:
            cookie_jar.load(self._cookie_jar_path)
        except FileNotFoundError:
            # Nothing has been saved yet (e.g. first run), start without
            # cookies.
            pass
        except Exception:
            # An unreadable or corrupt cookie file should not stop the crawl,
            # but silently dropping the stored cookies would be confusing.
            self.print(f"[bold red]Warning:[/] Failed to load cookies from {escape(str(self.COOKIE_FILE))}")

        async with aiohttp.ClientSession(cookie_jar=cookie_jar) as session:
            self.session = session
            try:
                await super().run()
            finally:
                # The session is closed when the "async with" exits, so don't
                # leave a reference to it on the crawler.
                del self.session

        try:
            cookie_jar.save(self._cookie_jar_path)
        except Exception:
            self.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}")

View File

@ -86,6 +86,9 @@ class OutputDirectory:
self._report = Report() self._report = Report()
def register_reserved(self, path: PurePath) -> None:
    """
    Register a path as reserved by forwarding it to the report's
    mark_reserved().
    """

    self._report.mark_reserved(path)
def _mark(self, path: PurePath) -> None: def _mark(self, path: PurePath) -> None:
""" """
May throw an OutputDirException May throw an OutputDirException
@ -100,7 +103,7 @@ class OutputDirectory:
msg = f"Collides with other file: {e.collides_with}" msg = f"Collides with other file: {e.collides_with}"
raise OutputDirException(msg) raise OutputDirException(msg)
def _resolve(self, path: PurePath) -> Path: def resolve(self, path: PurePath) -> Path:
""" """
May throw an OutputDirException. May throw an OutputDirException.
""" """

View File

@ -44,12 +44,16 @@ class Report:
""" """
def __init__(self) -> None:
    """
    Create a report in which every set of paths starts out empty.
    """

    # Paths registered via mark_reserved()
    self.reserved_files: Set[PurePath] = set()

    # Paths accepted by mark()
    self.known_files: Set[PurePath] = set()

    # Per-category bookkeeping, filled in elsewhere
    self.new_files: Set[PurePath] = set()
    self.changed_files: Set[PurePath] = set()
    self.deleted_files: Set[PurePath] = set()
def mark_reserved(self, path: PurePath) -> None:
    """
    Add a path to the set of reserved files.
    """

    self.reserved_files |= {path}
def mark(self, path: PurePath) -> None:
    """
    Mark a previously unknown file as known.

    The path is compared against every known and every reserved path. A
    MarkDuplicateException is raised if it equals one of them, and a
    MarkConflictException is raised if is_relative_to() reports that either
    path is relative to the other. For more detail, see the respective
    exception's docstring.
    """

    # Use the union, not the intersection: a path must collide with neither
    # an already known file nor a reserved one. The intersection is normally
    # empty, which would disable these checks entirely.
    for other in self.known_files | self.reserved_files:
        if path == other:
            raise MarkDuplicateException(path)

        if is_relative_to(path, other) or is_relative_to(other, path):
            raise MarkConflictException(path, other)

    self.known_files.add(path)

View File

@ -6,6 +6,8 @@ version = 3.0.0
packages = PFERD packages = PFERD
python_requires = >=3.8 python_requires = >=3.8
install_requires = install_requires =
aiohttp>=3.7.4.post0
beautifulsoup4>=4.9.3
rich>=10.1.0 rich>=10.1.0
[options.entry_points] [options.entry_points]