Add HttpCrawler

This commit is contained in:
Joscha 2021-05-13 22:28:14 +02:00
parent 961f40f9a1
commit d565df27b3
4 changed files with 51 additions and 5 deletions

View File

@ -6,6 +6,7 @@ from pathlib import Path, PurePath
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
Callable, Dict, Optional, TypeVar)
import aiohttp
from rich.markup import escape
from .authenticator import Authenticator
@ -263,3 +264,39 @@ class Crawler(ABC):
"""
pass
class HttpCrawler(Crawler):
    """
    A crawler that provides an aiohttp client session while it is running.

    The session is available as self.session for the duration of run().
    Cookies are loaded from the cookie file before the session is created
    and written back to it after the session has been closed.
    """

    # Path of the cookie file, relative to the crawler's output directory.
    COOKIE_FILE = PurePath(".cookies")

    def __init__(
            self,
            name: str,
            section: CrawlerSection,
            config: Config,
            conductor: TerminalConductor,
    ) -> None:
        """
        Resolve the cookie file inside the output directory and register it
        as reserved with the output directory.
        """

        super().__init__(name, section, config, conductor)

        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
        self._output_dir.register_reserved(self.COOKIE_FILE)

    async def run(self) -> None:
        """
        Run the crawler with a client session that uses the stored cookies.

        Failing to load or save the cookie file never aborts the run; at most
        a warning is printed.
        """

        cookie_jar = aiohttp.CookieJar()
        shown_path = escape(str(self.COOKIE_FILE))

        # NOTE(review): aiohttp's CookieJar.load/save use pickle, so the
        # cookie file must only ever come from a trusted location.
        try:
            cookie_jar.load(self._cookie_jar_path)
        except FileNotFoundError:
            # No cookies have been saved yet, start with an empty jar.
            pass
        except Exception:
            # An unreadable or corrupt cookie file should not stop the
            # crawler, but it should not go unnoticed either.
            self.print(f"[bold red]Warning:[/] Failed to load cookies from {shown_path}")

        async with aiohttp.ClientSession(cookie_jar=cookie_jar) as session:
            self.session = session
            try:
                await super().run()
            finally:
                # The session is about to be closed, don't leave a dangling
                # reference to it behind.
                del self.session

        try:
            cookie_jar.save(self._cookie_jar_path)
        except Exception:
            self.print(f"[bold red]Warning:[/] Failed to save cookies to {shown_path}")

View File

@ -86,6 +86,9 @@ class OutputDirectory:
self._report = Report()
def register_reserved(self, path: PurePath) -> None:
    """
    Mark the path as reserved in this output directory's report.
    """

    self._report.mark_reserved(path)
def _mark(self, path: PurePath) -> None:
"""
May throw an OutputDirException
@ -100,7 +103,7 @@ class OutputDirectory:
msg = f"Collides with other file: {e.collides_with}"
raise OutputDirException(msg)
def _resolve(self, path: PurePath) -> Path:
def resolve(self, path: PurePath) -> Path:
"""
May throw an OutputDirException.
"""

View File

@ -44,12 +44,16 @@ class Report:
"""
def __init__(self) -> None:
    """
    Create a report in which every set of paths starts out empty.
    """

    # Paths registered through mark_reserved().
    self.reserved_files: Set[PurePath] = set()

    # Paths registered through mark().
    self.known_files: Set[PurePath] = set()

    # Paths grouped by what happened to them.
    # NOTE(review): presumably filled in by methods outside this view.
    self.new_files: Set[PurePath] = set()
    self.changed_files: Set[PurePath] = set()
    self.deleted_files: Set[PurePath] = set()
def mark_reserved(self, path: PurePath) -> None:
    """
    Add the path to the set of reserved files.
    """

    reserved = self.reserved_files
    reserved.add(path)
def mark(self, path: PurePath) -> None:
    """
    Mark a previously unknown file as known.

    The path is compared against every path that is already known and
    against every reserved path.

    Raises a MarkDuplicateException if the path is equal to one of them.
    Raises a MarkConflictException if is_relative_to reports that the path
    is relative to one of them, or that one of them is relative to the path.
    """

    # This has to be the union of both sets. The intersection only contains
    # paths that are known *and* reserved, which would skip nearly every
    # check and let duplicates and conflicts through.
    for other in self.known_files | self.reserved_files:
        if path == other:
            raise MarkDuplicateException(path)

        if is_relative_to(path, other) or is_relative_to(other, path):
            raise MarkConflictException(path, other)

    self.known_files.add(path)

View File

@ -6,6 +6,8 @@ version = 3.0.0
packages = PFERD
python_requires = >=3.8
install_requires =
aiohttp>=3.7.4.post0
beautifulsoup4>=4.9.3
rich>=10.1.0
[options.entry_points]