From 273d56c39a8440aca743188ddb56e7c50a4f109d Mon Sep 17 00:00:00 2001
From: Joscha
Date: Wed, 5 May 2021 23:45:10 +0200
Subject: [PATCH] Properly load crawler config

---
NOTE(review): the line structure of this patch was reconstructed from a
whitespace-collapsed copy. Hunk line counts and the diffstat were re-checked
and are consistent, but the leading indentation of context lines is inferred
from the Python block structure. Confirm with `git apply --check` before
applying; TODO confirm indentation against the target tree.

 PFERD/config.py            | 38 ++++++++++++++++++++++------
 PFERD/crawler.py           | 52 +++++++++++++++++++++++++++++++-------
 PFERD/crawlers/__init__.py |  9 +++++--
 3 files changed, 80 insertions(+), 19 deletions(-)

diff --git a/PFERD/config.py b/PFERD/config.py
index d02900d..f2abe8d 100644
--- a/PFERD/config.py
+++ b/PFERD/config.py
@@ -1,7 +1,8 @@
-import configparser
 import os
+from configparser import ConfigParser, SectionProxy
+from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import Any, List, NoReturn, Optional, Tuple
 
 from .utils import prompt_yes_no
 
@@ -14,6 +15,27 @@ class ConfigDumpException(Exception):
     pass
 
 
+@dataclass
+class ConfigFormatException(Exception):
+    section: str
+    key: str
+    desc: str
+
+
+class Section:
+    def __init__(self, section: SectionProxy):
+        self.s = section
+
+    def error(self, key: str, desc: str) -> NoReturn:
+        raise ConfigFormatException(self.s.name, key, desc)
+
+    def invalid_value(self, key: str, value: Any) -> NoReturn:
+        self.error(key, f"Invalid value: {value!r}")
+
+    def missing_value(self, key: str) -> NoReturn:
+        self.error(key, "Missing value")
+
+
 class Config:
     @staticmethod
     def _default_path() -> Path:
@@ -24,7 +46,7 @@ class Config:
         else:
             return Path("~/.pferd.cfg").expanduser()
 
-    def __init__(self, parser: configparser.ConfigParser):
+    def __init__(self, parser: ConfigParser):
         self._parser = parser
 
     @staticmethod
@@ -34,7 +56,7 @@ class Config:
         raise ConfigLoadException()
 
     @staticmethod
-    def load_parser(path: Optional[Path] = None) -> configparser.ConfigParser:
+    def load_parser(path: Optional[Path] = None) -> ConfigParser:
         """
         May throw a ConfigLoadException.
         """
@@ -42,7 +64,7 @@ class Config:
         if not path:
             path = Config._default_path()
 
-        parser = configparser.ConfigParser()
+        parser = ConfigParser()
 
         # Using config.read_file instead of config.read because config.read
         # would just ignore a missing file and carry on.
@@ -100,10 +122,10 @@ class Config:
             self._fail_dump(path, "Insufficient permissions")
 
     @property
-    def default_section(self) -> configparser.SectionProxy:
-        return self._parser[configparser.DEFAULTSECT]
+    def default_section(self) -> SectionProxy:
+        return self._parser[self._parser.default_section]
 
-    def crawler_sections(self) -> List[Tuple[str, configparser.SectionProxy]]:
+    def crawler_sections(self) -> List[Tuple[str, SectionProxy]]:
         result = []
         for section_name, section_proxy in self._parser.items():
             if section_name.startswith("crawler:"):
diff --git a/PFERD/crawler.py b/PFERD/crawler.py
index 4cb48a9..ff779ab 100644
--- a/PFERD/crawler.py
+++ b/PFERD/crawler.py
@@ -1,7 +1,6 @@
-import configparser
 from abc import ABC, abstractmethod
 from contextlib import asynccontextmanager
-from pathlib import PurePath
+from pathlib import Path, PurePath
 # TODO In Python 3.9 and above, AsyncContextManager is deprecated
 from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
                     Callable, Optional, Protocol, TypeVar)
@@ -9,7 +8,7 @@ from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
 from rich.markup import escape
 
 from .conductor import ProgressBar, TerminalConductor
-from .config import Config
+from .config import Config, Section
 from .limiter import Limiter
 from .output_dir import OnConflict, OutputDirectory, Redownload
 from .transformer import RuleParseException, Transformer
@@ -91,12 +90,46 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
             await f(self, *args, **kwargs)
         return wrapper  # type: ignore
     return decorator
+
+
+class CrawlerSection(Section):
+    def output_dir(self, name: str) -> Path:
+        return Path(self.s.get("output_dir", name))
+
+    def redownload(self) -> Redownload:
+        value = self.s.get("redownload", "never-smart")
+        if value == "never":
+            return Redownload.NEVER
+        elif value == "never-smart":
+            return Redownload.NEVER_SMART
+        elif value == "always":
+            return Redownload.ALWAYS
+        elif value == "always-smart":
+            return Redownload.ALWAYS_SMART
+        self.invalid_value("redownload", value)
+
+    def on_conflict(self) -> OnConflict:
+        value = self.s.get("on_conflict", "prompt")
+        if value == "prompt":
+            return OnConflict.PROMPT
+        elif value == "local-first":
+            return OnConflict.LOCAL_FIRST
+        elif value == "remote-first":
+            return OnConflict.REMOTE_FIRST
+        elif value == "no-delete":
+            return OnConflict.NO_DELETE
+        self.invalid_value("on_conflict", value)
+
+    def transform(self) -> str:
+        return self.s.get("transform", "")
+
+
 class Crawler(ABC):
     def __init__(
             self,
             name: str,
             config: Config,
-            section: configparser.SectionProxy,
+            section: CrawlerSection,
     ) -> None:
         """
         Initialize a crawler from its name and its section in the config file.
@@ -113,16 +146,17 @@ class Crawler(ABC):
         self._limiter = Limiter()
 
         try:
-            self._transformer = Transformer(section.get("transform", ""))
+            self._transformer = Transformer(section.transform())
         except RuleParseException as e:
             e.pretty_print()
             raise CrawlerLoadException()
 
-        output_dir = config.working_dir / section.get("output_dir", name)
-        redownload = Redownload.NEVER_SMART
-        on_conflict = OnConflict.PROMPT
         self._output_dir = OutputDirectory(
-            output_dir, redownload, on_conflict, self._conductor)
+            config.working_dir / section.output_dir(name),
+            section.redownload(),
+            section.on_conflict(),
+            self._conductor,
+        )
 
         self._error_free = False
 
diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py
index 5248a2d..69dac39 100644
--- a/PFERD/crawlers/__init__.py
+++ b/PFERD/crawlers/__init__.py
@@ -1,5 +1,10 @@
+from configparser import SectionProxy
+from typing import Callable, Dict
+
+from ..config import Config
+from ..crawler import Crawler, CrawlerSection
 from .dummy import DummyCrawler
 
-CRAWLERS = {
-    "dummy": DummyCrawler,
+CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = {
+    "dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)),
 }