Properly load crawler config

This commit is contained in:
Joscha 2021-05-05 23:45:10 +02:00
parent 5497dd2827
commit 273d56c39a
3 changed files with 80 additions and 19 deletions

View File

@@ -1,7 +1,8 @@
import configparser
import os import os
from configparser import ConfigParser, SectionProxy
from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import List, Optional, Tuple from typing import Any, List, NoReturn, Optional, Tuple
from .utils import prompt_yes_no from .utils import prompt_yes_no
@@ -14,6 +15,27 @@ class ConfigDumpException(Exception):
pass pass
@dataclass
class ConfigFormatException(Exception):
    """Raised when a value in the config file is missing or malformed."""

    # Name of the config section the offending entry lives in.
    section: str
    # Key of the offending entry within that section.
    key: str
    # Human-readable description of what is wrong with the entry.
    desc: str
class Section:
    """Wraps a ConfigParser section and provides uniform error reporting.

    Crawler-specific section classes (e.g. CrawlerSection) subclass this to
    add typed accessors on top of the raw SectionProxy.
    """

    def __init__(self, section: SectionProxy):
        # Keep the raw proxy around; subclasses read values via ``self.s``.
        self.s = section

    def error(self, key: str, desc: str) -> NoReturn:
        """Abort by raising a ConfigFormatException for *key* in this section."""
        section_name = self.s.name
        raise ConfigFormatException(section_name, key, desc)

    def invalid_value(self, key: str, value: Any) -> NoReturn:
        """Complain about a present-but-unusable value for *key*."""
        message = f"Invalid value: {value!r}"
        self.error(key, message)

    def missing_value(self, key: str) -> NoReturn:
        """Complain about a required value that is absent."""
        self.error(key, "Missing value")
class Config: class Config:
@staticmethod @staticmethod
def _default_path() -> Path: def _default_path() -> Path:
@@ -24,7 +46,7 @@ class Config:
else: else:
return Path("~/.pferd.cfg").expanduser() return Path("~/.pferd.cfg").expanduser()
def __init__(self, parser: configparser.ConfigParser): def __init__(self, parser: ConfigParser):
self._parser = parser self._parser = parser
@staticmethod @staticmethod
@@ -34,7 +56,7 @@ class Config:
raise ConfigLoadException() raise ConfigLoadException()
@staticmethod @staticmethod
def load_parser(path: Optional[Path] = None) -> configparser.ConfigParser: def load_parser(path: Optional[Path] = None) -> ConfigParser:
""" """
May throw a ConfigLoadException. May throw a ConfigLoadException.
""" """
@@ -42,7 +64,7 @@ class Config:
if not path: if not path:
path = Config._default_path() path = Config._default_path()
parser = configparser.ConfigParser() parser = ConfigParser()
# Using config.read_file instead of config.read because config.read # Using config.read_file instead of config.read because config.read
# would just ignore a missing file and carry on. # would just ignore a missing file and carry on.
@@ -100,10 +122,10 @@ class Config:
self._fail_dump(path, "Insufficient permissions") self._fail_dump(path, "Insufficient permissions")
@property @property
def default_section(self) -> configparser.SectionProxy: def default_section(self) -> SectionProxy:
return self._parser[configparser.DEFAULTSECT] return self._parser[self._parser.default_section]
def crawler_sections(self) -> List[Tuple[str, configparser.SectionProxy]]: def crawler_sections(self) -> List[Tuple[str, SectionProxy]]:
result = [] result = []
for section_name, section_proxy in self._parser.items(): for section_name, section_proxy in self._parser.items():
if section_name.startswith("crawler:"): if section_name.startswith("crawler:"):

View File

@@ -1,7 +1,6 @@
import configparser
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from pathlib import PurePath from pathlib import Path, PurePath
# TODO In Python 3.9 and above, AsyncContextManager is deprecated # TODO In Python 3.9 and above, AsyncContextManager is deprecated
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
Callable, Optional, Protocol, TypeVar) Callable, Optional, Protocol, TypeVar)
@@ -9,7 +8,7 @@ from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
from rich.markup import escape from rich.markup import escape
from .conductor import ProgressBar, TerminalConductor from .conductor import ProgressBar, TerminalConductor
from .config import Config from .config import Config, Section
from .limiter import Limiter from .limiter import Limiter
from .output_dir import OnConflict, OutputDirectory, Redownload from .output_dir import OnConflict, OutputDirectory, Redownload
from .transformer import RuleParseException, Transformer from .transformer import RuleParseException, Transformer
@@ -91,12 +90,46 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
await f(self, *args, **kwargs) await f(self, *args, **kwargs)
return wrapper # type: ignore return wrapper # type: ignore
return decorator return decorator
class CrawlerSection(Section):
    """Typed accessors for the standard options of a crawler's config section."""

    def output_dir(self, name: str) -> Path:
        # The crawler's own name doubles as the default directory name.
        return Path(self.s.get("output_dir", name))

    def redownload(self) -> Redownload:
        """Parse the 'redownload' option; defaults to never-smart."""
        value = self.s.get("redownload", "never-smart")
        choices = {
            "never": Redownload.NEVER,
            "never-smart": Redownload.NEVER_SMART,
            "always": Redownload.ALWAYS,
            "always-smart": Redownload.ALWAYS_SMART,
        }
        result = choices.get(value)
        if result is None:
            self.invalid_value("redownload", value)
        return result

    def on_conflict(self) -> OnConflict:
        """Parse the 'on_conflict' option; defaults to prompt."""
        value = self.s.get("on_conflict", "prompt")
        choices = {
            "prompt": OnConflict.PROMPT,
            "local-first": OnConflict.LOCAL_FIRST,
            "remote-first": OnConflict.REMOTE_FIRST,
            "no-delete": OnConflict.NO_DELETE,
        }
        result = choices.get(value)
        if result is None:
            self.invalid_value("on_conflict", value)
        return result

    def transform(self) -> str:
        # Defaults to an empty rule string.
        return self.s.get("transform", "")
class Crawler(ABC): class Crawler(ABC):
def __init__( def __init__(
self, self,
name: str, name: str,
config: Config, config: Config,
section: configparser.SectionProxy, section: CrawlerSection,
) -> None: ) -> None:
""" """
Initialize a crawler from its name and its section in the config file. Initialize a crawler from its name and its section in the config file.
@@ -113,16 +146,17 @@ class Crawler(ABC):
self._limiter = Limiter() self._limiter = Limiter()
try: try:
self._transformer = Transformer(section.get("transform", "")) self._transformer = Transformer(section.transform())
except RuleParseException as e: except RuleParseException as e:
e.pretty_print() e.pretty_print()
raise CrawlerLoadException() raise CrawlerLoadException()
output_dir = config.working_dir / section.get("output_dir", name)
redownload = Redownload.NEVER_SMART
on_conflict = OnConflict.PROMPT
self._output_dir = OutputDirectory( self._output_dir = OutputDirectory(
output_dir, redownload, on_conflict, self._conductor) config.working_dir / section.output_dir(name),
section.redownload(),
section.on_conflict(),
self._conductor,
)
self._error_free = False self._error_free = False

View File

@@ -1,5 +1,10 @@
from configparser import SectionProxy
from typing import Callable, Dict
from ..config import Config
from ..crawler import Crawler, CrawlerSection
from .dummy import DummyCrawler from .dummy import DummyCrawler
# Registry of available crawler types. Each entry maps the crawler type name
# to a factory taking (name, config, raw section proxy) and returning the
# constructed crawler; the factory wraps the proxy in a CrawlerSection.
CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = {
    "dummy": lambda name, config, section: DummyCrawler(
        name, config, CrawlerSection(section)
    ),
}