Properly load crawler config

Joscha 2021-05-05 23:45:10 +02:00
parent 5497dd2827
commit 273d56c39a
3 changed files with 80 additions and 19 deletions

View File

@@ -1,7 +1,8 @@
-import configparser
 import os
+from configparser import ConfigParser, SectionProxy
 from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import Any, List, NoReturn, Optional, Tuple
 
 from .utils import prompt_yes_no
@@ -14,6 +15,27 @@ class ConfigDumpException(Exception):
     pass
 
 
+@dataclass
+class ConfigFormatException(Exception):
+    section: str
+    key: str
+    desc: str
+
+
+class Section:
+    def __init__(self, section: SectionProxy):
+        self.s = section
+
+    def error(self, key: str, desc: str) -> NoReturn:
+        raise ConfigFormatException(self.s.name, key, desc)
+
+    def invalid_value(self, key: str, value: Any) -> NoReturn:
+        self.error(key, f"Invalid value: {value!r}")
+
+    def missing_value(self, key: str) -> NoReturn:
+        self.error(key, "Missing value")
+
+
 class Config:
     @staticmethod
     def _default_path() -> Path:
@@ -24,7 +46,7 @@ class Config:
         else:
             return Path("~/.pferd.cfg").expanduser()
 
-    def __init__(self, parser: configparser.ConfigParser):
+    def __init__(self, parser: ConfigParser):
         self._parser = parser
 
     @staticmethod
@@ -34,7 +56,7 @@ class Config:
             raise ConfigLoadException()
 
     @staticmethod
-    def load_parser(path: Optional[Path] = None) -> configparser.ConfigParser:
+    def load_parser(path: Optional[Path] = None) -> ConfigParser:
        """
        May throw a ConfigLoadException.
        """
@@ -42,7 +64,7 @@
 
         if not path:
             path = Config._default_path()
-        parser = configparser.ConfigParser()
+        parser = ConfigParser()
 
         # Using config.read_file instead of config.read because config.read
         # would just ignore a missing file and carry on.
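The comment above captures the behavioral fix in load_parser: ConfigParser.read() returns the list of files it managed to parse and silently skips missing ones, while read_file() operates on an already opened file, so a missing config fails loudly. A minimal sketch of the difference (the file name is illustrative):

    from configparser import ConfigParser

    parser = ConfigParser()
    # read() swallows the missing file and just returns an empty list:
    parser.read("does-not-exist.cfg")  # -> []

    # read_file() needs an open file, so the error surfaces at open():
    with open("does-not-exist.cfg") as f:  # raises FileNotFoundError
        parser.read_file(f)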
@@ -100,10 +122,10 @@ class Config:
             self._fail_dump(path, "Insufficient permissions")
 
     @property
-    def default_section(self) -> configparser.SectionProxy:
-        return self._parser[configparser.DEFAULTSECT]
+    def default_section(self) -> SectionProxy:
+        return self._parser[self._parser.default_section]
 
-    def crawler_sections(self) -> List[Tuple[str, configparser.SectionProxy]]:
+    def crawler_sections(self) -> List[Tuple[str, SectionProxy]]:
         result = []
         for section_name, section_proxy in self._parser.items():
             if section_name.startswith("crawler:"):
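With these changes, the config module both takes an injected parser and groups crawler configuration by section name. A hypothetical config file this code would accept (the crawler name is illustrative, and it is an assumption that working_dir comes from the default section):

    [DEFAULT]
    working_dir = ~/downloads

    [crawler:my-crawler]
    output_dir = My Crawler

Here crawler_sections() yields a single ("crawler:my-crawler", <SectionProxy>) pair, and the default_section property exposes the [DEFAULT] values.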

View File

@@ -1,7 +1,6 @@
-import configparser
 from abc import ABC, abstractmethod
 from contextlib import asynccontextmanager
-from pathlib import PurePath
+from pathlib import Path, PurePath
 # TODO In Python 3.9 and above, AsyncContextManager is deprecated
 from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
                     Callable, Optional, Protocol, TypeVar)
@@ -9,7 +8,7 @@ from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
 from rich.markup import escape
 
 from .conductor import ProgressBar, TerminalConductor
-from .config import Config
+from .config import Config, Section
 from .limiter import Limiter
 from .output_dir import OnConflict, OutputDirectory, Redownload
 from .transformer import RuleParseException, Transformer
@@ -91,12 +90,46 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
                 await f(self, *args, **kwargs)
         return wrapper  # type: ignore
     return decorator
+
+
+class CrawlerSection(Section):
+    def output_dir(self, name: str) -> Path:
+        return Path(self.s.get("output_dir", name))
+
+    def redownload(self) -> Redownload:
+        value = self.s.get("redownload", "never-smart")
+        if value == "never":
+            return Redownload.NEVER
+        elif value == "never-smart":
+            return Redownload.NEVER_SMART
+        elif value == "always":
+            return Redownload.ALWAYS
+        elif value == "always-smart":
+            return Redownload.ALWAYS_SMART
+        self.invalid_value("redownload", value)
+
+    def on_conflict(self) -> OnConflict:
+        value = self.s.get("on_conflict", "prompt")
+        if value == "prompt":
+            return OnConflict.PROMPT
+        elif value == "local-first":
+            return OnConflict.LOCAL_FIRST
+        elif value == "remote-first":
+            return OnConflict.REMOTE_FIRST
+        elif value == "no-delete":
+            return OnConflict.NO_DELETE
+        self.invalid_value("on_conflict", value)
+
+    def transform(self) -> str:
+        return self.s.get("transform", "")
+
+
 class Crawler(ABC):
     def __init__(
             self,
             name: str,
             config: Config,
-            section: configparser.SectionProxy,
+            section: CrawlerSection,
     ) -> None:
         """
         Initialize a crawler from its name and its section in the config file.
@@ -113,16 +146,17 @@ class Crawler(ABC):
         self._limiter = Limiter()
 
         try:
-            self._transformer = Transformer(section.get("transform", ""))
+            self._transformer = Transformer(section.transform())
         except RuleParseException as e:
             e.pretty_print()
             raise CrawlerLoadException()
 
-        output_dir = config.working_dir / section.get("output_dir", name)
-        redownload = Redownload.NEVER_SMART
-        on_conflict = OnConflict.PROMPT
         self._output_dir = OutputDirectory(
-            output_dir, redownload, on_conflict, self._conductor)
+            config.working_dir / section.output_dir(name),
+            section.redownload(),
+            section.on_conflict(),
+            self._conductor,
+        )
 
         self._error_free = False
 
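CrawlerSection turns the previously hard-coded Redownload.NEVER_SMART and OnConflict.PROMPT into per-crawler settings with the same defaults. A config section exercising the new keys (the crawler name is illustrative):

    [crawler:my-crawler]
    output_dir = My Crawler
    redownload = always-smart
    on_conflict = local-first

With this section, redownload() returns Redownload.ALWAYS_SMART and on_conflict() returns OnConflict.LOCAL_FIRST; a value outside the recognized set ends up in invalid_value(), which raises a ConfigFormatException naming the section, the key, and the offending value.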

View File

@@ -1,5 +1,10 @@
+from configparser import SectionProxy
+from typing import Callable, Dict
+
+from ..config import Config
+from ..crawler import Crawler, CrawlerSection
 from .dummy import DummyCrawler
 
-CRAWLERS = {
-    "dummy": DummyCrawler,
+CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = {
+    "dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)),
 }
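The registry keeps the factory signature uniform on raw SectionProxy objects and hides the CrawlerSection wrapping inside each lambda. A sketch of how the registry might be consumed (the "type" key and the loop are assumptions, not part of this commit):

    # Hypothetical wiring: choose a factory per crawler section and
    # construct the crawler from its raw config section.
    for section_name, section in config.crawler_sections():
        name = section_name.split(":", 1)[1]
        factory = CRAWLERS[section.get("type", "dummy")]
        crawler = factory(name, config, section)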