Mirror of https://github.com/Garmelon/PFERD.git, synced 2025-11-03 22:23:41 +01:00.
			
		
		
		
	Properly load crawler config
This commit is contained in:
		@@ -1,7 +1,8 @@
 | 
			
		||||
import configparser
 | 
			
		||||
import os
 | 
			
		||||
from configparser import ConfigParser, SectionProxy
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from typing import List, Optional, Tuple
 | 
			
		||||
from typing import Any, List, NoReturn, Optional, Tuple
 | 
			
		||||
 | 
			
		||||
from .utils import prompt_yes_no
 | 
			
		||||
 | 
			
		||||
@@ -14,6 +15,27 @@ class ConfigDumpException(Exception):
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class ConfigFormatException(Exception):
    """Raised when a config file entry has an invalid or missing value.

    Attributes:
        section: Name of the config section the offending key lives in.
        key: The offending key within that section.
        desc: Human-readable description of what is wrong.
    """

    section: str
    key: str
    desc: str

    def __str__(self) -> str:
        # @dataclass generates __init__ without calling Exception.__init__,
        # so self.args stays empty and the default str() would be "".
        # Provide a readable message instead.
        return f"Section {self.section!r}, key {self.key!r}: {self.desc}"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Section:
    """Base class for typed access to a single config file section.

    Wraps a configparser SectionProxy and provides helpers that raise a
    ConfigFormatException tagged with the section name and offending key.
    """

    def __init__(self, section: SectionProxy):
        # Keep the raw proxy around; subclasses read values through self.s.
        self.s = section

    def error(self, key: str, desc: str) -> NoReturn:
        """Abort with a ConfigFormatException for *key* in this section."""
        raise ConfigFormatException(self.s.name, key, desc)

    def invalid_value(self, key: str, value: Any) -> NoReturn:
        """Report that *key* holds the unusable value *value*."""
        message = "Invalid value: " + repr(value)
        self.error(key, message)

    def missing_value(self, key: str) -> NoReturn:
        """Report that *key* is required but absent."""
        self.error(key, "Missing value")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Config:
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def _default_path() -> Path:
 | 
			
		||||
@@ -24,7 +46,7 @@ class Config:
 | 
			
		||||
        else:
 | 
			
		||||
            return Path("~/.pferd.cfg").expanduser()
 | 
			
		||||
 | 
			
		||||
    def __init__(self, parser: configparser.ConfigParser):
 | 
			
		||||
    def __init__(self, parser: ConfigParser):
 | 
			
		||||
        self._parser = parser
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
@@ -34,7 +56,7 @@ class Config:
 | 
			
		||||
        raise ConfigLoadException()
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def load_parser(path: Optional[Path] = None) -> configparser.ConfigParser:
 | 
			
		||||
    def load_parser(path: Optional[Path] = None) -> ConfigParser:
 | 
			
		||||
        """
 | 
			
		||||
        May throw a ConfigLoadException.
 | 
			
		||||
        """
 | 
			
		||||
@@ -42,7 +64,7 @@ class Config:
 | 
			
		||||
        if not path:
 | 
			
		||||
            path = Config._default_path()
 | 
			
		||||
 | 
			
		||||
        parser = configparser.ConfigParser()
 | 
			
		||||
        parser = ConfigParser()
 | 
			
		||||
 | 
			
		||||
        # Using config.read_file instead of config.read because config.read
 | 
			
		||||
        # would just ignore a missing file and carry on.
 | 
			
		||||
@@ -100,10 +122,10 @@ class Config:
 | 
			
		||||
            self._fail_dump(path, "Insufficient permissions")
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def default_section(self) -> configparser.SectionProxy:
 | 
			
		||||
        return self._parser[configparser.DEFAULTSECT]
 | 
			
		||||
    def default_section(self) -> SectionProxy:
 | 
			
		||||
        return self._parser[self._parser.default_section]
 | 
			
		||||
 | 
			
		||||
    def crawler_sections(self) -> List[Tuple[str, configparser.SectionProxy]]:
 | 
			
		||||
    def crawler_sections(self) -> List[Tuple[str, SectionProxy]]:
 | 
			
		||||
        result = []
 | 
			
		||||
        for section_name, section_proxy in self._parser.items():
 | 
			
		||||
            if section_name.startswith("crawler:"):
 | 
			
		||||
 
 | 
			
		||||
@@ -1,7 +1,6 @@
 | 
			
		||||
import configparser
 | 
			
		||||
from abc import ABC, abstractmethod
 | 
			
		||||
from contextlib import asynccontextmanager
 | 
			
		||||
from pathlib import PurePath
 | 
			
		||||
from pathlib import Path, PurePath
 | 
			
		||||
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
 | 
			
		||||
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
 | 
			
		||||
                    Callable, Optional, Protocol, TypeVar)
 | 
			
		||||
@@ -9,7 +8,7 @@ from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
 | 
			
		||||
from rich.markup import escape
 | 
			
		||||
 | 
			
		||||
from .conductor import ProgressBar, TerminalConductor
 | 
			
		||||
from .config import Config
 | 
			
		||||
from .config import Config, Section
 | 
			
		||||
from .limiter import Limiter
 | 
			
		||||
from .output_dir import OnConflict, OutputDirectory, Redownload
 | 
			
		||||
from .transformer import RuleParseException, Transformer
 | 
			
		||||
@@ -91,12 +90,46 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
 | 
			
		||||
            await f(self, *args, **kwargs)
 | 
			
		||||
        return wrapper  # type: ignore
 | 
			
		||||
    return decorator
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CrawlerSection(Section):
    """Typed accessors for the options of a single crawler's config section."""

    def output_dir(self, name: str) -> Path:
        """Directory the crawler writes into; defaults to the crawler's name."""
        return Path(self.s.get("output_dir", name))

    def redownload(self) -> Redownload:
        """Parse the 'redownload' option (default: "never-smart")."""
        by_name = {
            "never": Redownload.NEVER,
            "never-smart": Redownload.NEVER_SMART,
            "always": Redownload.ALWAYS,
            "always-smart": Redownload.ALWAYS_SMART,
        }
        value = self.s.get("redownload", "never-smart")
        try:
            return by_name[value]
        except KeyError:
            self.invalid_value("redownload", value)

    def on_conflict(self) -> OnConflict:
        """Parse the 'on_conflict' option (default: "prompt")."""
        by_name = {
            "prompt": OnConflict.PROMPT,
            "local-first": OnConflict.LOCAL_FIRST,
            "remote-first": OnConflict.REMOTE_FIRST,
            "no-delete": OnConflict.NO_DELETE,
        }
        value = self.s.get("on_conflict", "prompt")
        try:
            return by_name[value]
        except KeyError:
            self.invalid_value("on_conflict", value)

    def transform(self) -> str:
        """Raw transformation rules; empty string when unset."""
        return self.s.get("transform", "")
			
		||||
 | 
			
		||||
 | 
			
		||||
class Crawler(ABC):
 | 
			
		||||
    def __init__(
 | 
			
		||||
            self,
 | 
			
		||||
            name: str,
 | 
			
		||||
            config: Config,
 | 
			
		||||
            section: configparser.SectionProxy,
 | 
			
		||||
            section: CrawlerSection,
 | 
			
		||||
    ) -> None:
 | 
			
		||||
        """
 | 
			
		||||
        Initialize a crawler from its name and its section in the config file.
 | 
			
		||||
@@ -113,16 +146,17 @@ class Crawler(ABC):
 | 
			
		||||
        self._limiter = Limiter()
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            self._transformer = Transformer(section.get("transform", ""))
 | 
			
		||||
            self._transformer = Transformer(section.transform())
 | 
			
		||||
        except RuleParseException as e:
 | 
			
		||||
            e.pretty_print()
 | 
			
		||||
            raise CrawlerLoadException()
 | 
			
		||||
 | 
			
		||||
        output_dir = config.working_dir / section.get("output_dir", name)
 | 
			
		||||
        redownload = Redownload.NEVER_SMART
 | 
			
		||||
        on_conflict = OnConflict.PROMPT
 | 
			
		||||
        self._output_dir = OutputDirectory(
 | 
			
		||||
            output_dir, redownload, on_conflict, self._conductor)
 | 
			
		||||
            config.working_dir / section.output_dir(name),
 | 
			
		||||
            section.redownload(),
 | 
			
		||||
            section.on_conflict(),
 | 
			
		||||
            self._conductor,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        self._error_free = False
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,10 @@
 | 
			
		||||
from configparser import SectionProxy
 | 
			
		||||
from typing import Callable, Dict
 | 
			
		||||
 | 
			
		||||
from ..config import Config
 | 
			
		||||
from ..crawler import Crawler, CrawlerSection
 | 
			
		||||
from .dummy import DummyCrawler
 | 
			
		||||
 | 
			
		||||
# Registry of available crawler types. Each value is a factory that builds
# a Crawler from its name, the global Config, and its raw config section
# (wrapped in a CrawlerSection for typed access).
CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = {
    "dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)),
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user