mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Properly load crawler config
This commit is contained in:
parent
5497dd2827
commit
273d56c39a
@ -1,7 +1,8 @@
|
|||||||
import configparser
|
|
||||||
import os
|
import os
|
||||||
|
from configparser import ConfigParser, SectionProxy
|
||||||
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Tuple
|
from typing import Any, List, NoReturn, Optional, Tuple
|
||||||
|
|
||||||
from .utils import prompt_yes_no
|
from .utils import prompt_yes_no
|
||||||
|
|
||||||
@ -14,6 +15,27 @@ class ConfigDumpException(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ConfigFormatException(Exception):
    """
    Raised when a config option is missing or has an unusable value.

    The three fields identify where the problem is and what it is. They are
    passed positionally by Section.error() in the order declared here.
    """

    # Name of the config section the offending option belongs to.
    section: str
    # Name of the offending option inside that section.
    key: str
    # Human-readable description of what is wrong with the option.
    desc: str

    def __str__(self) -> str:
        # Without this, str() falls back to Exception's default and shows the
        # raw tuple of constructor arguments, which is hard to read in a
        # traceback or log line.
        return f"Section {self.section!r}, key {self.key!r}: {self.desc}"
|
||||||
|
|
||||||
|
|
||||||
|
class Section:
    """
    Wrapper around a single section of the parsed config file.

    Provides helpers that raise a ConfigFormatException tagged with this
    section's name, so option accessors can report problems uniformly.
    """

    def __init__(self, section: SectionProxy):
        # The raw configparser section; every lookup goes through it.
        self.s = section

    def error(self, key: str, desc: str) -> NoReturn:
        """
        Always raises a ConfigFormatException for the given key, using the
        wrapped section's name and the given description.
        """

        section_name = self.s.name
        raise ConfigFormatException(section_name, key, desc)

    def invalid_value(self, key: str, value: Any) -> NoReturn:
        """
        Always raises a ConfigFormatException stating that the given key has
        an invalid value. The value's repr is included in the description.
        """

        description = f"Invalid value: {value!r}"
        self.error(key, description)

    def missing_value(self, key: str) -> NoReturn:
        """
        Always raises a ConfigFormatException stating that the given key has
        no value.
        """

        description = "Missing value"
        self.error(key, description)
|
||||||
|
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _default_path() -> Path:
|
def _default_path() -> Path:
|
||||||
@ -24,7 +46,7 @@ class Config:
|
|||||||
else:
|
else:
|
||||||
return Path("~/.pferd.cfg").expanduser()
|
return Path("~/.pferd.cfg").expanduser()
|
||||||
|
|
||||||
def __init__(self, parser: configparser.ConfigParser):
|
def __init__(self, parser: ConfigParser):
|
||||||
self._parser = parser
|
self._parser = parser
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -34,7 +56,7 @@ class Config:
|
|||||||
raise ConfigLoadException()
|
raise ConfigLoadException()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def load_parser(path: Optional[Path] = None) -> configparser.ConfigParser:
|
def load_parser(path: Optional[Path] = None) -> ConfigParser:
|
||||||
"""
|
"""
|
||||||
May throw a ConfigLoadException.
|
May throw a ConfigLoadException.
|
||||||
"""
|
"""
|
||||||
@ -42,7 +64,7 @@ class Config:
|
|||||||
if not path:
|
if not path:
|
||||||
path = Config._default_path()
|
path = Config._default_path()
|
||||||
|
|
||||||
parser = configparser.ConfigParser()
|
parser = ConfigParser()
|
||||||
|
|
||||||
# Using config.read_file instead of config.read because config.read
|
# Using config.read_file instead of config.read because config.read
|
||||||
# would just ignore a missing file and carry on.
|
# would just ignore a missing file and carry on.
|
||||||
@ -100,10 +122,10 @@ class Config:
|
|||||||
self._fail_dump(path, "Insufficient permissions")
|
self._fail_dump(path, "Insufficient permissions")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def default_section(self) -> configparser.SectionProxy:
|
def default_section(self) -> SectionProxy:
|
||||||
return self._parser[configparser.DEFAULTSECT]
|
return self._parser[self._parser.default_section]
|
||||||
|
|
||||||
def crawler_sections(self) -> List[Tuple[str, configparser.SectionProxy]]:
|
def crawler_sections(self) -> List[Tuple[str, SectionProxy]]:
|
||||||
result = []
|
result = []
|
||||||
for section_name, section_proxy in self._parser.items():
|
for section_name, section_proxy in self._parser.items():
|
||||||
if section_name.startswith("crawler:"):
|
if section_name.startswith("crawler:"):
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import configparser
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from pathlib import PurePath
|
from pathlib import Path, PurePath
|
||||||
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
|
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
|
||||||
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
|
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
|
||||||
Callable, Optional, Protocol, TypeVar)
|
Callable, Optional, Protocol, TypeVar)
|
||||||
@ -9,7 +8,7 @@ from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
|
|||||||
from rich.markup import escape
|
from rich.markup import escape
|
||||||
|
|
||||||
from .conductor import ProgressBar, TerminalConductor
|
from .conductor import ProgressBar, TerminalConductor
|
||||||
from .config import Config
|
from .config import Config, Section
|
||||||
from .limiter import Limiter
|
from .limiter import Limiter
|
||||||
from .output_dir import OnConflict, OutputDirectory, Redownload
|
from .output_dir import OnConflict, OutputDirectory, Redownload
|
||||||
from .transformer import RuleParseException, Transformer
|
from .transformer import RuleParseException, Transformer
|
||||||
@ -91,12 +90,46 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
|
|||||||
await f(self, *args, **kwargs)
|
await f(self, *args, **kwargs)
|
||||||
return wrapper # type: ignore
|
return wrapper # type: ignore
|
||||||
return decorator
|
return decorator
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlerSection(Section):
    """
    Typed accessors for the options read from a crawler's config section.
    """

    def output_dir(self, name: str) -> Path:
        """
        Returns the "output_dir" option as a Path, using the given name if
        the option is not set.
        """

        configured = self.s.get("output_dir", name)
        return Path(configured)

    def redownload(self) -> Redownload:
        """
        Returns the Redownload policy selected by the "redownload" option,
        which defaults to "never-smart".

        May throw a ConfigFormatException if the value is not recognized.
        """

        value = self.s.get("redownload", "never-smart")
        policies = {
            "never": Redownload.NEVER,
            "never-smart": Redownload.NEVER_SMART,
            "always": Redownload.ALWAYS,
            "always-smart": Redownload.ALWAYS_SMART,
        }
        if value in policies:
            return policies[value]
        self.invalid_value("redownload", value)

    def on_conflict(self) -> OnConflict:
        """
        Returns the OnConflict policy selected by the "on_conflict" option,
        which defaults to "prompt".

        May throw a ConfigFormatException if the value is not recognized.
        """

        value = self.s.get("on_conflict", "prompt")
        policies = {
            "prompt": OnConflict.PROMPT,
            "local-first": OnConflict.LOCAL_FIRST,
            "remote-first": OnConflict.REMOTE_FIRST,
            "no-delete": OnConflict.NO_DELETE,
        }
        if value in policies:
            return policies[value]
        self.invalid_value("on_conflict", value)

    def transform(self) -> str:
        """
        Returns the raw "transform" option, or an empty string if it is not
        set.
        """

        rules = self.s.get("transform", "")
        return rules
|
||||||
|
|
||||||
|
|
||||||
class Crawler(ABC):
|
class Crawler(ABC):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
name: str,
|
name: str,
|
||||||
config: Config,
|
config: Config,
|
||||||
section: configparser.SectionProxy,
|
section: CrawlerSection,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Initialize a crawler from its name and its section in the config file.
|
Initialize a crawler from its name and its section in the config file.
|
||||||
@ -113,16 +146,17 @@ class Crawler(ABC):
|
|||||||
self._limiter = Limiter()
|
self._limiter = Limiter()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._transformer = Transformer(section.get("transform", ""))
|
self._transformer = Transformer(section.transform())
|
||||||
except RuleParseException as e:
|
except RuleParseException as e:
|
||||||
e.pretty_print()
|
e.pretty_print()
|
||||||
raise CrawlerLoadException()
|
raise CrawlerLoadException()
|
||||||
|
|
||||||
output_dir = config.working_dir / section.get("output_dir", name)
|
|
||||||
redownload = Redownload.NEVER_SMART
|
|
||||||
on_conflict = OnConflict.PROMPT
|
|
||||||
self._output_dir = OutputDirectory(
|
self._output_dir = OutputDirectory(
|
||||||
output_dir, redownload, on_conflict, self._conductor)
|
config.working_dir / section.output_dir(name),
|
||||||
|
section.redownload(),
|
||||||
|
section.on_conflict(),
|
||||||
|
self._conductor,
|
||||||
|
)
|
||||||
|
|
||||||
self._error_free = False
|
self._error_free = False
|
||||||
|
|
||||||
|
@ -1,5 +1,10 @@
|
|||||||
|
from configparser import SectionProxy
|
||||||
|
from typing import Callable, Dict
|
||||||
|
|
||||||
|
from ..config import Config
|
||||||
|
from ..crawler import Crawler, CrawlerSection
|
||||||
from .dummy import DummyCrawler
|
from .dummy import DummyCrawler
|
||||||
|
|
||||||
CRAWLERS = {
|
CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = {
|
||||||
"dummy": DummyCrawler,
|
"dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)),
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user