From 91c33596daf267541a0f389de252c193d9c2c05e Mon Sep 17 00:00:00 2001
From: Joscha
Date: Fri, 30 Apr 2021 16:22:14 +0200
Subject: [PATCH] Load crawlers from config file

---
 PFERD/config.py  | 21 +++++++++++++++++++--
 PFERD/crawler.py |  8 +++++++-
 PFERD/pferd.py   | 46 +++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/PFERD/config.py b/PFERD/config.py
index d71e4d1..d02900d 100644
--- a/PFERD/config.py
+++ b/PFERD/config.py
@@ -1,7 +1,7 @@
 import configparser
 import os
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Tuple
 
 from .utils import prompt_yes_no
 
@@ -26,7 +26,6 @@ class Config:
 
     def __init__(self, parser: configparser.ConfigParser):
         self._parser = parser
-        # TODO Load and validate config into dataclasses
 
     @staticmethod
     def _fail_load(path: Path, reason: str) -> None:
@@ -99,3 +98,21 @@ class Config:
             self._fail_dump(path, "That's a directory, not a file")
         except PermissionError:
             self._fail_dump(path, "Insufficient permissions")
+
+    @property
+    def default_section(self) -> configparser.SectionProxy:
+        return self._parser[configparser.DEFAULTSECT]
+
+    def crawler_sections(self) -> List[Tuple[str, configparser.SectionProxy]]:
+        result = []
+        for section_name, section_proxy in self._parser.items():
+            if section_name.startswith("crawler:"):
+                crawler_name = section_name[8:]
+                result.append((crawler_name, section_proxy))
+
+        return result
+
+    @property
+    def working_dir(self) -> Path:
+        pathstr = self.default_section.get("working_dir", ".")
+        return Path(pathstr).expanduser()
diff --git a/PFERD/crawler.py b/PFERD/crawler.py
index 9ceca20..6b1b350 100644
--- a/PFERD/crawler.py
+++ b/PFERD/crawler.py
@@ -8,6 +8,7 @@ from typing import AsyncContextManager, AsyncIterator, Optional
 from rich.markup import escape
 
 from .conductor import ProgressBar, TerminalConductor
+from .config import Config
 from .limiter import Limiter
 from .transformer import RuleParseException, Transformer
 
@@ -17,7 +18,12 @@ class CrawlerLoadException(Exception):
 
 
 class Crawler(ABC):
-    def __init__(self, name: str, section: configparser.SectionProxy) -> None:
+    def __init__(
+            self,
+            name: str,
+            config: Config,
+            section: configparser.SectionProxy,
+    ) -> None:
         """
         Initialize a crawler from its name and its section in the config file.
 
diff --git a/PFERD/pferd.py b/PFERD/pferd.py
index d145ade..131ddc1 100644
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -1,12 +1,52 @@
+from typing import Dict
+
+from rich import print
+from rich.markup import escape
+
 from .config import Config
+from .crawler import Crawler
 from .crawlers import CRAWLERS
 
 
+class PferdLoadException(Exception):
+    pass
+
+
 class Pferd:
     def __init__(self, config: Config):
         self._config = config
+        self._crawlers: Dict[str, Crawler] = {}
+
+    def _load_crawlers(self) -> None:
+        abort = False
+        for name, section in self._config.crawler_sections():
+            print(f"[bold bright_cyan]Loading[/] crawler:{escape(name)}")
+            crawler_type = section.get("type")
+            crawler_constructor = CRAWLERS.get(crawler_type)
+            if crawler_constructor is None:
+                abort = True
+                if crawler_type is None:
+                    print("[red]Error: No type")
+                else:
+                    t = escape(repr(crawler_type))
+                    print(f"[red]Error: Unknown type {t}")
+                continue
+
+            crawler = crawler_constructor(name, self._config, section)
+            self._crawlers[name] = crawler
+
+        if abort:
+            raise PferdLoadException()
 
     async def run(self) -> None:
-        print("Bleep bloop 1")
-        await CRAWLERS["dummy"]("dummy", self._config._parser["dummy"]).run()
-        print("Bleep bloop 2")
+        try:
+            self._load_crawlers()
+        except PferdLoadException:
+            print("[bold red]Could not initialize PFERD properly")
+            exit(1)
+
+        for name, crawler in self._crawlers.items():
+            print()
+            print(f"[bold bright_cyan]Running[/] crawler:{escape(name)}")
+
+            await crawler.run()
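
A minimal usage sketch of the loading mechanism this patch introduces. It assumes PFERD is importable, that a "dummy" crawler is registered in CRAWLERS (as the previous pferd.py code suggests), and that its constructor already accepts the new (name, config, section) signature; the crawler name "example" is only a placeholder. Config.crawler_sections() picks up every [crawler:NAME] section, and Pferd._load_crawlers() expects each such section to carry a "type" key naming an entry in CRAWLERS.

import asyncio
import configparser
import textwrap

from PFERD.config import Config
from PFERD.pferd import Pferd

# Build a config the way the loader expects it: every "[crawler:NAME]" section
# needs a "type" key naming an entry in CRAWLERS; "working_dir" is read from
# the DEFAULT section and defaults to ".".
parser = configparser.ConfigParser()
parser.read_string(textwrap.dedent("""\
    [DEFAULT]
    working_dir = .

    [crawler:example]
    type = dummy
"""))

# Pferd.run() first loads all "crawler:" sections, then runs each crawler.
asyncio.run(Pferd(Config(parser)).run())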