diff --git a/PFERD/__main__.py b/PFERD/__main__.py
index 2578487..9bc2974 100644
--- a/PFERD/__main__.py
+++ b/PFERD/__main__.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from .cli import PARSER, load_default_section
 from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
 from .logging import log
-from .pferd import Pferd
+from .pferd import Pferd, PferdLoadError
 from .transformer import RuleParseError
 from .version import NAME, VERSION
 
@@ -24,28 +24,10 @@ def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser:
         args.command(args, parser)
 
     load_default_section(args, parser)
-    prune_crawlers(args, parser)
 
     return parser
 
 
-def prune_crawlers(
-    args: argparse.Namespace,
-    parser: configparser.ConfigParser,
-) -> None:
-    if not args.crawler:
-        return
-
-    for section in parser.sections():
-        if section.startswith("crawl:"):
-            # TODO Use removeprefix() when switching to 3.9
-            name = section[len("crawl:"):]
-            if name not in args.crawler:
-                parser.remove_section(section)
-
-    # TODO Check if crawlers actually exist
-
-
 def load_config(args: argparse.Namespace) -> Config:
     try:
         return Config(load_config_parser(args))
@@ -119,9 +101,9 @@ def main() -> None:
         exit()
 
     try:
-        pferd = Pferd(config)
+        pferd = Pferd(config, args.crawler)
         asyncio.run(pferd.run())
-    except ConfigOptionError as e:
+    except (PferdLoadError, ConfigOptionError) as e:
         log.unlock()
         log.error(str(e))
         exit(1)
diff --git a/PFERD/pferd.py b/PFERD/pferd.py
index 4aee043..75b0e9d 100644
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Dict, List, Optional
 
 from rich.markup import escape
 
@@ -10,13 +10,22 @@ from .crawlers import CRAWLERS
 from .logging import log
 
 
+class PferdLoadError(Exception):
+    pass
+
+
 class Pferd:
-    def __init__(self, config: Config):
+    def __init__(self, config: Config, crawlers_to_run: Optional[List[str]]):
         """
-        May throw ConfigOptionError.
+        May throw PferdLoadError.
         """
 
+        if crawlers_to_run is not None and len(crawlers_to_run) != len(set(crawlers_to_run)):
+            raise PferdLoadError("Some crawlers were selected multiple times")
+
         self._config = config
+        self._crawlers_to_run = crawlers_to_run
+
         self._authenticators: Dict[str, Authenticator] = {}
         self._crawlers: Dict[str, Crawler] = {}
 
@@ -31,9 +40,13 @@ class Pferd:
             authenticator = authenticator_constructor(name, section, self._config)
             self._authenticators[name] = authenticator
 
-    def _load_crawlers(self) -> None:
+    def _load_crawlers(self) -> List[str]:
+        names = []
+
         for name, section in self._config.crawler_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
+            names.append(name)
+
             crawl_type = section.get("type")
             crawler_constructor = CRAWLERS.get(crawl_type)
             if crawler_constructor is None:
@@ -42,15 +55,56 @@
             crawler = crawler_constructor(name, section, self._config, self._authenticators)
             self._crawlers[name] = crawler
 
+        return names
+
+    def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]:
+        log.explain_topic("Deciding which crawlers to run")
+
+        if self._crawlers_to_run is None:
+            log.explain("No crawlers specified on CLI")
+            log.explain("Running all loaded crawlers")
+            return loaded_crawlers
+
+        log.explain("Crawlers specified on CLI")
+
+        names: List[str] = []  # With 'crawl:' prefix
+        unknown_names = []  # Without 'crawl:' prefix
+
+        for name in self._crawlers_to_run:
+            section_name = f"crawl:{name}"
+            if section_name in self._crawlers:
+                log.explain(f"Found crawler section named {section_name!r}")
+                names.append(section_name)
+            else:
+                log.explain(f"There's no crawler section named {section_name!r}")
+                unknown_names.append(name)
+
+        if unknown_names:
+            if len(unknown_names) == 1:
+                [name] = unknown_names
+                raise PferdLoadError(f"There is no crawler named {name!r}")
+            else:
+                names_str = ", ".join(repr(name) for name in unknown_names)
+                raise PferdLoadError(f"There are no crawlers named {names_str}")
+
+        return names
+
     async def run(self) -> None:
+        """
+        May throw PferdLoadError or ConfigOptionError.
+        """
+
         # These two functions must run inside the same event loop as the
         # crawlers, so that any new objects (like Conditions or Futures) can
         # obtain the correct event loop.
         self._load_authenticators()
-        self._load_crawlers()
+        loaded_crawlers = self._load_crawlers()
+
+        log.print("")
 
-        for name, crawler in self._crawlers.items():
-            log.print("")
+        for name in self._find_crawlers_to_run(loaded_crawlers):
+            crawler = self._crawlers[name]
 
             log.print(f"[bold bright_cyan]Running[/] {escape(name)}")
 
             try: