Improve specifying crawlers via CLI

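Instead of removing the sections of unselected crawlers from the loaded config,
crawler selection now happens in the Pferd class after the crawlers have been
loaded. The selection is also more sophisticated: crawlers that were selected
multiple times and crawler names that do not exist are now rejected with
better error messages.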
This commit is contained in:
Joscha 2021-05-23 18:16:25 +02:00
parent 59f13bb8d6
commit a9af56a5e9
2 changed files with 64 additions and 28 deletions

View File

@ -6,7 +6,7 @@ from pathlib import Path
from .cli import PARSER, load_default_section
from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
from .logging import log
from .pferd import Pferd
from .pferd import Pferd, PferdLoadError
from .transformer import RuleParseError
from .version import NAME, VERSION
@ -24,28 +24,10 @@ def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser:
args.command(args, parser)
load_default_section(args, parser)
prune_crawlers(args, parser)
return parser
def prune_crawlers(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    """Remove the sections of crawlers that were not selected on the CLI.

    If ``args.crawler`` is empty or unset, the parser is left untouched.
    Otherwise every section whose name starts with ``crawl:`` is removed from
    ``parser`` unless the remainder of its name is listed in ``args.crawler``.
    Sections without the ``crawl:`` prefix are never removed.
    """
    selected = args.crawler
    if not selected:
        return

    prefix = "crawl:"
    # sections() hands back a list, so removing sections while looping is safe.
    for section in parser.sections():
        if not section.startswith(prefix):
            continue
        # TODO Use removeprefix() when switching to 3.9
        if section[len(prefix):] not in selected:
            parser.remove_section(section)

    # TODO Check if crawlers actually exist
def load_config(args: argparse.Namespace) -> Config:
try:
return Config(load_config_parser(args))
@ -119,9 +101,9 @@ def main() -> None:
exit()
try:
pferd = Pferd(config)
pferd = Pferd(config, args.crawler)
asyncio.run(pferd.run())
except ConfigOptionError as e:
except (PferdLoadError, ConfigOptionError) as e:
log.unlock()
log.error(str(e))
exit(1)

View File

@ -1,4 +1,4 @@
from typing import Dict
from typing import Dict, List, Optional
from rich.markup import escape
@ -10,13 +10,22 @@ from .crawlers import CRAWLERS
from .logging import log
class PferdLoadError(Exception):
    """Raised when the crawlers selected for a Pferd run are invalid.

    This covers a crawler being selected more than once as well as a selected
    name that matches no loaded crawler section.
    """
class Pferd:
def __init__(self, config: Config):
def __init__(self, config: Config, crawlers_to_run: Optional[List[str]]):
"""
May throw ConfigOptionError.
May throw PferdLoadError.
"""
if crawlers_to_run is not None and len(crawlers_to_run) != len(set(crawlers_to_run)):
raise PferdLoadError("Some crawlers were selected multiple times")
self._config = config
self._crawlers_to_run = crawlers_to_run
self._authenticators: Dict[str, Authenticator] = {}
self._crawlers: Dict[str, Crawler] = {}
@ -31,9 +40,13 @@ class Pferd:
authenticator = authenticator_constructor(name, section, self._config)
self._authenticators[name] = authenticator
def _load_crawlers(self) -> None:
def _load_crawlers(self) -> List[str]:
names = []
for name, section in self._config.crawler_sections():
log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
names.append(name)
crawl_type = section.get("type")
crawler_constructor = CRAWLERS.get(crawl_type)
if crawler_constructor is None:
@ -42,15 +55,56 @@ class Pferd:
crawler = crawler_constructor(name, section, self._config, self._authenticators)
self._crawlers[name] = crawler
return names
def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]:
    """Work out which crawler sections should be run.

    When no crawlers were specified on the CLI, ``loaded_crawlers`` is
    returned as-is. Otherwise each specified name is prefixed with
    ``crawl:`` and looked up among the loaded crawlers; the matching section
    names are returned in the order they were specified.

    Raises PferdLoadError if at least one specified name has no matching
    crawler section.
    """
    log.explain_topic("Deciding which crawlers to run")

    requested = self._crawlers_to_run
    if requested is None:
        log.explain("No crawlers specified on CLI")
        log.explain("Running all loaded crawlers")
        return loaded_crawlers

    log.explain("Crawlers specified on CLI")

    found: List[str] = []  # Section names, including the 'crawl:' prefix
    missing: List[str] = []  # Names as specified, without the 'crawl:' prefix
    for requested_name in requested:
        section_name = f"crawl:{requested_name}"
        if section_name not in self._crawlers:
            log.explain(f"There's no crawler section named {section_name!r}")
            missing.append(requested_name)
            continue
        log.explain(f"Found crawler section named {section_name!r}")
        found.append(section_name)

    # Report every unknown name at once; the wording depends on the count.
    if len(missing) == 1:
        raise PferdLoadError(f"There is no crawler named {missing[0]!r}")
    if missing:
        joined = ", ".join(map(repr, missing))
        raise PferdLoadError(f"There are no crawlers named {joined}")

    return found
async def run(self) -> None:
"""
May throw PferdLoadError or ConfigOptionError.
"""
# These two functions must run inside the same event loop as the
# crawlers, so that any new objects (like Conditions or Futures) can
# obtain the correct event loop.
self._load_authenticators()
self._load_crawlers()
loaded_crawlers = self._load_crawlers()
for name, crawler in self._crawlers.items():
log.print("")
for name in self._find_crawlers_to_run(loaded_crawlers):
crawler = self._crawlers[name]
log.print(f"[bold bright_cyan]Running[/] {escape(name)}")
try: