mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Improve specifying crawlers via CLI
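Instead of removing the sections of unselected crawlers from the loaded config, crawler selection now happens in the `Pferd` class after all crawlers have been loaded. The selection is also stricter: selecting the same crawler more than once is rejected, and selecting crawlers that do not exist produces an error message naming the unknown crawlers.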
This commit is contained in:
parent
59f13bb8d6
commit
a9af56a5e9
@ -6,7 +6,7 @@ from pathlib import Path
|
|||||||
from .cli import PARSER, load_default_section
|
from .cli import PARSER, load_default_section
|
||||||
from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
|
from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
|
||||||
from .logging import log
|
from .logging import log
|
||||||
from .pferd import Pferd
|
from .pferd import Pferd, PferdLoadError
|
||||||
from .transformer import RuleParseError
|
from .transformer import RuleParseError
|
||||||
from .version import NAME, VERSION
|
from .version import NAME, VERSION
|
||||||
|
|
||||||
@ -24,28 +24,10 @@ def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser:
|
|||||||
args.command(args, parser)
|
args.command(args, parser)
|
||||||
|
|
||||||
load_default_section(args, parser)
|
load_default_section(args, parser)
|
||||||
prune_crawlers(args, parser)
|
|
||||||
|
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
def prune_crawlers(
|
|
||||||
args: argparse.Namespace,
|
|
||||||
parser: configparser.ConfigParser,
|
|
||||||
) -> None:
|
|
||||||
if not args.crawler:
|
|
||||||
return
|
|
||||||
|
|
||||||
for section in parser.sections():
|
|
||||||
if section.startswith("crawl:"):
|
|
||||||
# TODO Use removeprefix() when switching to 3.9
|
|
||||||
name = section[len("crawl:"):]
|
|
||||||
if name not in args.crawler:
|
|
||||||
parser.remove_section(section)
|
|
||||||
|
|
||||||
# TODO Check if crawlers actually exist
|
|
||||||
|
|
||||||
|
|
||||||
def load_config(args: argparse.Namespace) -> Config:
|
def load_config(args: argparse.Namespace) -> Config:
|
||||||
try:
|
try:
|
||||||
return Config(load_config_parser(args))
|
return Config(load_config_parser(args))
|
||||||
@ -119,9 +101,9 @@ def main() -> None:
|
|||||||
exit()
|
exit()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
pferd = Pferd(config)
|
pferd = Pferd(config, args.crawler)
|
||||||
asyncio.run(pferd.run())
|
asyncio.run(pferd.run())
|
||||||
except ConfigOptionError as e:
|
except (PferdLoadError, ConfigOptionError) as e:
|
||||||
log.unlock()
|
log.unlock()
|
||||||
log.error(str(e))
|
log.error(str(e))
|
||||||
exit(1)
|
exit(1)
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from typing import Dict
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
from rich.markup import escape
|
from rich.markup import escape
|
||||||
|
|
||||||
@ -10,13 +10,22 @@ from .crawlers import CRAWLERS
|
|||||||
from .logging import log
|
from .logging import log
|
||||||
|
|
||||||
|
|
||||||
|
class PferdLoadError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Pferd:
|
class Pferd:
|
||||||
def __init__(self, config: Config):
|
def __init__(self, config: Config, crawlers_to_run: Optional[List[str]]):
|
||||||
"""
|
"""
|
||||||
May throw ConfigOptionError.
|
May throw PferdLoadError.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if crawlers_to_run is not None and len(crawlers_to_run) != len(set(crawlers_to_run)):
|
||||||
|
raise PferdLoadError("Some crawlers were selected multiple times")
|
||||||
|
|
||||||
self._config = config
|
self._config = config
|
||||||
|
self._crawlers_to_run = crawlers_to_run
|
||||||
|
|
||||||
self._authenticators: Dict[str, Authenticator] = {}
|
self._authenticators: Dict[str, Authenticator] = {}
|
||||||
self._crawlers: Dict[str, Crawler] = {}
|
self._crawlers: Dict[str, Crawler] = {}
|
||||||
|
|
||||||
@ -31,9 +40,13 @@ class Pferd:
|
|||||||
authenticator = authenticator_constructor(name, section, self._config)
|
authenticator = authenticator_constructor(name, section, self._config)
|
||||||
self._authenticators[name] = authenticator
|
self._authenticators[name] = authenticator
|
||||||
|
|
||||||
def _load_crawlers(self) -> None:
|
def _load_crawlers(self) -> List[str]:
|
||||||
|
names = []
|
||||||
|
|
||||||
for name, section in self._config.crawler_sections():
|
for name, section in self._config.crawler_sections():
|
||||||
log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
|
log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
|
||||||
|
names.append(name)
|
||||||
|
|
||||||
crawl_type = section.get("type")
|
crawl_type = section.get("type")
|
||||||
crawler_constructor = CRAWLERS.get(crawl_type)
|
crawler_constructor = CRAWLERS.get(crawl_type)
|
||||||
if crawler_constructor is None:
|
if crawler_constructor is None:
|
||||||
@ -42,15 +55,56 @@ class Pferd:
|
|||||||
crawler = crawler_constructor(name, section, self._config, self._authenticators)
|
crawler = crawler_constructor(name, section, self._config, self._authenticators)
|
||||||
self._crawlers[name] = crawler
|
self._crawlers[name] = crawler
|
||||||
|
|
||||||
|
return names
|
||||||
|
|
||||||
|
def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]:
|
||||||
|
log.explain_topic("Deciding which crawlers to run")
|
||||||
|
|
||||||
|
if self._crawlers_to_run is None:
|
||||||
|
log.explain("No crawlers specified on CLI")
|
||||||
|
log.explain("Running all loaded crawlers")
|
||||||
|
return loaded_crawlers
|
||||||
|
|
||||||
|
log.explain("Crawlers specified on CLI")
|
||||||
|
|
||||||
|
names: List[str] = [] # With 'crawl:' prefix
|
||||||
|
unknown_names = [] # Without 'crawl:' prefix
|
||||||
|
|
||||||
|
for name in self._crawlers_to_run:
|
||||||
|
section_name = f"crawl:{name}"
|
||||||
|
if section_name in self._crawlers:
|
||||||
|
log.explain(f"Found crawler section named {section_name!r}")
|
||||||
|
names.append(section_name)
|
||||||
|
else:
|
||||||
|
log.explain(f"There's no crawler section named {section_name!r}")
|
||||||
|
unknown_names.append(name)
|
||||||
|
|
||||||
|
if unknown_names:
|
||||||
|
if len(unknown_names) == 1:
|
||||||
|
[name] = unknown_names
|
||||||
|
raise PferdLoadError(f"There is no crawler named {name!r}")
|
||||||
|
else:
|
||||||
|
names_str = ", ".join(repr(name) for name in unknown_names)
|
||||||
|
raise PferdLoadError(f"There are no crawlers named {names_str}")
|
||||||
|
|
||||||
|
return names
|
||||||
|
|
||||||
async def run(self) -> None:
|
async def run(self) -> None:
|
||||||
|
"""
|
||||||
|
May throw PferdLoadError or ConfigOptionError.
|
||||||
|
"""
|
||||||
|
|
||||||
# These two functions must run inside the same event loop as the
|
# These two functions must run inside the same event loop as the
|
||||||
# crawlers, so that any new objects (like Conditions or Futures) can
|
# crawlers, so that any new objects (like Conditions or Futures) can
|
||||||
# obtain the correct event loop.
|
# obtain the correct event loop.
|
||||||
self._load_authenticators()
|
self._load_authenticators()
|
||||||
self._load_crawlers()
|
loaded_crawlers = self._load_crawlers()
|
||||||
|
|
||||||
for name, crawler in self._crawlers.items():
|
|
||||||
log.print("")
|
log.print("")
|
||||||
|
|
||||||
|
for name in self._find_crawlers_to_run(loaded_crawlers):
|
||||||
|
crawler = self._crawlers[name]
|
||||||
|
|
||||||
log.print(f"[bold bright_cyan]Running[/] {escape(name)}")
|
log.print(f"[bold bright_cyan]Running[/] {escape(name)}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
Loading…
Reference in New Issue
Block a user