From 9e12e96d90a7c5b76544c8379b289510902469ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julius=20R=C3=BCberg?= <22551563+Toorero@users.noreply.github.com>
Date: Tue, 2 Nov 2021 03:42:08 +0100
Subject: [PATCH] Added alias functionality

---
 CONFIG.md      |  4 ++++
 PFERD/pferd.py | 24 +++++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/CONFIG.md b/CONFIG.md
index 8ccaa50..0775eb1 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -86,6 +86,9 @@ common to all crawlers:
   load for the crawl target. (Default: `0.0`)
 - `windows_paths`: Whether PFERD should find alternative names for paths that
   are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
+- `alias`: List of strings that are treated as aliases when invoking with the
+  `--crawler` or `-C` flag. If more than one crawl section has the same alias,
+  all of them are selected, so aliases can be used to group crawlers.

 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
@@ -100,6 +103,7 @@ username = foo
 password = bar

 [crawl:something]
+alias = [sth, some]
 type = some-complex-crawler
 auth = auth:example
 on_conflict = no-delete
diff --git a/PFERD/pferd.py b/PFERD/pferd.py
index 726ed45..6ed31a0 100644
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Set

 from rich.markup import escape

@@ -43,16 +43,22 @@ class Pferd:

         crawl_sections = [name for name, _ in config.crawl_sections()]

-        crawlers_to_run = []  # With crawl: prefix
+        crawlers_to_run = set()  # With crawl: prefix
         unknown_names = []  # Without crawl: prefix

         for name in cli_crawlers:
             section_name = f"crawl:{name}"
             if section_name in crawl_sections:
                 log.explain(f"Crawler section named {section_name!r} exists")
-                crawlers_to_run.append(section_name)
-            else:
-                log.explain(f"There's no crawler section named {section_name!r}")
+                crawlers_to_run.add(section_name)
+            alias_names = self._find_crawlers_by_alias(name, config)
+            if alias_names:
+                crawlers_to_run.update(alias_names)
+                log.explain_topic(f"Crawler alias {name!r} matches these crawler sections:")
+                for alias_name in alias_names:
+                    log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
+            if section_name not in crawl_sections and not alias_names:
+                log.explain(f"There's neither a crawler section named {section_name!r} nor a crawler with alias {name!r}")
                 unknown_names.append(name)

         if unknown_names:
@@ -65,6 +71,14 @@ class Pferd:

         return crawlers_to_run

+    def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
+        alias_names = set()
+        for section_name, section in config.crawl_sections():
+            section_alias = section.get("alias", [])
+            if alias in section_alias:
+                alias_names.add(section_name)
+        return alias_names
+
     def _find_crawlers_to_run(
         self,
         config: Config,
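
For illustration, a minimal config sketch of how the new `alias` option could be used to group crawlers. The section names `crawl:lectures` and `crawl:exercises` and the alias values `uni` and `sheets` are hypothetical; only the `alias` key, the `some-complex-crawler` and `auth:example` placeholders, and the `--crawler`/`-C` flag come from the patch above.

```ini
; Hypothetical pferd.cfg excerpt (names invented for illustration)
[crawl:lectures]
type = some-complex-crawler
auth = auth:example
alias = [uni]

[crawl:exercises]
type = some-complex-crawler
auth = auth:example
alias = [uni, sheets]
```

With such a configuration, `pferd --crawler uni` (or `pferd -C uni`) would select both sections, while `pferd -C sheets` would select only `crawl:exercises`. One caveat, assuming `config.crawl_sections()` yields plain `configparser` sections: `section.get("alias", [])` then returns the raw option string (e.g. `"[uni, sheets]"`), so the `alias in section_alias` test in `_find_crawlers_by_alias` behaves as a substring match rather than an exact lookup against a parsed list.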