Add 'skip' option to crawlers

Joscha 2021-06-04 18:33:02 +02:00
parent fc31100a0f
commit df3ad3d890
6 changed files with 48 additions and 14 deletions


```diff
@@ -22,6 +22,9 @@ ambiguous situations.
 ## Unreleased
 
+### Added
+- `skip` option for crawlers
+
 ### Changed
 - Use `/` instead of `\` as path separator for (regex) rules on Windows
```


```diff
@@ -49,6 +49,9 @@ see the type's [documentation](#crawler-types) below. The following options are
 common to all crawlers:
 
 - `type`: The available types are specified in [this section](#crawler-types).
+- `skip`: Whether the crawler should be skipped during normal execution. The
+  crawler can still be executed manually using the `--crawler` or `-C` flags.
+  (Default: `no`)
 - `output_dir`: The directory the crawler synchronizes files to. A crawler will
   never place any files outside of this directory. (Default: the crawler's name)
 - `redownload`: When to download a file that is already present locally.
```
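For concreteness, a hypothetical config excerpt that disables one crawler without deleting its section (the section name and `type` value are placeholder values, not from this commit):

```ini
[crawl:old-course]
type = kit-ilias-web
skip = yes
```

The option only affects normal runs; selecting the crawler explicitly with `--crawler`/`-C` still executes it.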


```diff
@@ -13,7 +13,11 @@ class AuthError(Exception):
 
 
 class AuthSection(Section):
-    pass
+    def type(self) -> str:
+        value = self.s.get("type")
+        if value is None:
+            self.missing_value("type")
+        return value
 
 
 class Authenticator(ABC):
```
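Note on the accessor pattern: `type()` is annotated `-> str`, yet it would fall through and return `None` if `missing_value()` returned normally, so `missing_value()` presumably raises. A minimal sketch of that assumed contract (the exception class here is a placeholder, not PFERD's actual error type):

```python
from typing import NoReturn


class ConfigError(Exception):
    """Placeholder exception; PFERD defines its own config error type."""


class Section:
    # Assumed contract: missing_value() never returns, which is what lets
    # accessors like type() promise a non-None str.
    def missing_value(self, key: str) -> NoReturn:
        raise ConfigError(f"Missing value for option {key!r}")
```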


```diff
@@ -3,7 +3,7 @@ from typing import Callable, Dict
 
 from ..auth import Authenticator
 from ..config import Config
-from .crawler import Crawler, CrawlError  # noqa: F401
+from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401
 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
 from .local_crawler import LocalCrawler, LocalCrawlerSection
```
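The `# noqa: F401` marker suppresses flake8's unused-import warning: these imports exist only to re-export the names from the `crawl` package. `CrawlerSection` is added to the re-exports so that `pferd.py` can import it from `.crawl`, as the hunks below do.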


```diff
@@ -132,6 +132,15 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
 
 
 class CrawlerSection(Section):
+    def type(self) -> str:
+        value = self.s.get("type")
+        if value is None:
+            self.missing_value("type")
+        return value
+
+    def skip(self) -> bool:
+        return self.s.getboolean("skip", fallback=False)
+
     def output_dir(self, name: str) -> Path:
         # TODO Use removeprefix() after switching to 3.9
         if name.startswith("crawl:"):
```
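`skip()` relies on standard `configparser` boolean handling: a missing option falls back to `False`, the literals `yes`/`no`, `true`/`false`, `on`/`off`, and `1`/`0` are accepted, and anything else raises `ValueError`. A quick self-contained demonstration (section names and `type` values are placeholders):

```python
from configparser import ConfigParser

cp = ConfigParser()
cp.read_string("""
[crawl:a]
type = local

[crawl:b]
type = local
skip = on
""")

for name in ("crawl:a", "crawl:b"):
    # Absent option -> fallback; "on"/"yes"/"true"/"1" all parse as True
    print(name, cp[name].getboolean("skip", fallback=False))
# crawl:a False
# crawl:b True
```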


```diff
@@ -3,9 +3,9 @@ from typing import Dict, List, Optional
 
 from rich.markup import escape
 
-from .auth import AUTHENTICATORS, Authenticator, AuthError
+from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
+from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
```
```diff
@@ -26,19 +26,22 @@ class Pferd:
         self._authenticators: Dict[str, Authenticator] = {}
         self._crawlers: Dict[str, Crawler] = {}
 
-    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
-        log.explain_topic("Deciding which crawlers to run")
-        crawl_sections = [name for name, _ in config.crawl_sections()]
+    def _find_config_crawlers(self, config: Config) -> List[str]:
+        crawl_sections = []
 
-        if cli_crawlers is None:
-            log.explain("No crawlers specified on CLI")
-            log.explain("Running all crawlers specified in config")
-            return crawl_sections
+        for name, section in config.crawl_sections():
+            if CrawlerSection(section).skip():
+                log.explain(f"Skipping {name!r}")
+            else:
+                crawl_sections.append(name)
+
+        return crawl_sections
 
+    def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]:
         if len(cli_crawlers) != len(set(cli_crawlers)):
             raise PferdLoadError("Some crawlers were selected multiple times")
 
-        log.explain("Crawlers specified on CLI")
+        crawl_sections = [name for name, _ in config.crawl_sections()]
 
         crawlers_to_run = []  # With crawl: prefix
         unknown_names = []  # Without crawl: prefix
```
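The duplicate check works because `set()` collapses repeated names, so unequal lengths mean at least one crawler was selected twice:

```python
cli_crawlers = ["ilias", "local", "ilias"]
print(len(cli_crawlers), len(set(cli_crawlers)))  # 3 2 -> duplicates present
```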
```diff
@@ -62,10 +65,22 @@ class Pferd:
 
         return crawlers_to_run
 
+    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
+        log.explain_topic("Deciding which crawlers to run")
+
+        if cli_crawlers is None:
+            log.explain("No crawlers specified on CLI")
+            log.explain("Running crawlers specified in config")
+            return self._find_config_crawlers(config)
+        else:
+            log.explain("Crawlers specified on CLI")
+            return self._find_cli_crawlers(config, cli_crawlers)
+
     def _load_authenticators(self) -> None:
         for name, section in self._config.auth_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
 
-            auth_type = section.get("type")
+            auth_type = AuthSection(section).type()
             authenticator_constructor = AUTHENTICATORS.get(auth_type)
             if authenticator_constructor is None:
                 raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}")
```
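Taken together: with no CLI selection, all non-skipped config crawlers run; with an explicit `--crawler`/`-C` selection, exactly those run and `skip` is ignored. A condensed, self-contained sketch of that behavior (a hypothetical function with simplified name matching; the real code also resolves names against the `crawl:` prefix):

```python
from typing import Dict, List, Optional


def find_crawlers_to_run(sections: Dict[str, bool], cli: Optional[List[str]]) -> List[str]:
    """sections maps each crawler section name to the value of its skip option."""
    if cli is None:
        # Normal execution: honor the skip flag
        return [name for name, skipped in sections.items() if not skipped]
    if len(cli) != len(set(cli)):
        raise ValueError("Some crawlers were selected multiple times")
    unknown = [name for name in cli if name not in sections]
    if unknown:
        raise ValueError(f"Unknown crawlers: {unknown}")
    # Manual selection: skip is deliberately bypassed
    return cli


sections = {"a": False, "b": True}
print(find_crawlers_to_run(sections, None))   # ['a']
print(find_crawlers_to_run(sections, ["b"]))  # ['b']
```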
```diff
@@ -80,7 +95,7 @@ class Pferd:
         for name, section in self._config.crawl_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
 
-            crawl_type = section.get("type")
+            crawl_type = CrawlerSection(section).type()
             crawler_constructor = CRAWLERS.get(crawl_type)
             if crawler_constructor is None:
                 raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}")
```
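A likely motivation for replacing `section.get("type")` with the typed accessors (here and for authenticators above): with plain `.get()`, a missing `type` option yields `None`, which then falls into the unknown-type branch and produces the misleading message `Unknown crawler type: None`. The accessor reports the missing option itself instead. A toy illustration of the old failure mode:

```python
CRAWLERS = {"local": object}  # stand-in registry

crawl_type = None  # what section.get("type") returns when the option is absent
if CRAWLERS.get(crawl_type) is None:
    # The old error blames the value rather than the missing option
    print(f"Unknown crawler type: {crawl_type!r}")  # Unknown crawler type: None
```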