diff --git a/CHANGELOG.md b/CHANGELOG.md
index 980f96e..32cbe77 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,9 @@ ambiguous situations.
 
 ## Unreleased
 
+### Added
+- `skip` option for crawlers
+
 ### Changed
 - Use `/` instead of `\` as path separator for (regex) rules on Windows
 
diff --git a/CONFIG.md b/CONFIG.md
index feeade3..2f18be1 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -49,6 +49,9 @@ see the type's [documentation](#crawler-types) below.
 The following options are common to all crawlers:
 - `type`: The available types are specified in [this section](#crawler-types).
+- `skip`: Whether the crawler should be skipped during normal execution. The
+  crawler can still be executed manually using the `--crawler` or `-C` flags.
+  (Default: `no`)
 - `output_dir`: The directory the crawler synchronizes files to. A crawler
   will never place any files outside of this directory. (Default: the
   crawler's name)
 - `redownload`: When to download a file that is already present locally.
diff --git a/PFERD/auth/authenticator.py b/PFERD/auth/authenticator.py
index f588bc4..643a2d5 100644
--- a/PFERD/auth/authenticator.py
+++ b/PFERD/auth/authenticator.py
@@ -13,7 +13,11 @@ class AuthError(Exception):
 
 
 class AuthSection(Section):
-    pass
+    def type(self) -> str:
+        value = self.s.get("type")
+        if value is None:
+            self.missing_value("type")
+        return value
 
 
 class Authenticator(ABC):
diff --git a/PFERD/crawl/__init__.py b/PFERD/crawl/__init__.py
index 297c490..7eb2fb1 100644
--- a/PFERD/crawl/__init__.py
+++ b/PFERD/crawl/__init__.py
@@ -3,7 +3,7 @@ from typing import Callable, Dict
 
 from ..auth import Authenticator
 from ..config import Config
-from .crawler import Crawler, CrawlError  # noqa: F401
+from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401
 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
 from .local_crawler import LocalCrawler, LocalCrawlerSection
 
diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py
index e990f16..d61783f 100644
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@@ -132,6 +132,15 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
 
 
 class CrawlerSection(Section):
+    def type(self) -> str:
+        value = self.s.get("type")
+        if value is None:
+            self.missing_value("type")
+        return value
+
+    def skip(self) -> bool:
+        return self.s.getboolean("skip", fallback=False)
+
     def output_dir(self, name: str) -> Path:
         # TODO Use removeprefix() after switching to 3.9
         if name.startswith("crawl:"):
diff --git a/PFERD/pferd.py b/PFERD/pferd.py
index ac373cf..d98b426 100644
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -3,9 +3,9 @@ from typing import Dict, List, Optional
 
 from rich.markup import escape
 
-from .auth import AUTHENTICATORS, Authenticator, AuthError
+from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
+from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
 
@@ -26,19 +26,22 @@ class Pferd:
         self._authenticators: Dict[str, Authenticator] = {}
         self._crawlers: Dict[str, Crawler] = {}
 
-    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
-        log.explain_topic("Deciding which crawlers to run")
-        crawl_sections = [name for name, _ in config.crawl_sections()]
+    def _find_config_crawlers(self, config: Config) -> List[str]:
+        crawl_sections = []
 
-        if cli_crawlers is None:
-            log.explain("No crawlers specified on CLI")
-            log.explain("Running all crawlers specified in config")
-            return crawl_sections
+        for name, section in config.crawl_sections():
+            if CrawlerSection(section).skip():
+                log.explain(f"Skipping {name!r}")
+            else:
+                crawl_sections.append(name)
 
+        return crawl_sections
+
+    def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]:
         if len(cli_crawlers) != len(set(cli_crawlers)):
             raise PferdLoadError("Some crawlers were selected multiple times")
 
-        log.explain("Crawlers specified on CLI")
-
+        crawl_sections = [name for name, _ in config.crawl_sections()]
+
         crawlers_to_run = []  # With crawl: prefix
         unknown_names = []  # Without crawl: prefix
@@ -62,10 +65,22 @@ class Pferd:
 
         return crawlers_to_run
 
+    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
+        log.explain_topic("Deciding which crawlers to run")
+
+        if cli_crawlers is None:
+            log.explain("No crawlers specified on CLI")
+            log.explain("Running crawlers specified in config")
+            return self._find_config_crawlers(config)
+        else:
+            log.explain("Crawlers specified on CLI")
+            return self._find_cli_crawlers(config, cli_crawlers)
+
     def _load_authenticators(self) -> None:
         for name, section in self._config.auth_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
-            auth_type = section.get("type")
+
+            auth_type = AuthSection(section).type()
             authenticator_constructor = AUTHENTICATORS.get(auth_type)
             if authenticator_constructor is None:
                 raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}")
@@ -80,7 +95,7 @@ class Pferd:
         for name, section in self._config.crawl_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
 
-            crawl_type = section.get("type")
+            crawl_type = CrawlerSection(section).type()
             crawler_constructor = CRAWLERS.get(crawl_type)
             if crawler_constructor is None:
                 raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}")