Add 'skip' option to crawlers

Joscha 2021-06-04 18:33:02 +02:00
parent fc31100a0f
commit df3ad3d890
6 changed files with 48 additions and 14 deletions


@@ -22,6 +22,9 @@ ambiguous situations.
 ## Unreleased

+### Added
+- `skip` option for crawlers
+
 ### Changed
 - Use `/` instead of `\` as path separator for (regex) rules on Windows


@@ -49,6 +49,9 @@ see the type's [documentation](#crawler-types) below. The following options are
 common to all crawlers:

 - `type`: The available types are specified in [this section](#crawler-types).
+- `skip`: Whether the crawler should be skipped during normal execution. The
+  crawler can still be executed manually using the `--crawler` or `-C` flags.
+  (Default: `no`)
 - `output_dir`: The directory the crawler synchronizes files to. A crawler will
   never place any files outside of this directory. (Default: the crawler's name)
 - `redownload`: When to download a file that is already present locally.
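For illustration, a crawler section using the new option might look like this in the INI config. The section name, type value and output directory here are invented; only `skip` itself comes from the docs above.

```ini
# Hypothetical crawler section; type and paths are placeholders.
[crawl:lectures]
type = kit-ilias-web
output_dir = Lectures
# Not run on a plain invocation, but still selectable via --crawler / -C.
skip = yes
```

On a normal run this crawler would be left out; naming it explicitly on the command line (as described above) would still execute it.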


@@ -13,7 +13,11 @@ class AuthError(Exception):
 class AuthSection(Section):
-    pass
+    def type(self) -> str:
+        value = self.s.get("type")
+        if value is None:
+            self.missing_value("type")
+        return value


 class Authenticator(ABC):
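Moving the `type` lookup into `AuthSection` means a missing `type` key can now be reported as a section-scoped config error via `missing_value` (which presumably raises), instead of quietly producing `None`. A small standalone sketch of the old failure mode, using plain `configparser` and a made-up section:

```python
from configparser import ConfigParser

parser = ConfigParser()
parser.read_string("""
[auth:ilias]
username = someone
""")  # hypothetical section, deliberately missing its "type" key

section = parser["auth:ilias"]

# Previous behaviour in Pferd._load_authenticators: a missing option just
# comes back as None, and the problem only surfaces later as
# "Unknown authenticator type: None".
print(section.get("type"))  # -> None
```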


@@ -3,7 +3,7 @@ from typing import Callable, Dict
 from ..auth import Authenticator
 from ..config import Config
-from .crawler import Crawler, CrawlError  # noqa: F401
+from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401
 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
 from .local_crawler import LocalCrawler, LocalCrawlerSection
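The widened re-export is what allows `pferd.py` (further down) to pull `CrawlerSection` out of the `crawl` package instead of reaching into the `crawler` module; the `# noqa: F401` silences flake8's unused-import warning for names that exist purely to be re-exported. Assuming the usual `PFERD` package layout, this import now works:

```python
from PFERD.crawl import CrawlerSection  # defined in crawler.py, re-exported here
```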


@@ -132,6 +132,15 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
 class CrawlerSection(Section):
+    def type(self) -> str:
+        value = self.s.get("type")
+        if value is None:
+            self.missing_value("type")
+        return value
+
+    def skip(self) -> bool:
+        return self.s.getboolean("skip", fallback=False)
+
     def output_dir(self, name: str) -> Path:
         # TODO Use removeprefix() after switching to 3.9
         if name.startswith("crawl:"):
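`skip()` delegates to `configparser`'s `getboolean`, so an absent option counts as `False` and the usual INI boolean spellings are understood. A standalone check of exactly that lookup, with made-up section contents:

```python
from configparser import ConfigParser

parser = ConfigParser()
parser.read_string("""
[crawl:a]
type = local

[crawl:b]
type = local
skip = yes
""")

# The same call CrawlerSection.skip() makes: missing keys fall back to False,
# and configparser accepts yes/no, true/false, on/off and 1/0.
for name in ("crawl:a", "crawl:b"):
    print(name, parser[name].getboolean("skip", fallback=False))
# crawl:a False
# crawl:b True
```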


@@ -3,9 +3,9 @@ from typing import Dict, List, Optional
 from rich.markup import escape

-from .auth import AUTHENTICATORS, Authenticator, AuthError
+from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
+from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
@@ -26,19 +26,22 @@ class Pferd:
         self._authenticators: Dict[str, Authenticator] = {}
         self._crawlers: Dict[str, Crawler] = {}

-    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
-        log.explain_topic("Deciding which crawlers to run")
-
-        crawl_sections = [name for name, _ in config.crawl_sections()]
-
-        if cli_crawlers is None:
-            log.explain("No crawlers specified on CLI")
-            log.explain("Running all crawlers specified in config")
-            return crawl_sections
+    def _find_config_crawlers(self, config: Config) -> List[str]:
+        crawl_sections = []
+
+        for name, section in config.crawl_sections():
+            if CrawlerSection(section).skip():
+                log.explain(f"Skipping {name!r}")
+            else:
+                crawl_sections.append(name)
+
+        return crawl_sections

+    def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]:
         if len(cli_crawlers) != len(set(cli_crawlers)):
             raise PferdLoadError("Some crawlers were selected multiple times")

-        log.explain("Crawlers specified on CLI")
+        crawl_sections = [name for name, _ in config.crawl_sections()]

         crawlers_to_run = []  # With crawl: prefix
         unknown_names = []  # Without crawl: prefix
@@ -62,10 +65,22 @@
         return crawlers_to_run

+    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
+        log.explain_topic("Deciding which crawlers to run")
+
+        if cli_crawlers is None:
+            log.explain("No crawlers specified on CLI")
+            log.explain("Running crawlers specified in config")
+            return self._find_config_crawlers(config)
+        else:
+            log.explain("Crawlers specified on CLI")
+            return self._find_cli_crawlers(config, cli_crawlers)
+
     def _load_authenticators(self) -> None:
         for name, section in self._config.auth_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
-            auth_type = section.get("type")
+            auth_type = AuthSection(section).type()
             authenticator_constructor = AUTHENTICATORS.get(auth_type)
             if authenticator_constructor is None:
                 raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}")
@@ -80,7 +95,7 @@ class Pferd:
         for name, section in self._config.crawl_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")

-            crawl_type = section.get("type")
+            crawl_type = CrawlerSection(section).type()
             crawler_constructor = CRAWLERS.get(crawl_type)
             if crawler_constructor is None:
                 raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}")
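Taken together, crawler selection is now split in two: `_find_config_crawlers` honours `skip`, `_find_cli_crawlers` deliberately ignores it so an explicitly named crawler always runs, and `_find_crawlers_to_run` only dispatches between them. The following is a simplified, standalone sketch of that decision flow, using plain dicts instead of Pferd's `Config`/`CrawlerSection` objects and without the real code's `crawl:`-prefix handling:

```python
from typing import Dict, List, Optional


def find_crawlers_to_run(
    crawl_sections: Dict[str, bool],    # section name -> value of its skip option
    cli_crawlers: Optional[List[str]],  # names given via --crawler / -C, or None
) -> List[str]:
    if cli_crawlers is None:
        # Normal run: every configured crawler whose skip option is false.
        return [name for name, skip in crawl_sections.items() if not skip]

    # Explicit CLI selection: skip is not consulted, so a skipped crawler
    # can still be run manually.
    if len(cli_crawlers) != len(set(cli_crawlers)):
        raise ValueError("Some crawlers were selected multiple times")
    return [name for name in crawl_sections if name in cli_crawlers]


sections = {"crawl:lectures": True, "crawl:exercises": False}
print(find_crawlers_to_run(sections, None))                # ['crawl:exercises']
print(find_crawlers_to_run(sections, ["crawl:lectures"]))  # ['crawl:lectures']
```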