Mirror of https://github.com/Garmelon/PFERD.git
Add 'skip' option to crawlers

parent fc31100a0f
commit df3ad3d890
CHANGELOG.md:

@@ -22,6 +22,9 @@ ambiguous situations.
 
 ## Unreleased
 
+### Added
+- `skip` option for crawlers
+
 ### Changed
 - Use `/` instead of `\` as path separator for (regex) rules on Windows
 
CONFIG.md:

@@ -49,6 +49,9 @@ see the type's [documentation](#crawler-types) below. The following options are
 common to all crawlers:
 
 - `type`: The available types are specified in [this section](#crawler-types).
+- `skip`: Whether the crawler should be skipped during normal execution. The
+  crawler can still be executed manually using the `--crawler` or `-C` flags.
+  (Default: `no`)
 - `output_dir`: The directory the crawler synchronizes files to. A crawler will
   never place any files outside of this directory. (Default: the crawler's name)
 - `redownload`: When to download a file that is already present locally.
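For illustration, here is a minimal config sketch using the new option. The section name and the `type` value are made up for this example; only the `skip = yes` line comes from this commit:

```ini
[crawl:old-course]
# Hypothetical crawler type; see the crawler-types section of CONFIG.md.
type = local
# New in this commit: skip this crawler during normal runs.
skip = yes
```

A crawler configured like this would still run when selected explicitly via the `--crawler` or `-C` flags described in the option text above.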
PFERD/auth/authenticator.py:

@@ -13,7 +13,11 @@ class AuthError(Exception):
 
 
 class AuthSection(Section):
-    pass
+    def type(self) -> str:
+        value = self.s.get("type")
+        if value is None:
+            self.missing_value("type")
+        return value
 
 
 class Authenticator(ABC):
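A note on the control flow above: `type()` is annotated `-> str` even though `self.s.get("type")` can return `None`, which only type-checks if `missing_value` never returns, i.e. raises. The following is a rough, hypothetical sketch of the `Section` base class this pattern relies on; apart from `missing_value` and the `self.s` attribute, the names and error type are assumptions, not the project's actual code:

```python
from configparser import SectionProxy
from typing import NoReturn


class Section:
    """Hypothetical sketch of the config-section wrapper the diff builds on."""

    def __init__(self, section: SectionProxy):
        self.s = section

    def missing_value(self, key: str) -> NoReturn:
        # Assumed behavior: raise instead of returning, so callers can
        # treat the option as present after this call.
        raise ValueError(f"Missing value {key!r} in section {self.s.name!r}")
```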
PFERD/crawl/__init__.py:

@@ -3,7 +3,7 @@ from typing import Callable, Dict
 
 from ..auth import Authenticator
 from ..config import Config
-from .crawler import Crawler, CrawlError  # noqa: F401
+from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401
 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
 from .local_crawler import LocalCrawler, LocalCrawlerSection
 
PFERD/crawl/crawler.py:

@@ -132,6 +132,15 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
 
 
 class CrawlerSection(Section):
+    def type(self) -> str:
+        value = self.s.get("type")
+        if value is None:
+            self.missing_value("type")
+        return value
+
+    def skip(self) -> bool:
+        return self.s.getboolean("skip", fallback=False)
+
     def output_dir(self, name: str) -> Path:
         # TODO Use removeprefix() after switching to 3.9
         if name.startswith("crawl:"):
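`skip()` delegates to `configparser`'s standard boolean parsing, so `skip = yes`, `true`, `on`, or `1` all enable it, and `fallback=False` makes skipping opt-in when the key is absent. A self-contained sketch of that stdlib behavior (section names are toy examples):

```python
from configparser import ConfigParser

parser = ConfigParser()
parser.read_string("""
[crawl:a]
skip = yes

[crawl:b]
type = local
""")

# getboolean() accepts yes/no, true/false, on/off, 1/0;
# the fallback applies when the key is missing.
print(parser["crawl:a"].getboolean("skip", fallback=False))  # True
print(parser["crawl:b"].getboolean("skip", fallback=False))  # False
```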
PFERD/pferd.py:

@@ -3,9 +3,9 @@ from typing import Dict, List, Optional
 
 from rich.markup import escape
 
-from .auth import AUTHENTICATORS, Authenticator, AuthError
+from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
+from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
 
@@ -26,19 +26,22 @@ class Pferd:
         self._authenticators: Dict[str, Authenticator] = {}
         self._crawlers: Dict[str, Crawler] = {}
 
-    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
-        log.explain_topic("Deciding which crawlers to run")
-        crawl_sections = [name for name, _ in config.crawl_sections()]
+    def _find_config_crawlers(self, config: Config) -> List[str]:
+        crawl_sections = []
 
-        if cli_crawlers is None:
-            log.explain("No crawlers specified on CLI")
-            log.explain("Running all crawlers specified in config")
-            return crawl_sections
+        for name, section in config.crawl_sections():
+            if CrawlerSection(section).skip():
+                log.explain(f"Skipping {name!r}")
+            else:
+                crawl_sections.append(name)
+
+        return crawl_sections
 
+    def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]:
         if len(cli_crawlers) != len(set(cli_crawlers)):
             raise PferdLoadError("Some crawlers were selected multiple times")
 
-        log.explain("Crawlers specified on CLI")
+        crawl_sections = [name for name, _ in config.crawl_sections()]
 
         crawlers_to_run = []  # With crawl: prefix
         unknown_names = []  # Without crawl: prefix
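Together with the next hunk, this split encodes a single rule: crawlers picked up implicitly from the config honor `skip`, while crawlers named explicitly on the CLI run regardless. A condensed, self-contained sketch of that rule, using toy data rather than the project's config API:

```python
from typing import Dict, List, Optional


def find_crawlers_to_run(sections: Dict[str, bool], cli: Optional[List[str]]) -> List[str]:
    """Toy model: sections maps crawler name to its skip flag."""
    if cli is None:
        # Config-driven run: leave out crawlers marked skip.
        return [name for name, skip in sections.items() if not skip]
    # Explicit CLI selection bypasses the skip flag entirely.
    return cli


sections = {"crawl:a": False, "crawl:b": True}
print(find_crawlers_to_run(sections, None))         # ['crawl:a']
print(find_crawlers_to_run(sections, ["crawl:b"]))  # ['crawl:b']
```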
@@ -62,10 +65,22 @@ class Pferd:
 
         return crawlers_to_run
 
+    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
+        log.explain_topic("Deciding which crawlers to run")
+
+        if cli_crawlers is None:
+            log.explain("No crawlers specified on CLI")
+            log.explain("Running crawlers specified in config")
+            return self._find_config_crawlers(config)
+        else:
+            log.explain("Crawlers specified on CLI")
+            return self._find_cli_crawlers(config, cli_crawlers)
+
     def _load_authenticators(self) -> None:
         for name, section in self._config.auth_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
-            auth_type = section.get("type")
+
+            auth_type = AuthSection(section).type()
             authenticator_constructor = AUTHENTICATORS.get(auth_type)
             if authenticator_constructor is None:
                 raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}")
@@ -80,7 +95,7 @@ class Pferd:
         for name, section in self._config.crawl_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
 
-            crawl_type = section.get("type")
+            crawl_type = CrawlerSection(section).type()
             crawler_constructor = CRAWLERS.get(crawl_type)
             if crawler_constructor is None:
                 raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}")