Mirror of https://github.com/Garmelon/PFERD.git

	Add 'skip' option to crawlers

CHANGELOG.md
@@ -22,6 +22,9 @@ ambiguous situations.
 
 ## Unreleased
 
+### Added
+- `skip` option for crawlers
+
 ### Changed
 - Use `/` instead of `\` as path separator for (regex) rules on Windows

CONFIG.md
@@ -49,6 +49,9 @@ see the type's [documentation](#crawler-types) below. The following options are
 common to all crawlers:
 
 - `type`: The available types are specified in [this section](#crawler-types).
+- `skip`: Whether the crawler should be skipped during normal execution. The
+  crawler can still be executed manually using the `--crawler` or `-C` flags.
+  (Default: `no`)
 - `output_dir`: The directory the crawler synchronizes files to. A crawler will
   never place any files outside of this directory. (Default: the crawler's name)
 - `redownload`: When to download a file that is already present locally.
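
For illustration, a config section using the new option might look like this. The `[crawl:...]` section format, the `local` crawler type, and its `target` option follow PFERD's CONFIG.md; the section name and path are hypothetical:

[crawl:old-course]
type = local
target = ./old-course-files
skip = yes

Such a crawler is left out of a plain `pferd` run, but can still be invoked explicitly with `pferd --crawler old-course` (or `pferd -C old-course`); per the diff below, CLI names are given without the `crawl:` prefix.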

PFERD/auth/authenticator.py
@@ -13,7 +13,11 @@ class AuthError(Exception):
 
 
 class AuthSection(Section):
-    pass
+    def type(self) -> str:
+        value = self.s.get("type")
+        if value is None:
+            self.missing_value("type")
+        return value
 
 
 class Authenticator(ABC):
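
This accessor pattern only type-checks because `missing_value` never returns, so after the `if` branch `value` is narrowed to `str`. A minimal sketch of the idea, with a hypothetical `Section` base class and plain `ValueError` standing in for PFERD's actual config machinery:

from configparser import ConfigParser, SectionProxy
from typing import NoReturn

class Section:
    def __init__(self, section: SectionProxy):
        self.s = section

    def missing_value(self, key: str) -> NoReturn:
        # Annotated NoReturn so type checkers know accessors calling
        # this cannot fall through and return None.
        raise ValueError(f"missing value for option {key!r}")

    def type(self) -> str:
        value = self.s.get("type")
        if value is None:
            self.missing_value("type")
        return value

parser = ConfigParser()
parser.read_string("[auth:demo]\ntype = simple\n")
print(Section(parser["auth:demo"]).type())  # simple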

PFERD/crawl/__init__.py
@@ -3,7 +3,7 @@ from typing import Callable, Dict
 
 from ..auth import Authenticator
 from ..config import Config
-from .crawler import Crawler, CrawlError  # noqa: F401
+from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401
 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
 from .local_crawler import LocalCrawler, LocalCrawlerSection
 

PFERD/crawl/crawler.py
@@ -132,6 +132,15 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
 
 
 class CrawlerSection(Section):
+    def type(self) -> str:
+        value = self.s.get("type")
+        if value is None:
+            self.missing_value("type")
+        return value
+
+    def skip(self) -> bool:
+        return self.s.getboolean("skip", fallback=False)
+
     def output_dir(self, name: str) -> Path:
         # TODO Use removeprefix() after switching to 3.9
         if name.startswith("crawl:"):
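
`skip()` delegates to the standard library's `configparser.getboolean`, so besides the documented default of `no`, any boolean spelling configparser accepts works. A standalone illustration (not PFERD code):

from configparser import ConfigParser

parser = ConfigParser()
parser.read_string("""
[crawl:a]
skip = yes

[crawl:b]
""")

# configparser maps 1/yes/true/on to True and 0/no/false/off to False;
# fallback=False mirrors the option's documented default of `no`.
print(parser["crawl:a"].getboolean("skip", fallback=False))  # True
print(parser["crawl:b"].getboolean("skip", fallback=False))  # False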

PFERD/pferd.py
@@ -3,9 +3,9 @@ from typing import Dict, List, Optional
 
 from rich.markup import escape
 
-from .auth import AUTHENTICATORS, Authenticator, AuthError
+from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
+from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
 
@@ -26,19 +26,22 @@ class Pferd:
 | 
			
		||||
        self._authenticators: Dict[str, Authenticator] = {}
 | 
			
		||||
        self._crawlers: Dict[str, Crawler] = {}
 | 
			
		||||
 | 
			
		||||
    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
 | 
			
		||||
        log.explain_topic("Deciding which crawlers to run")
 | 
			
		||||
        crawl_sections = [name for name, _ in config.crawl_sections()]
 | 
			
		||||
    def _find_config_crawlers(self, config: Config) -> List[str]:
 | 
			
		||||
        crawl_sections = []
 | 
			
		||||
 | 
			
		||||
        if cli_crawlers is None:
 | 
			
		||||
            log.explain("No crawlers specified on CLI")
 | 
			
		||||
            log.explain("Running all crawlers specified in config")
 | 
			
		||||
            return crawl_sections
 | 
			
		||||
        for name, section in config.crawl_sections():
 | 
			
		||||
            if CrawlerSection(section).skip():
 | 
			
		||||
                log.explain(f"Skipping {name!r}")
 | 
			
		||||
            else:
 | 
			
		||||
                crawl_sections.append(name)
 | 
			
		||||
 | 
			
		||||
        return crawl_sections
 | 
			
		||||
 | 
			
		||||
    def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]:
 | 
			
		||||
        if len(cli_crawlers) != len(set(cli_crawlers)):
 | 
			
		||||
            raise PferdLoadError("Some crawlers were selected multiple times")
 | 
			
		||||
 | 
			
		||||
        log.explain("Crawlers specified on CLI")
 | 
			
		||||
        crawl_sections = [name for name, _ in config.crawl_sections()]
 | 
			
		||||
 | 
			
		||||
        crawlers_to_run = []  # With crawl: prefix
 | 
			
		||||
        unknown_names = []  # Without crawl: prefix
 | 
			
		||||
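
The net effect of this split, sketched as standalone logic (simplified, with hypothetical names; the real CLI path also resolves the `crawl:` prefix and collects unknown names):

from typing import Dict, List, Optional

def find_crawlers_to_run(
    sections: Dict[str, bool],          # name -> value of the skip option
    cli_crawlers: Optional[List[str]],  # names from --crawler/-C, or None
) -> List[str]:
    if cli_crawlers is None:
        # Config-driven run: honor each section's skip option.
        return [name for name, skip in sections.items() if not skip]
    # CLI-driven run: explicitly selected crawlers run even if marked skip.
    return [name for name in cli_crawlers if name in sections]

sections = {"crawl:a": True, "crawl:b": False}
print(find_crawlers_to_run(sections, None))         # ['crawl:b']
print(find_crawlers_to_run(sections, ["crawl:a"]))  # ['crawl:a'], skip overridden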

@@ -62,10 +65,22 @@ class Pferd:
 
         return crawlers_to_run
 
+    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
+        log.explain_topic("Deciding which crawlers to run")
+
+        if cli_crawlers is None:
+            log.explain("No crawlers specified on CLI")
+            log.explain("Running crawlers specified in config")
+            return self._find_config_crawlers(config)
+        else:
+            log.explain("Crawlers specified on CLI")
+            return self._find_cli_crawlers(config, cli_crawlers)
+
     def _load_authenticators(self) -> None:
         for name, section in self._config.auth_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
-            auth_type = section.get("type")
+
+            auth_type = AuthSection(section).type()
             authenticator_constructor = AUTHENTICATORS.get(auth_type)
             if authenticator_constructor is None:
                 raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}")

@@ -80,7 +95,7 @@ class Pferd:
         for name, section in self._config.crawl_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
 
-            crawl_type = section.get("type")
+            crawl_type = CrawlerSection(section).type()
             crawler_constructor = CRAWLERS.get(crawl_type)
             if crawler_constructor is None:
                 raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}")
 | 
			
		||||
		Reference in New Issue
	
	Block a user