2021-05-24 13:10:19 +02:00
|
|
|
from pathlib import Path
|
2021-05-23 18:16:25 +02:00
|
|
|
from typing import Dict, List, Optional
|
2021-04-30 16:22:14 +02:00
|
|
|
|
|
|
|
from rich.markup import escape
|
|
|
|
|
2021-06-04 18:33:02 +02:00
|
|
|
from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection
|
2021-05-22 21:05:32 +02:00
|
|
|
from .config import Config, ConfigOptionError
|
2021-06-04 18:33:02 +02:00
|
|
|
from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler
|
2021-05-22 21:05:32 +02:00
|
|
|
from .logging import log
|
2021-05-23 21:13:06 +02:00
|
|
|
from .utils import fmt_path
|
2021-04-30 16:22:14 +02:00
|
|
|
|
|
|
|
|
2021-05-23 18:16:25 +02:00
|
|
|
class PferdLoadError(Exception):
    """Signals that a Pferd instance could not be set up from its inputs."""
|
|
|
|
|
|
|
|
|
2021-04-29 13:45:04 +02:00
|
|
|
class Pferd:
|
2021-07-01 11:01:55 +02:00
|
|
|
def __init__(
    self,
    config: Config,
    cli_crawlers: Optional[List[str]],
    cli_skips: Optional[List[str]],
):
    """
    Remember the config and decide which crawlers are going to be run.

    No authenticator or crawler is instantiated here; both registries
    start out empty.

    May throw PferdLoadError.
    """

    # Both registries are keyed by config section name.
    self._authenticators: Dict[str, Authenticator] = {}
    self._crawlers: Dict[str, Crawler] = {}

    self._config = config
    self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers, cli_skips)
|
|
|
|
|
2021-06-04 18:33:02 +02:00
|
|
|
def _find_config_crawlers(self, config: Config) -> List[str]:
    """
    Return the names of all crawl sections in the config that are not
    flagged to be skipped, in the order the config yields them.
    """

    selected: List[str] = []

    for section_name, section in config.crawl_sections():
        if CrawlerSection(section).skip():
            log.explain(f"Skipping {section_name!r}")
            continue

        selected.append(section_name)

    return selected
|
|
|
|
|
|
|
|
def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]:
    """
    Translate crawler names given on the CLI into config section names.

    Each name is looked up as "crawl:<name>" among the config's crawl
    sections. The matching section names are returned in CLI order.

    May throw PferdLoadError if a name was given more than once or if at
    least one name has no matching crawl section.
    """

    if len(set(cli_crawlers)) != len(cli_crawlers):
        raise PferdLoadError("Some crawlers were selected multiple times")

    known_sections = [section_name for section_name, _ in config.crawl_sections()]

    found = []  # Section names, including the "crawl:" prefix
    missing = []  # CLI names, without the "crawl:" prefix

    for cli_name in cli_crawlers:
        candidate = f"crawl:{cli_name}"

        if candidate not in known_sections:
            log.explain(f"There's no crawler section named {candidate!r}")
            missing.append(cli_name)
            continue

        log.explain(f"Crawler section named {candidate!r} exists")
        found.append(candidate)

    if not missing:
        return found

    # The wording of the error depends on how many names were unknown.
    if len(missing) == 1:
        raise PferdLoadError(f"There is no crawler named {missing[0]!r}")

    listing = ", ".join(repr(cli_name) for cli_name in missing)
    raise PferdLoadError(f"There are no crawlers named {listing}")
|
|
|
|
|
2021-07-01 11:01:55 +02:00
|
|
|
def _find_crawlers_to_run(
    self,
    config: Config,
    cli_crawlers: Optional[List[str]],
    cli_skips: Optional[List[str]],
) -> List[str]:
    """
    Decide which crawl sections are going to be run.

    If cli_crawlers is None, the selection comes from the config (see
    _find_config_crawlers), otherwise from the CLI names (see
    _find_cli_crawlers). Afterwards every section whose name equals
    "crawl:<name>" for some name in cli_skips is removed.

    May throw PferdLoadError (raised by _find_cli_crawlers).
    """

    log.explain_topic("Deciding which crawlers to run")

    candidates: List[str]
    if cli_crawlers is not None:
        log.explain("Crawlers specified on CLI")
        candidates = self._find_cli_crawlers(config, cli_crawlers)
    else:
        log.explain("No crawlers specified on CLI")
        log.explain("Running crawlers specified in config")
        candidates = self._find_config_crawlers(config)

    # A missing or empty skip list results in an empty set.
    skipped = {f"crawl:{name}" for name in cli_skips or []}

    remaining: List[str] = []
    for section_name in candidates:
        if section_name in skipped:
            log.explain(f"Skipping crawler {section_name!r}")
        else:
            remaining.append(section_name)

    return remaining
|
2021-06-04 18:33:02 +02:00
|
|
|
|
2021-05-11 00:27:43 +02:00
|
|
|
def _load_authenticators(self) -> None:
    """
    Instantiate an authenticator for every auth section of the config and
    register it in self._authenticators under its section name.

    May throw ConfigOptionError if a section's type has no entry in
    AUTHENTICATORS.
    """

    for section_name, section in self._config.auth_sections():
        log.print(f"[bold bright_cyan]Loading[/] {escape(section_name)}")

        type_name = AuthSection(section).type()
        make_authenticator = AUTHENTICATORS.get(type_name)
        if make_authenticator is None:
            raise ConfigOptionError(section_name, "type", f"Unknown authenticator type: {type_name!r}")

        self._authenticators[section_name] = make_authenticator(section_name, section, self._config)
|
|
|
|
|
2021-05-25 15:35:36 +02:00
|
|
|
def _load_crawlers(self) -> None:
    """
    Instantiate a crawler for every crawl section of the config and
    register it in self._crawlers under its section name.

    If the default section enables cookie sharing, every
    KitIliasWebCrawler is additionally handed one shared mapping via its
    share_cookies() method.

    May throw ConfigOptionError if a section's type has no entry in
    CRAWLERS.
    """

    # Cookie sharing: the same dict object is passed to every eligible crawler
    shared_cookie_paths: Dict[Authenticator, List[Path]] = {}

    for section_name, section in self._config.crawl_sections():
        log.print(f"[bold bright_cyan]Loading[/] {escape(section_name)}")

        type_name = CrawlerSection(section).type()
        make_crawler = CRAWLERS.get(type_name)
        if make_crawler is None:
            raise ConfigOptionError(section_name, "type", f"Unknown crawler type: {type_name!r}")

        loaded = make_crawler(section_name, section, self._config, self._authenticators)
        self._crawlers[section_name] = loaded

        if self._config.default_section.share_cookies() and isinstance(loaded, KitIliasWebCrawler):
            loaded.share_cookies(shared_cookie_paths)
|
|
|
|
|
2021-05-26 11:37:32 +02:00
|
|
|
def debug_transforms(self) -> None:
    """
    Call debug_transforms() on every crawler selected to run, printing an
    empty line and a heading before each one.
    """

    for section_name in self._crawlers_to_run:
        selected = self._crawlers[section_name]

        log.print("")
        log.print(f"[bold bright_cyan]Debugging transforms[/] for {escape(section_name)}")

        selected.debug_transforms()
|
|
|
|
|
|
|
|
async def run(self, debug_transforms: bool) -> None:
    """
    Load all authenticators and crawlers, then either debug the transforms
    of the selected crawlers (if debug_transforms is set) or run the
    selected crawlers one after another.

    A CrawlError or AuthError raised by a crawler is logged as an error
    and any other Exception is passed to log.unexpected_exception(); in
    both cases the loop carries on with the next crawler.

    May throw ConfigOptionError.
    """

    # Loading has to happen inside the same event loop the crawlers run
    # in, so that any new objects (like Conditions or Futures) can obtain
    # the correct event loop.
    self._load_authenticators()
    self._load_crawlers()

    if debug_transforms:
        # Show explanations instead of reports while debugging transforms
        log.output_explain = True
        log.output_report = False
        self.debug_transforms()
        return

    log.print("")

    for section_name in self._crawlers_to_run:
        selected = self._crawlers[section_name]

        log.print(f"[bold bright_cyan]Running[/] {escape(section_name)}")

        try:
            await selected.run()
        except (CrawlError, AuthError) as err:
            log.error(str(err))
        except Exception:
            log.unexpected_exception()
|
2021-05-23 21:13:06 +02:00
|
|
|
|
2021-05-25 15:35:36 +02:00
|
|
|
def print_report(self) -> None:
    """
    Emit a report for every crawler selected to run.

    For each crawler the added, changed, deleted and not deleted files of
    its report are listed (each group sorted), or a "Nothing changed" line
    if all four groups are empty. Selected crawlers missing from
    self._crawlers are left out.
    """

    def list_paths(label: str, paths) -> bool:
        # Reports each path in sorted order; tells whether there was any.
        any_listed = False
        for path in sorted(paths):
            any_listed = True
            log.report(f" {label} {fmt_path(path)}")
        return any_listed

    for section_name in self._crawlers_to_run:
        loaded = self._crawlers.get(section_name)
        if loaded is None:
            continue  # Crawler failed to load

        log.report("")
        log.report(f"[bold bright_cyan]Report[/] for {escape(section_name)}")

        # "|=" always evaluates its right side, so every group gets listed.
        changed = list_paths("[bold bright_green]Added[/]", loaded.report.added_files)
        changed |= list_paths("[bold bright_yellow]Changed[/]", loaded.report.changed_files)
        changed |= list_paths("[bold bright_magenta]Deleted[/]", loaded.report.deleted_files)
        changed |= list_paths("[bold bright_magenta]Not deleted[/]", loaded.report.not_deleted_files)

        if not changed:
            log.report(" Nothing changed")
|