pferd/PFERD/pferd.py

144 lines
5.2 KiB
Python
Raw Normal View History

2021-05-24 13:10:19 +02:00
from pathlib import Path
from typing import Dict, List, Optional
2021-04-30 16:22:14 +02:00
from rich.markup import escape
from .auth import AUTHENTICATORS, Authenticator, AuthError
2021-05-22 21:05:32 +02:00
from .config import Config, ConfigOptionError
2021-05-24 13:10:19 +02:00
from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
2021-05-22 21:05:32 +02:00
from .logging import log
from .utils import fmt_path
2021-04-30 16:22:14 +02:00
class PferdLoadError(Exception):
    """Raised when the crawler/authenticator configuration cannot be loaded
    (e.g. duplicate or unknown crawler names selected on the CLI)."""
    pass
2021-04-29 13:45:04 +02:00
class Pferd:
    """Loads authenticators and crawlers from the config and runs the crawlers."""

    def __init__(self, config: Config, crawlers_to_run: Optional[List[str]]):
        """
        May throw PferdLoadError.
        """
        # Selecting the same crawler more than once on the CLI is almost
        # certainly a mistake, so reject it up front.
        if crawlers_to_run is not None and len(set(crawlers_to_run)) < len(crawlers_to_run):
            raise PferdLoadError("Some crawlers were selected multiple times")

        self._config = config
        self._crawlers_to_run = crawlers_to_run

        # Filled in by _load_authenticators / _load_crawlers, keyed by section name
        self._authenticators: Dict[str, Authenticator] = {}
        self._crawlers: Dict[str, Crawler] = {}

    def _load_authenticators(self) -> None:
        """Instantiate an authenticator for every authenticator config section."""
        for name, section in self._config.authenticator_sections():
            log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")

            auth_type = section.get("type")
            constructor = AUTHENTICATORS.get(auth_type)
            if constructor is None:
                raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}")

            self._authenticators[name] = constructor(name, section, self._config)

    def _load_crawlers(self) -> List[str]:
        """Instantiate a crawler for every crawler config section.

        Returns the list of loaded crawler section names, in config order.
        """
        names = []

        # Cookie sharing: all KIT ILIAS web crawlers register here so they can
        # reuse each other's session cookies.
        kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {}

        for name, section in self._config.crawler_sections():
            log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
            names.append(name)

            crawl_type = section.get("type")
            constructor = CRAWLERS.get(crawl_type)
            if constructor is None:
                raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}")

            crawler = constructor(name, section, self._config, self._authenticators)
            self._crawlers[name] = crawler

            share = self._config.default_section.share_cookies()
            if share and isinstance(crawler, KitIliasWebCrawler):
                crawler.share_cookies(kit_ilias_web_paths)

        return names

    def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]:
        """Decide which of the loaded crawlers should actually run.

        With no CLI selection, all loaded crawlers run. Otherwise each CLI
        name is resolved to its 'crawl:'-prefixed section name; unknown names
        cause a PferdLoadError.
        """
        log.explain_topic("Deciding which crawlers to run")

        if self._crawlers_to_run is None:
            log.explain("No crawlers specified on CLI")
            log.explain("Running all loaded crawlers")
            return loaded_crawlers

        log.explain("Crawlers specified on CLI")

        names: List[str] = []  # With 'crawl:' prefix
        unknown_names = []  # Without 'crawl:' prefix

        for name in self._crawlers_to_run:
            section_name = f"crawl:{name}"
            if section_name in self._crawlers:
                log.explain(f"Crawler section named {section_name!r} exists")
                names.append(section_name)
            else:
                log.explain(f"There's no crawler section named {section_name!r}")
                unknown_names.append(name)

        if unknown_names:
            if len(unknown_names) == 1:
                name = unknown_names[0]
                raise PferdLoadError(f"There is no crawler named {name!r}")
            names_str = ", ".join(map(repr, unknown_names))
            raise PferdLoadError(f"There are no crawlers named {names_str}")

        return names

    async def run(self) -> None:
        """
        May throw PferdLoadError or ConfigOptionError.
        """
        # These two functions must run inside the same event loop as the
        # crawlers, so that any new objects (like Conditions or Futures) can
        # obtain the correct event loop.
        self._load_authenticators()
        loaded_crawlers = self._load_crawlers()

        names = self._find_crawlers_to_run(loaded_crawlers)

        log.print("")
        for name in names:
            crawler = self._crawlers[name]
            log.print(f"[bold bright_cyan]Running[/] {escape(name)}")
            try:
                await crawler.run()
            except (CrawlError, AuthError) as e:
                # Expected failure modes: report and continue with the next crawler
                log.error(str(e))
            except Exception:
                # Unexpected failures are logged with a traceback instead of crashing
                log.unexpected_exception()

        # Per-crawler change report, in the same order the crawlers ran
        for name in names:
            crawler = self._crawlers[name]
            log.report("")
            log.report(f"[bold bright_cyan]Report[/] for {escape(name)}")

            categories = [
                (crawler.report.added_files, "[bold bright_green]Added[/]"),
                (crawler.report.changed_files, "[bold bright_yellow]Changed[/]"),
                (crawler.report.deleted_files, "[bold bright_magenta]Deleted[/]"),
            ]

            something_changed = False
            for files, label in categories:
                for path in sorted(files):
                    something_changed = True
                    log.report(f" {label} {fmt_path(path)}")

            if not something_changed:
                log.report(" Nothing changed")