pferd/PFERD/crawl/crawler.py

import asyncio
import os
from abc import ABC, abstractmethod
from collections.abc import Awaitable, Coroutine
from datetime import datetime
from pathlib import Path, PurePath
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar

from ..auth import Authenticator
from ..config import Config, Section
from ..deduplicator import Deduplicator
from ..logging import ProgressBar, log
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
from ..report import MarkConflictError, MarkDuplicateError, Report
from ..transformer import Transformer
from ..utils import ReusableAsyncContextManager, fmt_path


class CrawlWarning(Exception):
    pass


class CrawlError(Exception):
    pass


Wrapped = TypeVar("Wrapped", bound=Callable[..., None])


def noncritical(f: Wrapped) -> Wrapped:
    """
    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    Warning: Must only be applied to member functions of the Crawler class!
    """

    def wrapper(*args: Any, **kwargs: Any) -> None:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@noncritical must only be applied to Crawler methods")
        crawler = args[0]

        try:
            f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            crawler.report.add_warning(str(e))
            log.warn(str(e))
            crawler.error_free = False
        except Exception as e:
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

    return wrapper  # type: ignore


AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]])


def anoncritical(f: AWrapped) -> AWrapped:
    """
    An async version of @noncritical.

    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    Warning: Must only be applied to member functions of the Crawler class!
    """

    async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@anoncritical must only be applied to Crawler methods")
        crawler = args[0]

        try:
            return await f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            log.warn(str(e))
            crawler.error_free = False
            crawler.report.add_warning(str(e))
        except Exception as e:
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

        return None

    return wrapper  # type: ignore
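

# Illustrative sketch (not part of the original file): how a Crawler subclass
# might use @anoncritical. MyCrawler and _crawl_url are hypothetical names.
#
#     class MyCrawler(Crawler):
#         @anoncritical
#         async def _crawl_url(self, url: str) -> None:
#             # A CrawlWarning raised here is logged, recorded in the report
#             # and marks the run as not error-free, but the crawl continues.
#             raise CrawlWarning(f"Could not crawl {url}")
#
# Any other exception is recorded as an error and re-raised.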


class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
    def __init__(self, path: PurePath):
        super().__init__()

        self._path = path

    @property
    def path(self) -> PurePath:
        return self._path

    async def _on_aenter(self) -> ProgressBar:
        self._stack.callback(lambda: log.status("[bold cyan]", "Crawled", fmt_path(self._path)))
        bar = self._stack.enter_context(log.crawl_bar("[bold bright_cyan]", "Crawling", fmt_path(self._path)))

        return bar


class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
    def __init__(self, fs_token: FileSinkToken, path: PurePath):
        super().__init__()

        self._fs_token = fs_token
        self._path = path

    @property
    def path(self) -> PurePath:
        return self._path

    async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
        sink = await self._stack.enter_async_context(self._fs_token)
        # The "Downloaded ..." message is printed in the output dir, not here
        bar = self._stack.enter_context(log.download_bar("[bold bright_cyan]", "Downloading",
                                                         fmt_path(self._path)))

        return bar, sink


class CrawlerSection(Section):
    def type(self) -> str:
        value = self.s.get("type")
        if value is None:
            self.missing_value("type")
        return value

    def skip(self) -> bool:
        return self.s.getboolean("skip", fallback=False)

    def output_dir(self, name: str) -> Path:
        # TODO Use removeprefix() after switching to 3.9
        if name.startswith("crawl:"):
            name = name[len("crawl:"):]
        return Path(self.s.get("output_dir", name)).expanduser()

    def redownload(self) -> Redownload:
        value = self.s.get("redownload", "never-smart")
        try:
            return Redownload.from_string(value)
        except ValueError as e:
            self.invalid_value(
                "redownload",
                value,
                str(e).capitalize(),
            )

    def on_conflict(self) -> OnConflict:
        value = self.s.get("on_conflict", "prompt")
        try:
            return OnConflict.from_string(value)
        except ValueError as e:
            self.invalid_value(
                "on_conflict",
                value,
                str(e).capitalize(),
            )

    def transform(self) -> str:
        return self.s.get("transform", "")

    def tasks(self) -> int:
        value = self.s.getint("tasks", fallback=1)
        if value <= 0:
            self.invalid_value("tasks", value, "Must be greater than 0")
        return value

    def downloads(self) -> int:
        tasks = self.tasks()
        value = self.s.getint("downloads", fallback=None)
        if value is None:
            return tasks
        if value <= 0:
            self.invalid_value("downloads", value, "Must be greater than 0")
        if value > tasks:
            self.invalid_value("downloads", value, "Must not be greater than tasks")
        return value

    def task_delay(self) -> float:
        value = self.s.getfloat("task_delay", fallback=0.0)
        if value < 0:
            self.invalid_value("task_delay", value, "Must not be negative")
        return value

    def windows_paths(self) -> bool:
        on_windows = os.name == "nt"
        return self.s.getboolean("windows_paths", fallback=on_windows)

    def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
        value = self.s.get("auth")
        if value is None:
            self.missing_value("auth")
        auth = authenticators.get(value)
        if auth is None:
            self.invalid_value("auth", value, "No such auth section exists")
        return auth
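

# Illustrative sketch (not part of the original file): a config section as read
# by CrawlerSection above, in the INI style the accessors expect. The section
# name and all values are hypothetical; "type" must name a registered crawler
# type and "auth" must name an existing auth section.
#
#     [crawl:example]
#     type = kit-ilias-web
#     auth = auth:example
#     output_dir = Example
#     redownload = never-smart
#     on_conflict = prompt
#     tasks = 1
#     downloads = 1
#     task_delay = 0.0
#     windows_paths = no
#     skip = no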


class Crawler(ABC):
    def __init__(
            self,
            name: str,
            section: CrawlerSection,
            config: Config,
    ) -> None:
        """
        Initialize a crawler from its name and its section in the config file.

        If you are writing your own constructor for your own crawler, make sure
        to call this constructor first (via super().__init__).

        May throw a CrawlerLoadException.
        """

        self.name = name
        self.error_free = True

        self._deduplicator = Deduplicator(section.windows_paths())
        self._transformer = Transformer(section.transform())

        self._output_dir = OutputDirectory(
            config.default_section.working_dir() / section.output_dir(name),
            section.redownload(),
            section.on_conflict(),
        )
    @property
    def report(self) -> Report:
        return self._output_dir.report

    @property
    def prev_report(self) -> Optional[Report]:
        return self._output_dir.prev_report

    @staticmethod
    async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
        """
        Similar to asyncio.gather. However, in the case of an exception, all
        still running tasks are cancelled and the exception is rethrown.

        This should always be preferred over asyncio.gather in crawler code so
        that an exception like CrawlError may actually stop the crawler.
        """

        tasks = [asyncio.ensure_future(aw) for aw in awaitables]
        result = asyncio.gather(*tasks)
        try:
            return await result
        except:  # noqa: E722
            for task in tasks:
                task.cancel()
            raise
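
    # Illustrative sketch (not part of the original file): using Crawler.gather
    # from a crawler's _run(). The _crawl_course helper and the course IDs are
    # hypothetical.
    #
    #     async def _run(self) -> None:
    #         await self.gather([
    #             self._crawl_course(course_id)
    #             for course_id in ["123", "456"]
    #         ])
    #
    # Unlike a bare asyncio.gather, an exception such as CrawlError raised in
    # one task cancels the remaining tasks instead of leaving them running.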

    async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
        log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
        path = self._deduplicator.mark(path)
        self._output_dir.report.found(path)

        if self._transformer.transform(path) is None:
            log.explain("Answer: No")
            log.status("[bold bright_black]", "Ignored", fmt_path(path))
            return None

        log.explain("Answer: Yes")
        return CrawlToken(path)
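
    # Illustrative sketch (not part of the original file): typical use of
    # crawl() in a subclass. The path and the _crawl_folder helper are
    # hypothetical.
    #
    #     cl = await self.crawl(PurePath("Example course/Some folder"))
    #     if not cl:
    #         return
    #     async with cl as bar:
    #         await self._crawl_folder(bar)
    #
    # A None result means the transformer ignored the path, so the caller
    # should simply skip it.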

    async def download(
            self,
            path: PurePath,
            mtime: Optional[datetime] = None,
            redownload: Optional[Redownload] = None,
            on_conflict: Optional[OnConflict] = None,
    ) -> Optional[DownloadToken]:
        log.explain_topic(f"Decision: Download {fmt_path(path)}")
        path = self._deduplicator.mark(path)
        self._output_dir.report.found(path)

        transformed_path = self._transformer.transform(path)
        if transformed_path is None:
            log.explain("Answer: No")
            log.status("[bold bright_black]", "Ignored", fmt_path(path))
            return None

        fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict)
        if fs_token is None:
            log.explain("Answer: No")
            return None

        log.explain("Answer: Yes")
        return DownloadToken(fs_token, path)
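
    # Illustrative sketch (not part of the original file): typical use of
    # download() in a subclass, assuming path and mtime were discovered while
    # crawling.
    #
    #     dl = await self.download(path, mtime=mtime)
    #     if not dl:
    #         return
    #     async with dl as (bar, sink):
    #         ...  # write the downloaded bytes via sink and update bar
    #
    # A None result means the path was ignored or the output directory decided
    # not to (re)download the file.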

    async def _cleanup(self) -> None:
        log.explain_topic("Decision: Clean up files")
        if self.error_free:
            log.explain("No warnings or errors occurred during this run")
            log.explain("Answer: Yes")
            await self._output_dir.cleanup()
        else:
            log.explain("Warnings or errors occurred during this run")
            log.explain("Answer: No")

    @anoncritical
    async def run(self) -> None:
        """
        Start the crawling process. Call this function if you want to use a
        crawler.
        """

        with log.show_progress():
            self._output_dir.prepare()
            self._output_dir.load_prev_report()
            await self._run()
            await self._cleanup()
            self._output_dir.store_report()
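
    # Illustrative sketch (not part of the original file): driving a crawler
    # from an asyncio event loop. MyCrawler and its constructor arguments are
    # hypothetical; the base constructor takes a name, a CrawlerSection and a
    # Config.
    #
    #     crawler = MyCrawler(name, section, config)
    #     asyncio.run(crawler.run())
    #
    # Because run() is wrapped in @anoncritical, noncritical errors such as
    # CrawlWarning are logged and recorded in the report instead of propagating
    # to the caller.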

    @abstractmethod
    async def _run(self) -> None:
        """
        Override this function if you are writing a crawler.

        This function must not return before all crawling is complete. To crawl
        multiple things concurrently, Crawler.gather can be used.
        """

        pass
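
    # Illustrative sketch (not part of the original file): a minimal crawler
    # subclass. MyCrawler is hypothetical; real crawlers usually override
    # __init__ as well and crawl many paths concurrently via Crawler.gather.
    #
    #     class MyCrawler(Crawler):
    #         async def _run(self) -> None:
    #             cl = await self.crawl(PurePath("Example course"))
    #             if not cl:
    #                 return
    #             async with cl:
    #                 ...  # discover entries and call crawl()/download() on them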

    def debug_transforms(self) -> None:
        self._output_dir.load_prev_report()
        if not self.prev_report:
            log.warn("Couldn't find or load old report")
            return

        seen: Set[PurePath] = set()
        for known in sorted(self.prev_report.found_paths):
            looking_at = list(reversed(known.parents)) + [known]
            for path in looking_at:
                if path in seen:
                    continue

                log.explain_topic(f"Transforming {fmt_path(path)}")
                self._transformer.transform(path)
                seen.add(path)