2021-05-23 17:25:16 +02:00
|
|
|
import asyncio
import os
from abc import ABC, abstractmethod
from collections.abc import Awaitable, Coroutine
from datetime import datetime
from functools import wraps
from pathlib import Path, PurePath
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar

from ..auth import Authenticator
from ..config import Config, Section
from ..deduplicator import Deduplicator
from ..logging import ProgressBar, log
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
from ..report import MarkConflictError, MarkDuplicateError, Report
from ..transformer import Transformer
from ..utils import ReusableAsyncContextManager, fmt_path
|
2021-04-29 13:44:29 +02:00
|
|
|
|
|
|
|
|
2021-05-19 13:25:57 +02:00
|
|
|
class CrawlWarning(Exception):
    """
    A noncritical problem encountered while crawling.

    When raised inside a method decorated with @noncritical or @anoncritical,
    it is added to the crawler's report as a warning, logged and swallowed
    instead of being propagated to the caller.
    """
|
|
|
|
|
|
|
|
|
|
|
|
class CrawlError(Exception):
    """
    A critical problem encountered while crawling.

    Unlike CrawlWarning, it is not swallowed by @noncritical or @anoncritical:
    those decorators add it to the crawler's report as an error and re-raise
    it.
    """
|
|
|
|
|
|
|
|
|
2021-05-09 01:45:01 +02:00
|
|
|
# Type of a synchronous callable returning None. Used by noncritical to declare
# that the decorator returns the same callable type it was given.
Wrapped = TypeVar("Wrapped", bound=Callable[..., None])
|
2021-05-05 23:36:54 +02:00
|
|
|
|
|
|
|
|
2021-05-09 01:45:01 +02:00
|
|
|
def noncritical(f: Wrapped) -> Wrapped:
    """
    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    Noncritical exceptions (CrawlWarning, OutputDirError, MarkDuplicateError and
    MarkConflictError) are added to the crawler's report as warnings, logged
    and then swallowed. Any other exception is added to the crawler's report as
    an error and re-raised. The return value of the wrapped function is
    discarded.

    The returned wrapper keeps the name, docstring and other metadata of the
    wrapped function (via functools.wraps).

    Warning: Must only be applied to member functions of the Crawler class!
    Calling the decorated function with anything but a Crawler as its first
    positional argument raises a RuntimeError.
    """

    @wraps(f)
    def wrapper(*args: Any, **kwargs: Any) -> None:
        # The first positional argument is the method's "self". It is needed to
        # reach the crawler's report and its error_free flag.
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@noncritical must only applied to Crawler methods")

        crawler = args[0]

        try:
            f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            # Noncritical: record and log the problem, then carry on.
            crawler.report.add_warning(str(e))
            log.warn(str(e))
            crawler.error_free = False
        except Exception as e:
            # Critical: record the problem, then let it propagate.
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

    return wrapper  # type: ignore
|
2021-05-05 23:36:54 +02:00
|
|
|
|
|
|
|
|
2022-04-27 22:50:06 +02:00
|
|
|
# Type of a callable returning a coroutine that resolves to an optional value.
# Used by anoncritical to declare that the decorator returns the same callable
# type it was given.
AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]])
|
2021-05-05 23:36:54 +02:00
|
|
|
|
|
|
|
|
2021-05-09 01:45:01 +02:00
|
|
|
def anoncritical(f: AWrapped) -> AWrapped:
    """
    An async version of @noncritical.

    Catches and logs a few noncritical exceptions occurring during the function
    call, mainly CrawlWarning.

    If any exception occurs during the function call, the crawler's error_free
    variable is set to False. This includes noncritical exceptions.

    Noncritical exceptions (CrawlWarning, OutputDirError, MarkDuplicateError and
    MarkConflictError) are logged, added to the crawler's report as warnings
    and then swallowed; the wrapper returns None in that case. Any other
    exception is added to the crawler's report as an error and re-raised. If no
    exception occurs, the result of the wrapped coroutine is returned.

    The returned wrapper keeps the name, docstring and other metadata of the
    wrapped function (via functools.wraps).

    Warning: Must only be applied to member functions of the Crawler class!
    Calling the decorated function with anything but a Crawler as its first
    positional argument raises a RuntimeError.
    """

    @wraps(f)
    async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
        # The first positional argument is the method's "self". It is needed to
        # reach the crawler's report and its error_free flag.
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@anoncritical must only applied to Crawler methods")

        crawler = args[0]

        try:
            return await f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            # Noncritical: log and record the problem, then carry on.
            log.warn(str(e))
            crawler.error_free = False
            crawler.report.add_warning(str(e))
        except Exception as e:
            # Critical: record the problem, then let it propagate.
            crawler.error_free = False
            crawler.report.add_error(str(e))
            raise

        # Only reached after a noncritical exception was swallowed.
        return None

    return wrapper  # type: ignore
|
2021-05-05 23:45:10 +02:00
|
|
|
|
|
|
|
|
2021-05-22 21:36:53 +02:00
|
|
|
class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
    """
    Async context manager handed out by Crawler.crawl for a path that should be
    crawled. Entering it yields the progress bar for that path.
    """

    def __init__(self, path: PurePath):
        super().__init__()

        self._path = path

    @property
    def path(self) -> PurePath:
        """The path this token was created for."""
        return self._path

    async def _on_aenter(self) -> ProgressBar:
        def announce_crawled() -> None:
            log.status("[bold cyan]", "Crawled", fmt_path(self._path))

        # Registered before the bar is entered. NOTE(review): self._stack
        # presumably unwinds in reverse order, so the "Crawled" line would be
        # printed after the bar is gone - confirm against the base class.
        self._stack.callback(announce_crawled)

        crawl_bar = log.crawl_bar("[bold bright_cyan]", "Crawling", fmt_path(self._path))
        return self._stack.enter_context(crawl_bar)
|
|
|
|
|
|
|
|
|
|
|
|
class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
    """
    Async context manager handed out by Crawler.download for a path that should
    be downloaded. Entering it yields the progress bar for that path together
    with the file sink obtained from the wrapped FileSinkToken.
    """

    def __init__(self, fs_token: FileSinkToken, path: PurePath):
        super().__init__()

        self._fs_token = fs_token
        self._path = path

    @property
    def path(self) -> PurePath:
        """The path this token was created for."""
        return self._path

    async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
        # The sink is entered first, the progress bar second.
        file_sink = await self._stack.enter_async_context(self._fs_token)

        # The "Downloaded ..." message is printed in the output dir, not here
        download_bar = log.download_bar("[bold bright_cyan]", "Downloading", fmt_path(self._path))
        progress = self._stack.enter_context(download_bar)

        return progress, file_sink
|
|
|
|
|
|
|
|
|
2021-05-05 23:45:10 +02:00
|
|
|
class CrawlerSection(Section):
    """
    Typed accessors for the options of a crawler's config section.

    Missing or invalid options are reported through the missing_value and
    invalid_value helpers of the base class (presumably by raising - confirm
    against Section).
    """

    def type(self) -> str:
        """Return the mandatory "type" option."""
        crawler_type = self.s.get("type")
        if crawler_type is None:
            self.missing_value("type")
        return crawler_type

    def skip(self) -> bool:
        """Return the "skip" option, False if it is not set."""
        return self.s.getboolean("skip", fallback=False)

    def output_dir(self, name: str) -> Path:
        """
        Return the "output_dir" option with "~" expanded. If the option is not
        set, the given name without its "crawl:" prefix is used instead.
        """

        # TODO Use removeprefix() after switching to 3.9
        prefix = "crawl:"
        default_dir = name[len(prefix):] if name.startswith(prefix) else name
        configured = self.s.get("output_dir", default_dir)
        return Path(configured).expanduser()

    def redownload(self) -> Redownload:
        """Return the parsed "redownload" option, "never-smart" if not set."""
        raw = self.s.get("redownload", "never-smart")
        try:
            return Redownload.from_string(raw)
        except ValueError as err:
            reason = str(err).capitalize()
            self.invalid_value("redownload", raw, reason)

    def on_conflict(self) -> OnConflict:
        """Return the parsed "on_conflict" option, "prompt" if not set."""
        raw = self.s.get("on_conflict", "prompt")
        try:
            return OnConflict.from_string(raw)
        except ValueError as err:
            reason = str(err).capitalize()
            self.invalid_value("on_conflict", raw, reason)

    def transform(self) -> str:
        """Return the "transform" option, the empty string if not set."""
        return self.s.get("transform", "")

    def tasks(self) -> int:
        """Return the "tasks" option (default 1). It must be greater than 0."""
        task_count = self.s.getint("tasks", fallback=1)
        if task_count <= 0:
            self.invalid_value("tasks", task_count, "Must be greater than 0")
        return task_count

    def downloads(self) -> int:
        """
        Return the "downloads" option, which defaults to the value of "tasks".
        It must be greater than 0 and must not exceed "tasks".
        """

        # Evaluated first so that an invalid "tasks" option is reported first.
        task_count = self.tasks()

        download_count = self.s.getint("downloads", fallback=None)
        if download_count is None:
            return task_count

        if download_count <= 0:
            self.invalid_value("downloads", download_count, "Must be greater than 0")
        if download_count > task_count:
            self.invalid_value("downloads", download_count, "Must not be greater than tasks")
        return download_count

    def task_delay(self) -> float:
        """Return the "task_delay" option (default 0.0). It must not be negative."""
        delay = self.s.getfloat("task_delay", fallback=0.0)
        if delay < 0:
            self.invalid_value("task_delay", delay, "Must not be negative")
        return delay

    def windows_paths(self) -> bool:
        """
        Return the "windows_paths" option. If it is not set, it is True exactly
        when running on Windows (os.name == "nt").
        """

        running_on_windows = os.name == "nt"
        return self.s.getboolean("windows_paths", fallback=running_on_windows)

    def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
        """
        Return the authenticator named by the mandatory "auth" option, looked
        up in the given mapping of authenticators.
        """

        auth_name = self.s.get("auth")
        if auth_name is None:
            self.missing_value("auth")

        authenticator = authenticators.get(auth_name)
        if authenticator is None:
            self.invalid_value("auth", auth_name, "No such auth section exists")
        return authenticator
|
|
|
|
|
2021-05-05 23:45:10 +02:00
|
|
|
|
2021-04-29 13:44:29 +02:00
|
|
|
class Crawler(ABC):
    """
    Base class of all crawlers. Subclasses implement _run and use crawl,
    download and gather from there.
    """

    def __init__(self, name: str, section: CrawlerSection, config: Config) -> None:
        """
        Initialize a crawler from its name and its section in the config file.

        If you are writing your own constructor for your own crawler, make sure
        to call this constructor first (via super().__init__).

        May throw a CrawlerLoadException.
        """

        self.name = name
        # Set to False by @noncritical/@anoncritical once anything goes wrong.
        self.error_free = True

        self._deduplicator = Deduplicator(section.windows_paths())
        self._transformer = Transformer(section.transform())

        # The crawler's output directory lives below the global working dir.
        output_root = config.default_section.working_dir() / section.output_dir(name)
        self._output_dir = OutputDirectory(output_root, section.redownload(), section.on_conflict())

    @property
    def report(self) -> Report:
        """The report of the current run, as kept by the output directory."""
        return self._output_dir.report

    @property
    def prev_report(self) -> Optional[Report]:
        """The previous report held by the output directory, if any."""
        return self._output_dir.prev_report

    @staticmethod
    async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
        """
        Similar to asyncio.gather. However, in the case of an exception, all
        still running tasks are cancelled and the exception is rethrown.

        This should always be preferred over asyncio.gather in crawler code so
        that an exception like CrawlError may actually stop the crawler.
        """

        scheduled = [asyncio.ensure_future(awaitable) for awaitable in awaitables]
        combined = asyncio.gather(*scheduled)
        try:
            return await combined
        except BaseException:
            # Deliberately catches everything, cancellation included.
            for pending in scheduled:
                pending.cancel()
            raise

    async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
        """
        Decide whether the given path should be crawled.

        The path is marked in the deduplicator and recorded as found in the
        report. Returns a CrawlToken for the (deduplicated) path, or None if
        the transformer yields no result for it.
        """

        log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
        marked = self._deduplicator.mark(path)
        self._output_dir.report.found(marked)

        if self._transformer.transform(marked) is not None:
            log.explain("Answer: Yes")
            return CrawlToken(marked)

        log.explain("Answer: No")
        log.status("[bold bright_black]", "Ignored", fmt_path(marked))
        return None

    async def download(
        self,
        path: PurePath,
        mtime: Optional[datetime] = None,
        redownload: Optional[Redownload] = None,
        on_conflict: Optional[OnConflict] = None,
    ) -> Optional[DownloadToken]:
        """
        Decide whether the given path should be downloaded.

        The path is marked in the deduplicator and recorded as found in the
        report. Returns a DownloadToken for the (deduplicated) path, or None if
        the transformer yields no result for it or the output directory hands
        out no file sink token.
        """

        log.explain_topic(f"Decision: Download {fmt_path(path)}")
        marked = self._deduplicator.mark(path)
        self._output_dir.report.found(marked)

        target = self._transformer.transform(marked)
        if target is None:
            log.explain("Answer: No")
            log.status("[bold bright_black]", "Ignored", fmt_path(marked))
            return None

        fs_token = await self._output_dir.download(marked, target, mtime, redownload, on_conflict)
        if fs_token is not None:
            log.explain("Answer: Yes")
            return DownloadToken(fs_token, marked)

        log.explain("Answer: No")
        return None

    async def _cleanup(self) -> None:
        """Clean up the output directory, but only after an error-free run."""

        log.explain_topic("Decision: Clean up files")
        if not self.error_free:
            log.explain("Warnings or errors occurred during this run")
            log.explain("Answer: No")
            return

        log.explain("No warnings or errors occurred during this run")
        log.explain("Answer: Yes")
        await self._output_dir.cleanup()

    @anoncritical
    async def run(self) -> None:
        """
        Start the crawling process. Call this function if you want to use a
        crawler.
        """

        output_dir = self._output_dir
        with log.show_progress():
            output_dir.prepare()
            output_dir.load_prev_report()
            await self._run()
            await self._cleanup()
            output_dir.store_report()

    @abstractmethod
    async def _run(self) -> None:
        """
        Overwrite this function if you are writing a crawler.

        This function must not return before all crawling is complete. To crawl
        multiple things concurrently, use Crawler.gather (see its docstring for
        why it is preferred over asyncio.gather).
        """

    def debug_transforms(self) -> None:
        """
        Run the transformer on every path found in the previous report and on
        each of its parent paths, visiting every path only once. Only warns
        and returns if no previous report is available.
        """

        self._output_dir.load_prev_report()

        if not self.prev_report:
            log.warn("Couldn't find or load old report")
            return

        visited: Set[PurePath] = set()
        for found_path in sorted(self.prev_report.found_paths):
            # Outermost parent first, the found path itself last.
            chain = [*reversed(found_path.parents), found_path]
            for candidate in chain:
                if candidate not in visited:
                    log.explain_topic(f"Transforming {fmt_path(candidate)}")
                    self._transformer.transform(candidate)
                    visited.add(candidate)
|