2021-04-29 13:44:29 +02:00
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
from contextlib import asynccontextmanager
|
2021-05-05 23:45:10 +02:00
|
|
|
from pathlib import Path, PurePath
|
2021-04-29 13:53:16 +02:00
|
|
|
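# TODO: typing.AsyncContextManager is deprecated since Python 3.9; switch to contextlib.AbstractAsyncContextManager once older Python versions no longer need to be supported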
|
2021-05-05 23:36:54 +02:00
|
|
|
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
|
|
|
|
Callable, Optional, Protocol, TypeVar)
|
2021-04-29 13:44:29 +02:00
|
|
|
|
|
|
|
from rich.markup import escape
|
|
|
|
|
|
|
|
from .conductor import ProgressBar, TerminalConductor
|
2021-05-05 23:45:10 +02:00
|
|
|
from .config import Config, Section
|
2021-04-29 13:44:29 +02:00
|
|
|
from .limiter import Limiter
|
2021-05-05 18:08:34 +02:00
|
|
|
from .output_dir import OnConflict, OutputDirectory, Redownload
|
2021-04-29 13:44:29 +02:00
|
|
|
from .transformer import RuleParseException, Transformer
|
|
|
|
|
|
|
|
|
|
|
|
class CrawlerLoadException(Exception):
    """Raised when a crawler cannot be set up (see Crawler.__init__)."""
|
|
|
|
|
|
|
|
|
2021-05-05 23:36:54 +02:00
|
|
|
class CrawlerMemberFunction(Protocol):
    """
    Structural type of a synchronous crawler member function: a callable
    that takes the crawler as its first argument, followed by arbitrary
    positional and keyword arguments, and returns nothing.
    """

    def __call__(self, __self: "Crawler", *__args: Any, **__kwargs: Any) -> None:
        """Signature only; the protocol carries no behaviour of its own."""
        ...
|
|
|
|
|
|
|
|
|
|
|
|
# Type variable for decorators that take a synchronous crawler member function
# (anything matching CrawlerMemberFunction) and hand back the same type.
Wrapped = TypeVar("Wrapped", bound=CrawlerMemberFunction)
|
|
|
|
|
|
|
|
|
|
|
|
def noncritical(f: Wrapped) -> Wrapped:
    """
    Decorator for crawler member functions whose failure should not abort
    the whole crawl.

    Any Exception raised by the decorated function is caught, reported via
    the crawler's print() (with the message escaped for rich markup) and
    recorded by setting the crawler's _error_free attribute to False. The
    wrapper itself always returns None.
    """

    # Imported locally so this function does not rely on the module's import
    # list containing functools.
    import functools

    # wraps() keeps the decorated function's __name__, __doc__ etc. intact.
    @functools.wraps(f)
    def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
        try:
            f(self, *args, **kwargs)
        except Exception as e:
            self.print(f"[red]Something went wrong: {escape(str(e))}")
            self._error_free = False

    return wrapper  # type: ignore
|
|
|
|
|
|
|
|
|
|
|
|
def repeat(attempts: int) -> Callable[[Wrapped], Wrapped]:
    """
    Decorator factory that retries a crawler member function.

    The decorated function is called up to `attempts` times. Exceptions
    raised by every attempt except the last one are deliberately discarded;
    an exception raised by the last attempt propagates to the caller. If
    `attempts` is smaller than 1, the function is still called exactly once.
    """

    # Imported locally so this function does not rely on the module's import
    # list containing functools.
    import functools

    def decorator(f: Wrapped) -> Wrapped:
        # wraps() keeps the decorated function's __name__, __doc__ etc. intact.
        @functools.wraps(f)
        def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
            # All attempts but the last: swallow failures and try again.
            for _ in range(attempts - 1):
                try:
                    f(self, *args, **kwargs)
                    return
                except Exception:
                    pass

            # Last attempt: let any exception reach the caller.
            f(self, *args, **kwargs)

        return wrapper  # type: ignore

    return decorator
|
|
|
|
|
|
|
|
|
|
|
|
class ACrawlerMemberFunction(Protocol):
    """
    Structural type of an asynchronous crawler member function: a callable
    that takes the crawler as its first argument, followed by arbitrary
    positional and keyword arguments, and returns an awaitable yielding
    nothing.
    """

    def __call__(self, __self: "Crawler", *__args: Any, **__kwargs: Any) -> Awaitable[None]:
        """Signature only; the protocol carries no behaviour of its own."""
        ...
|
|
|
|
|
|
|
|
|
|
|
|
# Type variable for decorators that take an asynchronous crawler member
# function (anything matching ACrawlerMemberFunction) and hand back the same
# type.
AWrapped = TypeVar("AWrapped", bound=ACrawlerMemberFunction)
|
|
|
|
|
|
|
|
|
|
|
|
def anoncritical(f: AWrapped) -> AWrapped:
    """
    Decorator for asynchronous crawler member functions whose failure should
    not abort the whole crawl.

    Any Exception raised while awaiting the decorated function is caught,
    reported via the crawler's print() (with the message escaped for rich
    markup) and recorded by setting the crawler's _error_free attribute to
    False. The wrapper coroutine itself always returns None.
    """

    # Imported locally so this function does not rely on the module's import
    # list containing functools.
    import functools

    # wraps() keeps the decorated function's __name__, __doc__ etc. intact.
    @functools.wraps(f)
    async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
        try:
            await f(self, *args, **kwargs)
        except Exception as e:
            self.print(f"[red]Something went wrong: {escape(str(e))}")
            self._error_free = False

    return wrapper  # type: ignore
|
|
|
|
|
|
|
|
|
|
|
|
def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
    """
    Decorator factory that retries an asynchronous crawler member function.

    The decorated function is awaited up to `attempts` times. Exceptions
    raised by every attempt except the last one are deliberately discarded;
    an exception raised by the last attempt propagates to the caller. If
    `attempts` is smaller than 1, the function is still awaited exactly once.
    """

    # Imported locally so this function does not rely on the module's import
    # list containing functools.
    import functools

    def decorator(f: AWrapped) -> AWrapped:
        # wraps() keeps the decorated function's __name__, __doc__ etc. intact.
        @functools.wraps(f)
        async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
            # All attempts but the last: swallow failures and try again.
            for _ in range(attempts - 1):
                try:
                    await f(self, *args, **kwargs)
                    return
                except Exception:
                    pass

            # Last attempt: let any exception reach the caller.
            await f(self, *args, **kwargs)

        return wrapper  # type: ignore

    return decorator
|
2021-05-05 23:45:10 +02:00
|
|
|
|
|
|
|
|
|
|
|
class CrawlerSection(Section):
    """
    Typed accessors for the options read from a crawler's config section
    (self.s): output_dir, redownload, on_conflict and transform.
    """

    def output_dir(self, name: str) -> Path:
        """Return the "output_dir" option as a Path, defaulting to `name`."""
        configured = self.s.get("output_dir", name)
        return Path(configured)

    def redownload(self) -> Redownload:
        """
        Translate the "redownload" option (default "never-smart") into a
        Redownload value. Unknown values are passed to invalid_value().
        """
        value = self.s.get("redownload", "never-smart")
        known = (
            ("never", Redownload.NEVER),
            ("never-smart", Redownload.NEVER_SMART),
            ("always", Redownload.ALWAYS),
            ("always-smart", Redownload.ALWAYS_SMART),
        )
        for spelling, policy in known:
            if value == spelling:
                return policy
        self.invalid_value("redownload", value)

    def on_conflict(self) -> OnConflict:
        """
        Translate the "on_conflict" option (default "prompt") into an
        OnConflict value. Unknown values are passed to invalid_value().
        """
        value = self.s.get("on_conflict", "prompt")
        known = (
            ("prompt", OnConflict.PROMPT),
            ("local-first", OnConflict.LOCAL_FIRST),
            ("remote-first", OnConflict.REMOTE_FIRST),
            ("no-delete", OnConflict.NO_DELETE),
        )
        for spelling, policy in known:
            if value == spelling:
                return policy
        self.invalid_value("on_conflict", value)

    def transform(self) -> str:
        """Return the "transform" option, or an empty string if unset."""
        rules = self.s.get("transform", "")
        return rules
|
|
|
|
|
|
|
|
|
2021-04-29 13:44:29 +02:00
|
|
|
class Crawler(ABC):
    """
    Abstract base class for crawlers.

    Holds the terminal conductor, limiter, transformer and output directory
    a crawler works with. Subclasses implement crawl(); a crawler is started
    via run().
    """

    def __init__(
        self,
        name: str,
        config: Config,
        section: CrawlerSection,
    ) -> None:
        """
        Initialize a crawler from its name and its section in the config file.

        If you are writing your own constructor for your own crawler, make sure
        to call this constructor first (via super().__init__).

        May throw a CrawlerLoadException.
        """

        self.name = name

        self._conductor = TerminalConductor()
        self._limiter = Limiter()

        try:
            self._transformer = Transformer(section.transform())
        except RuleParseException as e:
            e.pretty_print()
            # Keep the parse error attached as the cause of the load failure.
            raise CrawlerLoadException() from e

        self._output_dir = OutputDirectory(
            config.working_dir / section.output_dir(name),
            section.redownload(),
            section.on_conflict(),
            self._conductor,
        )

        # No error has happened yet. The noncritical/anoncritical decorators
        # set this flag to False when a decorated member function fails, so
        # it has to start out True to carry any information.
        self._error_free = True

    def print(self, text: str) -> None:
        """
        Print rich markup to the terminal. Crawlers *must* use this function to
        print things unless they are holding an exclusive output context
        manager! Be careful to escape all user-supplied strings.
        """

        self._conductor.print(text)

    def exclusive_output(self) -> AsyncContextManager[None]:
        """
        Acquire exclusive rights™ to the terminal output. While this context
        manager is held, output such as printing and progress bars from other
        threads is suspended and the current thread may do whatever it wants
        with the terminal. However, it must return the terminal to its original
        state before exiting the context manager.

        No two threads can hold this context manager at the same time.

        Useful for password or confirmation prompts as well as running other
        programs while crawling (e. g. to get certain credentials).
        """

        return self._conductor.exclusive_output()

    @asynccontextmanager
    async def progress_bar(
        self,
        desc: str,
        total: Optional[int] = None,
    ) -> AsyncIterator[ProgressBar]:
        """
        Async context manager yielding a progress bar with the description
        `desc` (rich markup) and the optional `total`.

        The limiter is acquired first and held for as long as the bar is
        open; the bar itself comes from the conductor.
        """

        async with self._limiter.limit():
            with self._conductor.progress_bar(desc, total=total) as bar:
                yield bar

    def crawl_bar(self, path: PurePath) -> AsyncContextManager[ProgressBar]:
        """
        Progress bar without a total, labelled "Crawling" followed by the
        escaped path.
        """

        pathstr = escape(str(path))
        desc = f"[bold magenta]Crawling[/bold magenta] {pathstr}"
        return self.progress_bar(desc)

    def download_bar(
        self,
        path: PurePath,
        size: int,
    ) -> AsyncContextManager[ProgressBar]:
        """
        Progress bar with `size` as its total, labelled "Downloading"
        followed by the escaped path.
        """

        pathstr = escape(str(path))
        desc = f"[bold green]Downloading[/bold green] {pathstr}"
        return self.progress_bar(desc, total=size)

    async def run(self) -> None:
        """
        Start the crawling process. Call this function if you want to use a
        crawler.
        """

        # The conductor is entered for the whole duration of the crawl.
        async with self._conductor:
            await self.crawl()

    @abstractmethod
    async def crawl(self) -> None:
        """
        Overwrite this function if you are writing a crawler.

        This function must not return before all crawling is complete. To crawl
        multiple things concurrently, asyncio.gather can be used.
        """

        pass
|