# pferd/PFERD/crawler.py
from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from datetime import datetime
from pathlib import Path, PurePath
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
                    Callable, Dict, Optional, TypeVar)

import aiohttp
from rich.markup import escape

from .authenticator import Authenticator
from .conductor import ProgressBar, TerminalConductor
from .config import Config, Section
from .limiter import Limiter
from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload
from .transformer import RuleParseException, Transformer
from .version import __version__


class CrawlerLoadException(Exception):
    pass


Wrapped = TypeVar("Wrapped", bound=Callable[..., None])


def noncritical(f: Wrapped) -> Wrapped:
    """
    Warning: Must only be applied to member functions of the Crawler class!

    Catches all exceptions occurring during the function call. If an exception
    occurs, the crawler's error_free variable is set to False.
    """

    def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
        try:
            f(self, *args, **kwargs)
        except Exception as e:
            self.print(f"[red]Something went wrong: {escape(str(e))}")
            self.error_free = False

    return wrapper  # type: ignore


def repeat(attempts: int) -> Callable[[Wrapped], Wrapped]:
    """
    Warning: Must only be applied to member functions of the Crawler class!

    If an exception occurs during the function call, retries the function call
    a set amount of times. Exceptions that occur during the last attempt are
    not caught and instead passed on upwards.
    """

    def decorator(f: Wrapped) -> Wrapped:
        def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
            for _ in range(attempts - 1):
                try:
                    f(self, *args, **kwargs)
                    return
                except Exception:
                    pass
            f(self, *args, **kwargs)

        return wrapper  # type: ignore

    return decorator
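

# Illustrative sketch (not part of this module): stacking @noncritical and
# @repeat on a Crawler method. With @noncritical outermost, the call is
# retried up to three times and only the final failure is printed and
# recorded in error_free. "MyCrawler" and "write_report" are hypothetical.
#
#     class MyCrawler(Crawler):
#         @noncritical
#         @repeat(3)
#         def write_report(self) -> None:
#             ...  # any exception raised here triggers a retry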


AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])


def anoncritical(f: AWrapped) -> AWrapped:
    """
    An async version of @noncritical.

    Warning: Must only be applied to member functions of the Crawler class!

    Catches all exceptions occurring during the function call. If an exception
    occurs, the crawler's error_free variable is set to False.
    """

    async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
        try:
            await f(self, *args, **kwargs)
        except Exception as e:
            self.print(f"[red]Something went wrong: {escape(str(e))}")
            self.error_free = False

    return wrapper  # type: ignore


def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
    """
    An async version of @repeat.

    Warning: Must only be applied to member functions of the Crawler class!

    If an exception occurs during the function call, retries the function call
    a set amount of times. Exceptions that occur during the last attempt are
    not caught and instead passed on upwards.
    """

    def decorator(f: AWrapped) -> AWrapped:
        async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
            for _ in range(attempts - 1):
                try:
                    await f(self, *args, **kwargs)
                    return
                except Exception:
                    pass
            await f(self, *args, **kwargs)

        return wrapper  # type: ignore

    return decorator
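

# The async variants stack the same way (illustrative sketch; "fetch_index"
# is a hypothetical coroutine method):
#
#     class MyCrawler(Crawler):
#         @anoncritical
#         @arepeat(3)
#         async def fetch_index(self) -> None:
#             ...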


class CrawlerSection(Section):
    def output_dir(self, name: str) -> Path:
        return Path(self.s.get("output_dir", name)).expanduser()

    def redownload(self) -> Redownload:
        value = self.s.get("redownload", "never-smart")
        if value == "never":
            return Redownload.NEVER
        elif value == "never-smart":
            return Redownload.NEVER_SMART
        elif value == "always":
            return Redownload.ALWAYS
        elif value == "always-smart":
            return Redownload.ALWAYS_SMART
        self.invalid_value(
            "redownload",
            value,
            "Must be 'never', 'never-smart', 'always' or 'always-smart'",
        )

    def on_conflict(self) -> OnConflict:
        value = self.s.get("on_conflict", "prompt")
        if value == "prompt":
            return OnConflict.PROMPT
        elif value == "local-first":
            return OnConflict.LOCAL_FIRST
        elif value == "remote-first":
            return OnConflict.REMOTE_FIRST
        elif value == "no-delete":
            return OnConflict.NO_DELETE
        self.invalid_value(
            "on_conflict",
            value,
            "Must be 'prompt', 'local-first', 'remote-first' or 'no-delete'",
        )

    def transform(self) -> str:
        return self.s.get("transform", "")

    def max_concurrent_crawls(self) -> int:
        value = self.s.getint("max_concurrent_crawls", fallback=1)
        if value <= 0:
            self.invalid_value("max_concurrent_crawls", value,
                               "Must be greater than 0")
        return value

    def max_concurrent_downloads(self) -> int:
        value = self.s.getint("max_concurrent_downloads", fallback=1)
        if value <= 0:
            self.invalid_value("max_concurrent_downloads", value,
                               "Must be greater than 0")
        return value

    def request_delay(self) -> float:
        value = self.s.getfloat("request_delay", fallback=0.0)
        if value < 0:
            self.invalid_value("request_delay", value,
                               "Must be greater than or equal to 0")
        return value

    def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
        value = self.s.get("auth")
        if value is None:
            self.missing_value("auth")
        auth = authenticators.get(f"auth:{value}")
        if auth is None:
            self.invalid_value("auth", value, "No such auth section exists")
        return auth
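

# Illustrative config sketch matching the accessors above (section name and
# values are hypothetical). Note that "auth = ilias" is resolved against the
# authenticators dict under the key "auth:ilias":
#
#     [crawler:example]
#     output_dir = example
#     redownload = never-smart
#     on_conflict = prompt
#     max_concurrent_crawls = 1
#     max_concurrent_downloads = 1
#     request_delay = 0.5
#     auth = ilias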


class Crawler(ABC):
    def __init__(
        self,
        name: str,
        section: CrawlerSection,
        config: Config,
        conductor: TerminalConductor,
    ) -> None:
        """
        Initialize a crawler from its name and its section in the config file.

        If you are writing your own constructor for your own crawler, make
        sure to call this constructor first (via super().__init__).

        May throw a CrawlerLoadException.
        """

        self.name = name
        self._conductor = conductor
        self.error_free = True

        self._limiter = Limiter(
            crawl_limit=section.max_concurrent_crawls(),
            download_limit=section.max_concurrent_downloads(),
            delay=section.request_delay(),
        )

        try:
            self._transformer = Transformer(section.transform())
        except RuleParseException as e:
            e.pretty_print()
            raise CrawlerLoadException()

        self._output_dir = OutputDirectory(
            config.working_dir / section.output_dir(name),
            section.redownload(),
            section.on_conflict(),
            self._conductor,
        )

    def print(self, text: str) -> None:
        """
        Print rich markup to the terminal. Crawlers *must* use this function
        to print things unless they are holding an exclusive output context
        manager! Be careful to escape all user-supplied strings.
        """
        self._conductor.print(text)

    def exclusive_output(self) -> AsyncContextManager[None]:
        """
        Acquire exclusive rights to the terminal output. While this context
        manager is held, output such as printing and progress bars from other
        threads is suspended and the current thread may do whatever it wants
        with the terminal. However, it must return the terminal to its
        original state before exiting the context manager.

        No two threads can hold this context manager at the same time.

        Useful for password or confirmation prompts as well as running other
        programs while crawling (e.g. to get certain credentials).
        """
        return self._conductor.exclusive_output()

    @asynccontextmanager
    async def crawl_bar(
        self,
        path: PurePath,
        total: Optional[int] = None,
    ) -> AsyncIterator[ProgressBar]:
        desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}"
        async with self._limiter.limit_crawl():
            with self._conductor.progress_bar(desc, total=total) as bar:
                yield bar

    @asynccontextmanager
    async def download_bar(
        self,
        path: PurePath,
        total: Optional[int] = None,
    ) -> AsyncIterator[ProgressBar]:
        desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}"
        async with self._limiter.limit_download():
            with self._conductor.progress_bar(desc, total=total) as bar:
                yield bar

    async def download(
        self,
        path: PurePath,
        mtime: Optional[datetime] = None,
        redownload: Optional[Redownload] = None,
        on_conflict: Optional[OnConflict] = None,
    ) -> Optional[AsyncContextManager[FileSink]]:
        return await self._output_dir.download(
            path, mtime, redownload, on_conflict)
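
    # Usage sketch (illustrative): download() returns None when the file
    # should not be (re)downloaded, otherwise an async context manager that
    # yields a FileSink. The FileSink "file" attribute is an assumption.
    #
    #     maybe_dl = await self.download(path, mtime=mtime)
    #     if maybe_dl is None:
    #         return  # file is up to date or the conflict policy keeps it
    #     async with maybe_dl as sink:
    #         sink.file.write(data)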

    async def cleanup(self) -> None:
        await self._output_dir.cleanup()

    async def run(self) -> None:
        """
        Start the crawling process. Call this function if you want to use a
        crawler.
        """
        async with self._conductor:
            await self.crawl()

    @abstractmethod
    async def crawl(self) -> None:
        """
        Override this function if you are writing a crawler.

        This function must not return before all crawling is complete. To
        crawl multiple things concurrently, asyncio.gather can be used.
        """
        pass
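

# Illustrative subclass sketch ("ExampleCrawler" and its helpers are
# hypothetical; requires "import asyncio"): crawl() drives the whole process
# and may use asyncio.gather for concurrency, as the docstring suggests.
#
#     class ExampleCrawler(Crawler):
#         async def crawl(self) -> None:
#             async with self.crawl_bar(PurePath(".")) as bar:
#                 paths = await self._list_remote_files()
#             await asyncio.gather(*(self._handle_file(p) for p in paths))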


class HttpCrawler(Crawler):
    COOKIE_FILE = PurePath(".cookies")

    def __init__(
        self,
        name: str,
        section: CrawlerSection,
        config: Config,
        conductor: TerminalConductor,
    ) -> None:
        super().__init__(name, section, config, conductor)

        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
        self._output_dir.register_reserved(self.COOKIE_FILE)

    async def run(self) -> None:
        cookie_jar = aiohttp.CookieJar()

        try:
            cookie_jar.load(self._cookie_jar_path)
        except Exception:
            pass

        async with aiohttp.ClientSession(
            headers={"User-Agent": f"pferd/{__version__}"},
            cookie_jar=cookie_jar,
        ) as session:
            self.session = session
            try:
                await super().run()
            finally:
                del self.session

        try:
            cookie_jar.save(self._cookie_jar_path)
        except Exception:
            self.print(
                "[bold red]Warning:[/] Failed to save cookies to "
                + escape(str(self.COOKIE_FILE))
            )
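

# Illustrative sketch of a subclass using the shared aiohttp session (names,
# URL and the FileSink "file" attribute are assumptions). self.session is
# only available while run() is executing, which is when crawl() is called.
#
#     class ExampleHttpCrawler(HttpCrawler):
#         async def crawl(self) -> None:
#             maybe_dl = await self.download(PurePath("file.pdf"))
#             if maybe_dl is None:
#                 return
#             url = "https://example.com/file.pdf"
#             async with maybe_dl as sink, self.session.get(url) as resp:
#                 sink.file.write(await resp.read())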