pferd/PFERD/crawler.py

75 lines
2.1 KiB
Python
Raw Normal View History

2021-04-29 13:44:29 +02:00
import configparser
from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from pathlib import Path
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
from typing import AsyncContextManager, AsyncIterator, Optional
2021-04-29 13:44:29 +02:00
from rich.markup import escape
from .conductor import ProgressBar, TerminalConductor
from .limiter import Limiter
from .transformer import RuleParseException, Transformer
class CrawlerLoadException(Exception):
    """
    Signals that a crawler could not be loaded.

    Crawler.__init__ raises this when the transformer rules of its config
    section fail to parse.
    """
class Crawler(ABC):
    """
    Abstract base class for crawlers.

    Each crawler owns a TerminalConductor (terminal output and progress
    bars), a Limiter (wrapped around every progress bar) and a Transformer
    built from the crawler's config section. Subclasses implement crawl();
    callers start a crawler via run().
    """

    def __init__(self, name: str, section: configparser.SectionProxy) -> None:
        """
        Create the conductor, limiter and transformer for this crawler.

        The transformer rules are taken from the "transform" key of the
        given config section; an empty string is used if the key is absent.

        May throw a CrawlerLoadException.
        """

        self.name = name

        self._conductor = TerminalConductor()
        self._limiter = Limiter()

        try:
            self._transformer = Transformer(section.get("transform", ""))
        except RuleParseException as e:
            # Show the parse error to the user first, then abort loading.
            e.pretty_print()
            # Chain explicitly so the parse error is kept as the direct
            # cause of the load failure.
            raise CrawlerLoadException() from e

        # output_dir = Path(section.get("output_dir", name))

    def print(self, text: str) -> None:
        """
        Print text via the conductor.
        """

        self._conductor.print(text)

    def exclusive_output(self):
        """
        Return whatever the conductor's exclusive_output() returns.
        """

        return self._conductor.exclusive_output()

    @asynccontextmanager
    async def progress_bar(
            self,
            desc: str,
            total: Optional[int] = None,
    ) -> AsyncIterator[ProgressBar]:
        """
        Async context manager yielding a progress bar with the given
        description and optional total.

        The limiter is entered before the bar is created and is held for as
        long as the bar is open.
        """

        async with self._limiter.limit():
            with self._conductor.progress_bar(desc, total=total) as bar:
                yield bar

    def crawl_bar(self, path: Path) -> AsyncContextManager[ProgressBar]:
        """
        Return a progress bar (without a total) labelled "Crawling <path>".
        """

        # Escape the path so it is not interpreted as rich markup.
        pathstr = escape(str(path))
        desc = f"[bold magenta]Crawling[/bold magenta] {pathstr}"
        return self.progress_bar(desc)

    def download_bar(
            self,
            path: Path,
            size: int,
    ) -> AsyncContextManager[ProgressBar]:
        """
        Return a progress bar labelled "Downloading <path>" whose total is
        the given size.
        """

        # Escape the path so it is not interpreted as rich markup.
        pathstr = escape(str(path))
        desc = f"[bold green]Downloading[/bold green] {pathstr}"
        return self.progress_bar(desc, total=size)

    async def run(self) -> None:
        """
        Run crawl() while the conductor is active.
        """

        async with self._conductor:
            await self.crawl()

    @abstractmethod
    async def crawl(self) -> None:
        """
        Do the actual crawling. Must be implemented by subclasses; it is
        awaited by run() inside the conductor's context.
        """

        pass