From bbc792f9fb7de4459da1fdaa55f24ea292333981 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Thu, 29 Apr 2021 13:44:29 +0200
Subject: [PATCH] Implement Crawler and DummyCrawler

---
 PFERD/crawler.py           | 60 ++++++++++++++++++++++++++++++++++++++
 PFERD/crawlers/__init__.py |  5 ++++
 PFERD/crawlers/dummy.py    | 53 +++++++++++++++++++++++++++++++++
 3 files changed, 118 insertions(+)
 create mode 100644 PFERD/crawler.py
 create mode 100644 PFERD/crawlers/__init__.py
 create mode 100644 PFERD/crawlers/dummy.py

diff --git a/PFERD/crawler.py b/PFERD/crawler.py
new file mode 100644
index 0000000..9f1c7d9
--- /dev/null
+++ b/PFERD/crawler.py
@@ -0,0 +1,60 @@
+import configparser
+from abc import ABC, abstractmethod
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import AsyncIterator, Optional
+
+from rich.markup import escape
+
+from .conductor import ProgressBar, TerminalConductor
+from .limiter import Limiter
+from .transformer import RuleParseException, Transformer
+
+
+class CrawlerLoadException(Exception):
+    pass
+
+
+class Crawler(ABC):
+    def __init__(self, name: str, section: configparser.SectionProxy) -> None:
+        """
+        May throw a CrawlerLoadException.
+        """
+
+        self.name = name
+
+        self._conductor = TerminalConductor()
+        self._limiter = Limiter()
+
+        try:
+            self._transformer = Transformer(section.get("transform", ""))
+        except RuleParseException as e:
+            e.pretty_print()
+            raise CrawlerLoadException()
+
+        # output_dir = Path(section.get("output_dir", name))
+
+    def print(self, text: str) -> None:
+        self._conductor.print(text)
+
+    @asynccontextmanager
+    async def progress_bar(
+            self,
+            path: Path,
+            total: Optional[int] = None,
+    ) -> AsyncIterator[ProgressBar]:
+        desc = escape(str(path))
+        async with self._limiter.limit():
+            with self._conductor.progress_bar(desc, total=total) as bar:
+                yield bar
+
+    async def run(self) -> None:
+        await self._conductor.start()
+        try:
+            await self.crawl()
+        finally:
+            await self._conductor.stop()
+
+    @abstractmethod
+    async def crawl(self) -> None:
+        pass
diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py
new file mode 100644
index 0000000..5248a2d
--- /dev/null
+++ b/PFERD/crawlers/__init__.py
@@ -0,0 +1,5 @@
+from .dummy import DummyCrawler
+
+CRAWLERS = {
+    "dummy": DummyCrawler,
+}
diff --git a/PFERD/crawlers/dummy.py b/PFERD/crawlers/dummy.py
new file mode 100644
index 0000000..b4d787a
--- /dev/null
+++ b/PFERD/crawlers/dummy.py
@@ -0,0 +1,53 @@
+import asyncio
+import random
+from pathlib import Path
+from typing import Any
+
+from rich.markup import escape
+
+from ..crawler import Crawler
+
+DUMMY_TREE = {
+    "Blätter": {
+        "Blatt_01.pdf": (),
+        "Blatt_02.pdf": (),
+        "Blatt_03.pdf": (),
+        "Blatt_04.pdf": (),
+        "Blatt_05.pdf": (),
+        "Blatt_01_Lösung.pdf": (),
+        "Blatt_02_Lösung.pdf": (),
+        "Blatt_03_Lösung.pdf": (),
+        "Blatt_04_Lösung.pdf": (),
+        "Blatt_05_Lösung.pdf": (),
+    },
+    "Vorlesungsfolien": {
+        "VL_01.pdf": (),
+        "VL_02.pdf": (),
+        "VL_03.pdf": (),
+        "VL_04.pdf": (),
+        "VL_05.pdf": (),
+    },
+    "noch_mehr.txt": (),
+    "dateien.jar": (),
+}
+
+
+class DummyCrawler(Crawler):
+    async def crawl(self) -> None:
+        await self._crawl_entry(Path(), DUMMY_TREE)
+
+    async def _crawl_entry(self, path: Path, value: Any) -> None:
+        if value == ():
+            n = random.randint(5, 20)
+            async with self.progress_bar(path, n) as bar:
+                await asyncio.sleep(random.random() / 2)
+                for i in range(n):
+                    await asyncio.sleep(0.5)
+                    bar.advance()
+            self.print(f"[green]Downloaded {escape(str(path))}")
+        else:
+            t = random.random() * 2 + 1
+            async with self.progress_bar(path) as bar:
+                await asyncio.sleep(t)
+            tasks = [self._crawl_entry(path / k, v) for k, v in value.items()]
+            await asyncio.gather(*tasks)
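
Usage sketch (not part of the patch): one way a caller might look up and run a
crawler through the new CRAWLERS registry. The config section name and the
"type" key below are my own illustration; only "transform" is actually read by
Crawler.__init__ in this patch, and the sketch assumes Transformer accepts an
empty rule string (the default when the key is absent).

    import asyncio
    import configparser

    from PFERD.crawlers import CRAWLERS

    # Hypothetical config: a "type" key selects the crawler class.
    # "transform" defaults to "" inside Crawler.__init__, so it can be
    # omitted here.
    config = configparser.ConfigParser()
    config.read_string(
        "[crawler:example]\n"
        "type = dummy\n"
    )
    section = config["crawler:example"]

    # Map the type name to its class and hand the raw section to the
    # constructor, which may raise CrawlerLoadException.
    crawler_cls = CRAWLERS[section.get("type")]
    crawler = crawler_cls("example", section)

    # run() starts the TerminalConductor, awaits crawl(), and stops the
    # conductor again even if crawl() raises.
    asyncio.run(crawler.run())

Keeping CRAWLERS a plain dict keeps the name-to-class mapping in one place; a
new crawler type only needs an import and one entry in PFERD/crawlers/__init__.py.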