Implement Crawler and DummyCrawler

This commit is contained in:
Joscha 2021-04-29 13:44:29 +02:00
parent 7e127cd5cc
commit bbc792f9fb
3 changed files with 118 additions and 0 deletions

60
PFERD/crawler.py Normal file
View File

@ -0,0 +1,60 @@
import configparser
from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from pathlib import Path
from typing import AsyncIterator, Optional
from rich.markup import escape
from .conductor import ProgressBar, TerminalConductor
from .limiter import Limiter
from .transformer import RuleParseException, Transformer
class CrawlerLoadException(Exception):
    """Raised when a crawler cannot be constructed from its config section."""
class Crawler(ABC):
    """
    Abstract base class for all crawlers.

    Owns the infrastructure shared by every crawler: a TerminalConductor
    for coordinated terminal output, a Limiter for concurrency control,
    and a Transformer built from the section's "transform" rules.
    Subclasses implement crawl(); callers invoke run().
    """

    def __init__(self, name: str, section: configparser.SectionProxy) -> None:
        """
        Initialize the crawler from its named config section.

        May throw a CrawlerLoadException (e.g. if the "transform" rules
        in the section fail to parse).
        """
        self.name = name
        self._conductor = TerminalConductor()
        self._limiter = Limiter()

        try:
            self._transformer = Transformer(section.get("transform", ""))
        except RuleParseException as e:
            # The parse error has already been shown to the user; chain it
            # so the original cause stays visible in tracebacks.
            e.pretty_print()
            raise CrawlerLoadException() from e

        # output_dir = Path(section.get("output_dir", name))

    def print(self, text: str) -> None:
        """Print a line (may contain rich markup) via the conductor."""
        self._conductor.print(text)

    @asynccontextmanager
    async def progress_bar(
        self,
        path: Path,
        total: Optional[int] = None,
    ) -> AsyncIterator[ProgressBar]:
        """
        Acquire a limiter slot and display a progress bar labelled with
        *path* for the duration of the context. *total* is the number of
        expected progress steps, or None for an indeterminate bar.
        """
        desc = escape(str(path))
        async with self._limiter.limit():
            with self._conductor.progress_bar(desc, total=total) as bar:
                yield bar

    async def run(self) -> None:
        """Start the conductor, run crawl(), and always stop the conductor."""
        await self._conductor.start()
        try:
            await self.crawl()
        finally:
            await self._conductor.stop()

    @abstractmethod
    async def crawl(self) -> None:
        """Perform the actual crawling. Implemented by subclasses."""
        pass

View File

@ -0,0 +1,5 @@
from .dummy import DummyCrawler
# Registry mapping crawler type names (as written in config files) to the
# crawler classes that implement them.
CRAWLERS = {
    "dummy": DummyCrawler,
}

53
PFERD/crawlers/dummy.py Normal file
View File

@ -0,0 +1,53 @@
import asyncio
import random
from pathlib import Path
from typing import Any
from rich.markup import escape
from ..crawler import Crawler
# Fake course directory tree used by DummyCrawler: directories are dicts
# mapping entry names to subtrees, files are empty tuples.
DUMMY_TREE = {
    "Blätter": {
        "Blatt_01.pdf": (),
        "Blatt_02.pdf": (),
        "Blatt_03.pdf": (),
        "Blatt_04.pdf": (),
        "Blatt_05.pdf": (),
        "Blatt_01_Lösung.pdf": (),
        "Blatt_02_Lösung.pdf": (),
        "Blatt_03_Lösung.pdf": (),
        "Blatt_04_Lösung.pdf": (),
        "Blatt_05_Lösung.pdf": (),
    },
    "Vorlesungsfolien": {
        "VL_01.pdf": (),
        "VL_02.pdf": (),
        "VL_03.pdf": (),
        "VL_04.pdf": (),
        "VL_05.pdf": (),
    },
    "noch_mehr.txt": (),
    "dateien.jar": (),
}
class DummyCrawler(Crawler):
    """A crawler that walks DUMMY_TREE, simulating downloads with sleeps."""

    async def crawl(self) -> None:
        """Entry point: recurse through the fake tree from its root."""
        await self._crawl_entry(Path(), DUMMY_TREE)

    async def _crawl_entry(self, path: Path, value: Any) -> None:
        """Simulate crawling a single entry; recurses for directories."""
        if value != ():
            # Directory: "explore" it for a moment, then descend into all
            # children concurrently.
            duration = random.random() * 2 + 1
            async with self.progress_bar(path) as bar:
                await asyncio.sleep(duration)
            children = [
                self._crawl_entry(path / name, subtree)
                for name, subtree in value.items()
            ]
            await asyncio.gather(*children)
            return

        # File: fake a download consisting of several progress steps.
        steps = random.randint(5, 20)
        async with self.progress_bar(path, steps) as bar:
            await asyncio.sleep(random.random() / 2)
            for _ in range(steps):
                await asyncio.sleep(0.5)
                bar.advance()
        self.print(f"[green]Downloaded {escape(str(path))}")