mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Implement Crawler and DummyCrawler
This commit is contained in:
parent
7e127cd5cc
commit
bbc792f9fb
60
PFERD/crawler.py
Normal file
60
PFERD/crawler.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
import configparser
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import AsyncIterator, Optional
|
||||||
|
|
||||||
|
from rich.markup import escape
|
||||||
|
|
||||||
|
from .conductor import ProgressBar, TerminalConductor
|
||||||
|
from .limiter import Limiter
|
||||||
|
from .transformer import RuleParseException, Transformer
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlerLoadException(Exception):
    """Raised when a crawler cannot be constructed from its config section."""
|
||||||
|
|
||||||
|
|
||||||
|
class Crawler(ABC):
    """Base class for all crawlers: wires up output, rate limiting and rules."""

    def __init__(self, name: str, section: configparser.SectionProxy) -> None:
        """
        Construct a crawler from its section of the config file.

        May throw a CrawlerLoadException.
        """

        self.name = name
        self._conductor = TerminalConductor()
        self._limiter = Limiter()

        try:
            # "transform" holds the user's rule string; empty means no rules.
            rules = section.get("transform", "")
            self._transformer = Transformer(rules)
        except RuleParseException as err:
            err.pretty_print()
            raise CrawlerLoadException()

        # output_dir = Path(section.get("output_dir", name))

    def print(self, text: str) -> None:
        """Print a line of (rich-markup) text through the conductor."""
        self._conductor.print(text)

    @asynccontextmanager
    async def progress_bar(
            self,
            path: Path,
            total: Optional[int] = None,
    ) -> AsyncIterator[ProgressBar]:
        """Display a progress bar for *path*, gated by the request limiter."""
        label = escape(str(path))
        async with self._limiter.limit():
            with self._conductor.progress_bar(label, total=total) as bar:
                yield bar

    async def run(self) -> None:
        """Start the conductor, run the crawl, and always stop it afterwards."""
        await self._conductor.start()
        try:
            await self.crawl()
        finally:
            await self._conductor.stop()

    @abstractmethod
    async def crawl(self) -> None:
        """Perform the actual crawling. Implemented by concrete subclasses."""
|
5
PFERD/crawlers/__init__.py
Normal file
5
PFERD/crawlers/__init__.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
from .dummy import DummyCrawler
|
||||||
|
|
||||||
|
# Registry mapping the crawler type named in a config section to its class.
CRAWLERS = {"dummy": DummyCrawler}
|
53
PFERD/crawlers/dummy.py
Normal file
53
PFERD/crawlers/dummy.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
import asyncio
|
||||||
|
import random
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from rich.markup import escape
|
||||||
|
|
||||||
|
from ..crawler import Crawler
|
||||||
|
|
||||||
|
# Fake remote directory tree for the dummy crawler.
# A nested dict is a directory; an empty tuple () marks a file.
DUMMY_TREE = {
    "Blätter": {
        **{f"Blatt_{i:02}.pdf": () for i in range(1, 6)},
        **{f"Blatt_{i:02}_Lösung.pdf": () for i in range(1, 6)},
    },
    "Vorlesungsfolien": {f"VL_{i:02}.pdf": () for i in range(1, 6)},
    "noch_mehr.txt": (),
    "dateien.jar": (),
}
|
||||||
|
|
||||||
|
|
||||||
|
class DummyCrawler(Crawler):
    """A crawler that walks DUMMY_TREE, simulating downloads with sleeps."""

    async def crawl(self) -> None:
        """Crawl the fake tree starting from an empty relative path."""
        await self._crawl_entry(Path(), DUMMY_TREE)

    async def _crawl_entry(self, path: Path, value: Any) -> None:
        """Recursively "download" a single entry of the dummy tree."""
        # An empty tuple marks a file; anything else is a directory dict.
        if value != ():
            # Pretend to list the directory, then descend into all
            # children concurrently.
            pause = random.random() * 2 + 1
            async with self.progress_bar(path) as bar:
                await asyncio.sleep(pause)
                children = [
                    self._crawl_entry(path / name, child)
                    for name, child in value.items()
                ]
                await asyncio.gather(*children)
        else:
            # Simulate a download in a random number of half-second steps.
            steps = random.randint(5, 20)
            async with self.progress_bar(path, steps) as bar:
                await asyncio.sleep(random.random() / 2)
                for _ in range(steps):
                    await asyncio.sleep(0.5)
                    bar.advance()
                self.print(f"[green]Downloaded {escape(str(path))}")
|
Loading…
Reference in New Issue
Block a user