Make limiter logic more complex

The limiter can now distinguish between crawl and download actions and has a
fancy slot system and delay logic.
This commit is contained in:
Joscha
2021-05-15 00:38:46 +02:00
parent 1591cb9197
commit 296a169dd3
4 changed files with 126 additions and 27 deletions

View File

@ -33,8 +33,16 @@ class Section:
def error(self, key: str, desc: str) -> NoReturn:
raise ConfigFormatException(self.s.name, key, desc)
def invalid_value(self, key: str, value: Any) -> NoReturn:
self.error(key, f"Invalid value: {value!r}")
def invalid_value(
self,
key: str,
value: Any,
reason: Optional[str],
) -> NoReturn:
if reason is None:
self.error(key, f"Invalid value {value!r}")
else:
self.error(key, f"Invalid value {value!r}: {reason}")
def missing_value(self, key: str) -> NoReturn:
self.error(key, "Missing value")

View File

@ -139,6 +139,28 @@ class CrawlerSection(Section):
def transform(self) -> str:
return self.s.get("transform", "")
def max_concurrent_crawls(self) -> int:
value = self.s.getint("max_concurrent_crawls", fallback=1)
if value <= 0:
self.invalid_value("max_concurrent_crawls", value,
"Must be greater than 0")
return value
def max_concurrent_downloads(self) -> int:
value = self.s.getint("max_concurrent_downloads", fallback=1)
if value <= 0:
self.invalid_value("max_concurrent_downloads", value,
"Must be greater than 0")
return value
def request_delay(self) -> float:
value = self.s.getfloat("request_delay", fallback=0.0)
if value < 0:
self.invalid_value("request_delay", value,
"Must be greater than or equal to 0")
return value
def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
value = self.s.get("auth")
if value is None:
@ -168,9 +190,14 @@ class Crawler(ABC):
self.name = name
self._conductor = conductor
self._limiter = Limiter()
self.error_free = True
self._limiter = Limiter(
crawl_limit=section.max_concurrent_crawls(),
download_limit=section.max_concurrent_downloads(),
delay=section.request_delay(),
)
try:
self._transformer = Transformer(section.transform())
except RuleParseException as e:
@ -210,28 +237,26 @@ class Crawler(ABC):
return self._conductor.exclusive_output()
@asynccontextmanager
async def progress_bar(
self,
desc: str,
total: Optional[int] = None,
) -> AsyncIterator[ProgressBar]:
async with self._limiter.limit():
with self._conductor.progress_bar(desc, total=total) as bar:
yield bar
def crawl_bar(self, path: PurePath) -> AsyncContextManager[ProgressBar]:
pathstr = escape(str(path))
desc = f"[bold magenta]Crawling[/bold magenta] {pathstr}"
return self.progress_bar(desc)
def download_bar(
async def crawl_bar(
self,
path: PurePath,
total: Optional[int] = None,
) -> AsyncContextManager[ProgressBar]:
pathstr = escape(str(path))
desc = f"[bold green]Downloading[/bold green] {pathstr}"
return self.progress_bar(desc, total=total)
) -> AsyncIterator[ProgressBar]:
desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}"
async with self._limiter.limit_crawl():
with self._conductor.progress_bar(desc, total=total) as bar:
yield bar
@asynccontextmanager
async def download_bar(
self,
path: PurePath,
total: Optional[int] = None,
) -> AsyncIterator[ProgressBar]:
desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}"
async with self._limiter.limit_download():
with self._conductor.progress_bar(desc, total=total) as bar:
yield bar
async def download(
self,

View File

@ -1,13 +1,68 @@
import asyncio
import time
from contextlib import asynccontextmanager
from typing import AsyncIterator
from dataclasses import dataclass
from typing import AsyncContextManager, AsyncIterator, Optional
class Limiter:
def __init__(self, limit: int = 10):
self._semaphore = asyncio.Semaphore(limit)
@dataclass
class Slot:
active: bool = False
last_left: Optional[float] = None
class SlotPool:
def __init__(self, limit: int, delay: float):
if limit <= 0:
raise ValueError("limit must be greater than 0")
self._slots = [Slot() for _ in range(limit)]
self._delay = delay
self._free = asyncio.Condition()
def _acquire_slot(self) -> Optional[Slot]:
for slot in self._slots:
if not slot.active:
slot.active = True
return slot
return None
def _release_slot(self, slot: Slot) -> None:
slot.last_left = time.time()
slot.active = False
@asynccontextmanager
async def limit(self) -> AsyncIterator[None]:
async with self._semaphore:
slot: Slot
async with self._free:
while True:
if found_slot := self._acquire_slot():
slot = found_slot
break
await self._free.wait()
if slot.last_left is not None:
delay = slot.last_left + self._delay - time.time()
if delay > 0:
await asyncio.sleep(delay)
try:
yield
finally:
async with self._free:
self._release_slot(slot)
self._free.notify()
class Limiter:
def __init__(self, crawl_limit: int, download_limit: int, delay: float):
self._crawl_pool = SlotPool(crawl_limit, delay)
self._download_pool = SlotPool(download_limit, delay)
def limit_crawl(self) -> AsyncContextManager[None]:
return self._crawl_pool.limit()
def limit_download(self) -> AsyncContextManager[None]:
return self._crawl_pool.limit()