Make limiter logic more complex

The limiter can now distinguish between crawl and download actions and has a
fancy slot system and delay logic.
Joscha 2021-05-15 00:38:46 +02:00
parent 1591cb9197
commit 296a169dd3
4 changed files with 126 additions and 27 deletions

View File

@@ -64,6 +64,17 @@ crawlers:
   remote file is different.
 - `transform`: Rules for renaming and excluding certain files and directories.
   For more details, see [this section](#transformation-rules). (Default: empty)
+- `max_concurrent_crawls`: The maximum number of concurrent crawl actions. What
+  constitutes a crawl action might vary from crawler to crawler, but it usually
+  means an HTTP request of a page to analyze. (Default: 1)
+- `max_concurrent_downloads`: The maximum number of concurrent download actions.
+  What constitutes a download action might vary from crawler to crawler, but it
+  usually means an HTTP request for a single file. (Default: 1)
+- `request_delay`: Time (in seconds) that the crawler should wait between
+  subsequent requests. Can be used to avoid unnecessary strain for the crawl
+  target. Crawl and download actions are handled separately, meaning that a
+  download action might immediately follow a crawl action even if this is set to
+  a nonzero value. (Default: 0)
 
 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
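
To see the three new options in context, here is a small illustrative sketch (not
part of this commit) of how they might be parsed with configparser fallbacks,
mirroring the CrawlerSection accessors further down. The section name
"crawl:example" is a placeholder, not something this commit defines.

# Illustrative sketch only; the section name "crawl:example" is a placeholder.
from configparser import ConfigParser

parser = ConfigParser()
parser.read_string("""
[crawl:example]
max_concurrent_downloads = 3
request_delay = 0.5
""")

section = parser["crawl:example"]
# Keys that are left out fall back to their documented defaults.
print(section.getint("max_concurrent_crawls", fallback=1))     # 1 (default)
print(section.getint("max_concurrent_downloads", fallback=1))  # 3
print(section.getfloat("request_delay", fallback=0.0))         # 0.5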

View File

@@ -33,8 +33,16 @@ class Section:
     def error(self, key: str, desc: str) -> NoReturn:
         raise ConfigFormatException(self.s.name, key, desc)
 
-    def invalid_value(self, key: str, value: Any) -> NoReturn:
-        self.error(key, f"Invalid value: {value!r}")
+    def invalid_value(
+            self,
+            key: str,
+            value: Any,
+            reason: Optional[str],
+    ) -> NoReturn:
+        if reason is None:
+            self.error(key, f"Invalid value {value!r}")
+        else:
+            self.error(key, f"Invalid value {value!r}: {reason}")
 
     def missing_value(self, key: str) -> NoReturn:
         self.error(key, "Missing value")

View File

@@ -139,6 +139,28 @@ class CrawlerSection(Section):
     def transform(self) -> str:
         return self.s.get("transform", "")
 
+    def max_concurrent_crawls(self) -> int:
+        value = self.s.getint("max_concurrent_crawls", fallback=1)
+        if value <= 0:
+            self.invalid_value("max_concurrent_crawls", value,
+                               "Must be greater than 0")
+        return value
+
+    def max_concurrent_downloads(self) -> int:
+        value = self.s.getint("max_concurrent_downloads", fallback=1)
+        if value <= 0:
+            self.invalid_value("max_concurrent_downloads", value,
+                               "Must be greater than 0")
+        return value
+
+    def request_delay(self) -> float:
+        value = self.s.getfloat("request_delay", fallback=0.0)
+        if value < 0:
+            self.invalid_value("request_delay", value,
+                               "Must be greater than or equal to 0")
+        return value
+
     def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
         value = self.s.get("auth")
         if value is None:
@@ -168,9 +190,14 @@ class Crawler(ABC):
         self.name = name
         self._conductor = conductor
-        self._limiter = Limiter()
         self.error_free = True
 
+        self._limiter = Limiter(
+            crawl_limit=section.max_concurrent_crawls(),
+            download_limit=section.max_concurrent_downloads(),
+            delay=section.request_delay(),
+        )
+
         try:
             self._transformer = Transformer(section.transform())
         except RuleParseException as e:
@@ -210,28 +237,26 @@ class Crawler(ABC):
         return self._conductor.exclusive_output()
 
     @asynccontextmanager
-    async def progress_bar(
-            self,
-            desc: str,
-            total: Optional[int] = None,
-    ) -> AsyncIterator[ProgressBar]:
-        async with self._limiter.limit():
-            with self._conductor.progress_bar(desc, total=total) as bar:
-                yield bar
-
-    def crawl_bar(self, path: PurePath) -> AsyncContextManager[ProgressBar]:
-        pathstr = escape(str(path))
-        desc = f"[bold magenta]Crawling[/bold magenta] {pathstr}"
-        return self.progress_bar(desc)
-
-    def download_bar(
-            self,
-            path: PurePath,
-            total: Optional[int] = None,
-    ) -> AsyncContextManager[ProgressBar]:
-        pathstr = escape(str(path))
-        desc = f"[bold green]Downloading[/bold green] {pathstr}"
-        return self.progress_bar(desc, total=total)
+    async def crawl_bar(
+            self,
+            path: PurePath,
+            total: Optional[int] = None,
+    ) -> AsyncIterator[ProgressBar]:
+        desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}"
+        async with self._limiter.limit_crawl():
+            with self._conductor.progress_bar(desc, total=total) as bar:
+                yield bar
+
+    @asynccontextmanager
+    async def download_bar(
+            self,
+            path: PurePath,
+            total: Optional[int] = None,
+    ) -> AsyncIterator[ProgressBar]:
+        desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}"
+        async with self._limiter.limit_download():
+            with self._conductor.progress_bar(desc, total=total) as bar:
+                yield bar
 
     async def download(
             self,
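
For orientation, here is a toy sketch (not from this commit) of the call pattern a
concrete crawler is expected to follow: analyze a page inside crawl_bar, then
download the discovered files inside download_bar, which in the real class also
acquire the matching limiter slots. ToyCrawler only mimics that shape; it does not
use the real Crawler, conductor or limiter classes, and all names in it are made up.

# Toy sketch only; mimics the intended call pattern, not the real classes.
import asyncio
from contextlib import asynccontextmanager
from pathlib import PurePath
from typing import AsyncIterator, List, Tuple


class ToyCrawler:
    @asynccontextmanager
    async def crawl_bar(self, path: PurePath) -> AsyncIterator[None]:
        # The real method also acquires a crawl slot from the limiter.
        print(f"Crawling {path}")
        yield

    @asynccontextmanager
    async def download_bar(self, path: PurePath, total: int) -> AsyncIterator[None]:
        # The real method also acquires a download slot from the limiter.
        print(f"Downloading {path} ({total} bytes)")
        yield

    async def crawl_folder(self, path: PurePath) -> None:
        async with self.crawl_bar(path):
            # Pretend page analysis that discovers two files.
            files: List[Tuple[PurePath, int]] = [(path / "a.pdf", 123), (path / "b.pdf", 456)]
        # Downloads may run concurrently; in the real class download_bar caps
        # them at max_concurrent_downloads.
        await asyncio.gather(*(self.download_file(p, size) for p, size in files))

    async def download_file(self, path: PurePath, total: int) -> None:
        async with self.download_bar(path, total=total):
            await asyncio.sleep(0)  # pretend HTTP transfer


asyncio.run(ToyCrawler().crawl_folder(PurePath("some/folder")))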

View File

@@ -1,13 +1,68 @@
 import asyncio
+import time
 from contextlib import asynccontextmanager
-from typing import AsyncIterator
+from dataclasses import dataclass
+from typing import AsyncContextManager, AsyncIterator, Optional
 
 
-class Limiter:
-    def __init__(self, limit: int = 10):
-        self._semaphore = asyncio.Semaphore(limit)
+@dataclass
+class Slot:
+    active: bool = False
+    last_left: Optional[float] = None
+
+
+class SlotPool:
+    def __init__(self, limit: int, delay: float):
+        if limit <= 0:
+            raise ValueError("limit must be greater than 0")
+
+        self._slots = [Slot() for _ in range(limit)]
+        self._delay = delay
+
+        self._free = asyncio.Condition()
+
+    def _acquire_slot(self) -> Optional[Slot]:
+        for slot in self._slots:
+            if not slot.active:
+                slot.active = True
+                return slot
+
+        return None
+
+    def _release_slot(self, slot: Slot) -> None:
+        slot.last_left = time.time()
+        slot.active = False
 
     @asynccontextmanager
     async def limit(self) -> AsyncIterator[None]:
-        async with self._semaphore:
+        slot: Slot
+        async with self._free:
+            while True:
+                if found_slot := self._acquire_slot():
+                    slot = found_slot
+                    break
+                await self._free.wait()
+
+        if slot.last_left is not None:
+            delay = slot.last_left + self._delay - time.time()
+            if delay > 0:
+                await asyncio.sleep(delay)
+
+        try:
             yield
+        finally:
+            async with self._free:
+                self._release_slot(slot)
+                self._free.notify()
+
+
+class Limiter:
+    def __init__(self, crawl_limit: int, download_limit: int, delay: float):
+        self._crawl_pool = SlotPool(crawl_limit, delay)
+        self._download_pool = SlotPool(download_limit, delay)
+
+    def limit_crawl(self) -> AsyncContextManager[None]:
+        return self._crawl_pool.limit()
+
+    def limit_download(self) -> AsyncContextManager[None]:
+        return self._download_pool.limit()
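
Finally, a short demo of the slot-and-delay behaviour the commit message describes.
It is not part of the commit, and it assumes the new module shown above is saved
locally as limiter.py; the module's real path inside the package is not shown on
this page.

# Demo sketch only. Assumes the file above is importable as "limiter".
import asyncio
import time

from limiter import Limiter


async def crawl(limiter: Limiter, n: int, start: float) -> None:
    async with limiter.limit_crawl():
        print(f"crawl {n} started at t={time.time() - start:.1f}s")
        await asyncio.sleep(0.1)  # pretend to fetch and analyze a page


async def main() -> None:
    # One crawl slot with a 0.5 s delay: the crawls run one after another, and
    # each new crawl waits 0.5 s after the previous one released the slot.
    limiter = Limiter(crawl_limit=1, download_limit=2, delay=0.5)
    start = time.time()
    await asyncio.gather(*(crawl(limiter, n, start) for n in range(3)))


asyncio.run(main())

With these values the three crawls start at roughly t=0.0s, t=0.6s and t=1.2s, while
download actions would be limited independently by their own pool of two slots.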