Mirror of https://github.com/Garmelon/PFERD.git, synced 2023-12-21 10:23:01 +01:00
Change limiter logic

Now download tasks are a subset of all tasks.

commit acd674f0a0, parent b0f9e1e8b4

CONFIG.md | 19
@@ -64,17 +64,14 @@ crawlers:
   remote file is different.
 - `transform`: Rules for renaming and excluding certain files and directories.
   For more details, see [this section](#transformation-rules). (Default: empty)
-- `max_concurrent_crawls`: The maximum number of concurrent crawl actions. What
-  constitutes a crawl action might vary from crawler to crawler, but it usually
-  means an HTTP request of a page to analyze. (Default: 1)
-- `max_concurrent_downloads`: The maximum number of concurrent download actions.
-  What constitutes a download action might vary from crawler to crawler, but it
-  usually means an HTTP request for a single file. (Default: 1)
-- `request_delay`: Time (in seconds) that the crawler should wait between
-  subsequent requests. Can be used to avoid unnecessary strain for the crawl
-  target. Crawl and download actions are handled separately, meaning that a
-  download action might immediately follow a crawl action even if this is set to
-  a nonzero value. (Default: 0)
+- `max_concurrent_tasks`: The maximum number of concurrent tasks (such as
+  crawling or downloading). (Default: 1)
+- `max_concurrent_downloads`: How many of those tasks can be download tasks at
+  the same time. Must not be greater than `max_concurrent_tasks`. When not set,
+  this is the same as `max_concurrent_tasks`. (Optional)
+- `delay_between_tasks`: Time (in seconds) that the crawler should wait between
+  subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary
+  load for the crawl target. (Default: 0.0)
 
 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
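As a sketch of how these documented options are consumed, here is a minimal `configparser` example using the new key names and the documented fallbacks. The `[crawl:example]` section name and the concrete values are made up for illustration; only the option names and defaults come from the text above.

```python
import configparser

# Hypothetical crawler section; option names and defaults are from the
# documentation above, everything else is illustrative.
config = configparser.ConfigParser()
config.read_string("""
[crawl:example]
max_concurrent_tasks = 2
max_concurrent_downloads = 1
delay_between_tasks = 0.5
""")

section = config["crawl:example"]
tasks = section.getint("max_concurrent_tasks", fallback=1)
# When omitted, max_concurrent_downloads is the same as max_concurrent_tasks.
downloads = section.getint("max_concurrent_downloads", fallback=None)
if downloads is None:
    downloads = tasks
delay = section.getfloat("delay_between_tasks", fallback=0.0)

print(tasks, downloads, delay)  # 2 1 0.5
```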
@@ -149,26 +149,31 @@ class CrawlerSection(Section):
     def transform(self) -> str:
         return self.s.get("transform", "")
 
-    def max_concurrent_crawls(self) -> int:
-        value = self.s.getint("max_concurrent_crawls", fallback=1)
+    def max_concurrent_tasks(self) -> int:
+        value = self.s.getint("max_concurrent_tasks", fallback=1)
         if value <= 0:
-            self.invalid_value("max_concurrent_crawls", value,
+            self.invalid_value("max_concurrent_tasks", value,
                                "Must be greater than 0")
         return value
 
     def max_concurrent_downloads(self) -> int:
-        value = self.s.getint("max_concurrent_downloads", fallback=1)
+        tasks = self.max_concurrent_tasks()
+        value = self.s.getint("max_concurrent_downloads", fallback=None)
+        if value is None:
+            return tasks
         if value <= 0:
             self.invalid_value("max_concurrent_downloads", value,
                                "Must be greater than 0")
+        if value > tasks:
+            self.invalid_value("max_concurrent_downloads", value,
+                               "Must not be greater than max_concurrent_tasks")
         return value
 
-    def request_delay(self) -> float:
-        value = self.s.getfloat("request_delay", fallback=0.0)
+    def delay_between_tasks(self) -> float:
+        value = self.s.getfloat("delay_between_tasks", fallback=0.0)
         if value < 0:
-            self.invalid_value("request_delay", value,
-                               "Must be greater than or equal to 0")
+            self.invalid_value("delay_between_tasks", value,
+                               "Must not be negative")
         return value
 
     def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
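The relationship between the two options can be summed up in a small standalone helper. `resolve_download_limit` is hypothetical and only mirrors the checks in `max_concurrent_downloads` above; it is not part of PFERD.

```python
from typing import Optional


def resolve_download_limit(task_limit: int, download_limit: Optional[int]) -> int:
    """Mirror of the config logic: download tasks are a subset of all tasks."""
    if task_limit <= 0:
        raise ValueError("max_concurrent_tasks must be greater than 0")
    if download_limit is None:
        # When not set, max_concurrent_downloads equals max_concurrent_tasks.
        return task_limit
    if download_limit <= 0:
        raise ValueError("max_concurrent_downloads must be greater than 0")
    if download_limit > task_limit:
        raise ValueError("max_concurrent_downloads must not exceed max_concurrent_tasks")
    return download_limit


assert resolve_download_limit(4, None) == 4
assert resolve_download_limit(4, 2) == 2
```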
@@ -203,9 +208,9 @@ class Crawler(ABC):
         self.error_free = True
 
         self._limiter = Limiter(
-            crawl_limit=section.max_concurrent_crawls(),
+            task_limit=section.max_concurrent_tasks(),
             download_limit=section.max_concurrent_downloads(),
-            delay=section.request_delay(),
+            task_delay=section.delay_between_tasks(),
         )
 
         try:
@@ -2,7 +2,7 @@ import asyncio
 import time
 from contextlib import asynccontextmanager
 from dataclasses import dataclass
-from typing import AsyncContextManager, AsyncIterator, Optional
+from typing import AsyncIterator, Optional
 
 
 @dataclass
@@ -11,15 +11,27 @@ class Slot:
     last_left: Optional[float] = None
 
 
-class SlotPool:
-    def __init__(self, limit: int, delay: float):
-        if limit <= 0:
-            raise ValueError("limit must be greater than 0")
+class Limiter:
+    def __init__(
+            self,
+            task_limit: int,
+            download_limit: int,
+            task_delay: float
+    ):
+        if task_limit <= 0:
+            raise ValueError("task limit must be at least 1")
+        if download_limit <= 0:
+            raise ValueError("download limit must be at least 1")
+        if download_limit > task_limit:
+            raise ValueError("download limit can't be greater than task limit")
+        if task_delay < 0:
+            raise ValueError("Task delay must not be negative")
 
-        self._slots = [Slot() for _ in range(limit)]
-        self._delay = delay
+        self._slots = [Slot() for _ in range(task_limit)]
+        self._downloads = download_limit
+        self._delay = task_delay
 
-        self._free = asyncio.Condition()
+        self._condition = asyncio.Condition()
 
     def _acquire_slot(self) -> Optional[Slot]:
         for slot in self._slots:
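A short sketch of the constructor's validation, assuming the `Limiter` above is importable (the `PFERD.limiter` module path is an assumption):

```python
# Assumed import path; adjust to wherever the Limiter above lives.
from PFERD.limiter import Limiter

Limiter(task_limit=2, download_limit=2, task_delay=0.0)  # fine: downloads == tasks

try:
    Limiter(task_limit=1, download_limit=2, task_delay=0.0)
except ValueError as e:
    print(e)  # download limit can't be greater than task limit
```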
@@ -29,40 +41,57 @@ class SlotPool:
 
         return None
 
-    def _release_slot(self, slot: Slot) -> None:
-        slot.last_left = time.time()
-        slot.active = False
-
-    @asynccontextmanager
-    async def limit(self) -> AsyncIterator[None]:
-        slot: Slot
-        async with self._free:
-            while True:
-                if found_slot := self._acquire_slot():
-                    slot = found_slot
-                    break
-                await self._free.wait()
-
+    async def _wait_for_slot_delay(self, slot: Slot) -> None:
         if slot.last_left is not None:
             delay = slot.last_left + self._delay - time.time()
             if delay > 0:
                 await asyncio.sleep(delay)
 
+    def _release_slot(self, slot: Slot) -> None:
+        slot.last_left = time.time()
+        slot.active = False
+
+    @asynccontextmanager
+    async def limit_crawl(self) -> AsyncIterator[None]:
+        slot: Slot
+        async with self._condition:
+            while True:
+                if found_slot := self._acquire_slot():
+                    slot = found_slot
+                    break
+                await self._condition.wait()
+
+        await self._wait_for_slot_delay(slot)
+
         try:
             yield
         finally:
-            async with self._free:
+            async with self._condition:
                 self._release_slot(slot)
-                self._free.notify()
-
-
-class Limiter:
-    def __init__(self, crawl_limit: int, download_limit: int, delay: float):
-        self._crawl_pool = SlotPool(crawl_limit, delay)
-        self._download_pool = SlotPool(download_limit, delay)
-
-    def limit_crawl(self) -> AsyncContextManager[None]:
-        return self._crawl_pool.limit()
-
-    def limit_download(self) -> AsyncContextManager[None]:
-        return self._crawl_pool.limit()
+                self._condition.notify_all()
+
+    @asynccontextmanager
+    async def limit_download(self) -> AsyncIterator[None]:
+        slot: Slot
+        async with self._condition:
+            while True:
+                if self._downloads <= 0:
+                    await self._condition.wait()
+                    continue
+
+                if found_slot := self._acquire_slot():
+                    slot = found_slot
+                    self._downloads -= 1
+                    break
+
+                await self._condition.wait()
+
+        await self._wait_for_slot_delay(slot)
+
+        try:
+            yield
+        finally:
+            async with self._condition:
+                self._release_slot(slot)
+                self._downloads += 1
+                self._condition.notify_all()