Add options to slow down local crawler

These options are meant to make the local crawler behave more like a
network-based crawler for purposes of testing and debugging other parts of the
code base.
Joscha 2021-05-14 21:41:24 +02:00
parent 0c9167512c
commit 1591cb9197
3 changed files with 65 additions and 9 deletions

View File

@@ -102,9 +102,15 @@ authenticators is `type`:
 
 ### The `local` crawler
 
 This crawler crawls a local directory. It is really simple and mostly useful for
-testing different setups.
+testing different setups. The various delay options are meant to make the
+crawler simulate a slower, network-based crawler.
 
 - `path`: Path to the local directory to crawl. (Required)
+- `crawl_delay`: Maximum artificial delay (in seconds) to simulate for crawl
+  requests. (Optional)
+- `download_delay`: Maximum artificial delay (in seconds) to simulate for
+  download requests. (Optional)
+- `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
 
 ## Authenticator types
@@ -114,8 +120,8 @@ With this authenticator, the username and password can be set directly in the
 config file. If the username or password are not specified, the user is prompted
 via the terminal.
 
-- `username`: The username (Optional)
-- `password`: The password (Optional)
+- `username`: The username. (Optional)
+- `password`: The password. (Optional)
 
 ## Transformation rules
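
For reference, a crawler section exercising the new options might look like the
following; the section header and the `type` value are assumptions based on the
surrounding docs, since this diff does not show a full example:

    [crawler:local-test]
    type = local
    path = ./test-dir
    # Crawl requests are delayed by up to 0.3 s, downloads by up to 1 s.
    crawl_delay = 0.3
    download_delay = 1.0
    # Simulate roughly 128 KiB/s (131072 bytes per second).
    download_speed = 131072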

View File

@@ -17,6 +17,7 @@ class ProgressBar:
 
     def set_total(self, total: float) -> None:
         self._progress.update(self._taskid, total=total)
+        self._progress.start_task(self._taskid)
 
 
 class TerminalConductor:
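
The new `start_task` call is relevant if the wrapped progress object is rich's
`Progress` (an assumption; the diff only shows the wrapper): a task added with
`start=False` renders as indeterminate, and its clock does not run until it is
started. A minimal sketch of that behavior:

    from rich.progress import Progress

    with Progress() as progress:
        # Added with start=False: the bar renders as indeterminate and
        # its elapsed-time clock is not running yet.
        task = progress.add_task("download", start=False, total=None)

        # Once the real size is known, set the total and start the task,
        # mirroring set_total() above.
        progress.update(task, total=100)
        progress.start_task(task)

        for _ in range(10):
            progress.advance(task, 10)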

View File

@@ -1,6 +1,8 @@
 import asyncio
 import datetime
+import random
 from pathlib import Path, PurePath
+from typing import Optional
 
 from ..conductor import TerminalConductor
 from ..config import Config
@@ -14,6 +16,24 @@ class LocalCrawlerSection(CrawlerSection):
             self.missing_value("path")
         return Path(value).expanduser()
 
+    def crawl_delay(self) -> Optional[float]:
+        value = self.s.getfloat("crawl_delay")
+        if value is not None and value <= 0:
+            self.invalid_value("crawl_delay", value)
+        return value
+
+    def download_delay(self) -> Optional[float]:
+        value = self.s.getfloat("download_delay")
+        if value is not None and value <= 0:
+            self.invalid_value("download_delay", value)
+        return value
+
+    def download_speed(self) -> Optional[int]:
+        value = self.s.getint("download_speed")
+        if value is not None and value <= 0:
+            self.invalid_value("download_speed", value)
+        return value
+
 
 class LocalCrawler(Crawler):
     def __init__(
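
Note that `configparser`'s `getfloat`/`getint` return `None` when an option is
absent, which is why the range checks above guard against `None` before
comparing. A standalone illustration (the section name is hypothetical):

    import configparser

    # One option set, the other left out, as a user's config might have it.
    config_text = "[crawler:test]\ncrawl_delay = 1.5\n"
    parser = configparser.ConfigParser()
    parser.read_string(config_text)

    section = parser["crawler:test"]
    print(section.getfloat("crawl_delay"))     # 1.5
    print(section.getfloat("download_delay"))  # None: option is absent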
@@ -26,6 +46,14 @@ class LocalCrawler(Crawler):
         super().__init__(name, section, config, conductor)
 
         self._path = config.working_dir / section.path()
+        self._crawl_delay = section.crawl_delay()
+        self._download_delay = section.download_delay()
+        self._download_speed = section.download_speed()
+
+        if self._download_speed:
+            self._block_size = self._download_speed // 10
+        else:
+            self._block_size = 1024**2  # 1 MiB
 
     async def crawl(self) -> None:
         await self._crawl_path(self._path, PurePath())
@@ -41,28 +69,49 @@
     async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
         tasks = []
         async with self.crawl_bar(pure):
+            if self._crawl_delay:
+                await asyncio.sleep(random.uniform(
+                    0.5 * self._crawl_delay,
+                    self._crawl_delay,
+                ))
+
             for child in path.iterdir():
                 pure_child = pure / child.name
                 tasks.append(self._crawl_path(child, pure_child))
 
         await asyncio.gather(*tasks)
 
     async def _crawl_file(self, path: Path, pure: PurePath) -> None:
+        stat = path.stat()
+        mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
+        dl = await self.download(pure, mtime=mtime)
+        if not dl:
+            return
+
         async with self.download_bar(path) as bar:
-            stat = path.stat()
-            mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
-            dl = await self.download(pure, mtime=mtime)
-            if not dl:
-                return
+            if self._download_delay:
+                await asyncio.sleep(random.uniform(
+                    0.5 * self._download_delay,
+                    self._download_delay,
+                ))
+
             bar.set_total(stat.st_size)
 
             async with dl as sink:
                 with open(path, "rb") as f:
                     while True:
-                        data = f.read(1024**2)
+                        data = f.read(self._block_size)
                         if len(data) == 0:
                             break
 
                         sink.file.write(data)
                         bar.advance(len(data))
 
+                        if self._download_speed:
+                            delay = self._block_size / self._download_speed
+                            delay = random.uniform(0.8 * delay, 1.2 * delay)
+                            await asyncio.sleep(delay)
+
                 sink.done()
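
For intuition about the throttling arithmetic: with a target of
`download_speed` bytes per second, the loop reads blocks of one tenth of that
budget and sleeps about `block_size / download_speed` ≈ 0.1 s after each, so
roughly ten blocks (roughly `download_speed` bytes) pass per second; the ±20%
jitter varies individual sleeps but not the average. A minimal synchronous
sketch of the same technique (the function and its signature are illustrative,
not part of this commit):

    import random
    import time
    from typing import BinaryIO

    def throttled_copy(src: BinaryIO, dst: BinaryIO, speed: int) -> None:
        # One tenth of the per-second byte budget, as in the commit above.
        block_size = speed // 10
        while True:
            data = src.read(block_size)
            if len(data) == 0:
                break
            dst.write(data)
            # Sleeping block_size / speed seconds per block approximates
            # the target speed; jitter makes it look less mechanical.
            delay = block_size / speed
            time.sleep(random.uniform(0.8 * delay, 1.2 * delay))

At `speed = 131072` (128 KiB/s), a 1 MiB file takes about eight seconds to
copy this way.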