diff --git a/CONFIG.md b/CONFIG.md
index 92c36ae..2cac906 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -102,9 +102,15 @@ authenticators is `type`:
 ### The `local` crawler
 
 This crawler crawls a local directory. It is really simple and mostly useful for
-testing different setups.
+testing different setups. The various delay options are meant to make the
+crawler simulate a slower, network-based crawler.
 
 - `path`: Path to the local directory to crawl. (Required)
+- `crawl_delay`: Maximum artificial delay (in seconds) to simulate for crawl
+  requests. (Optional)
+- `download_delay`: Maximum artificial delay (in seconds) to simulate for
+  download requests. (Optional)
+- `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
 
 ## Authenticator types
 
@@ -114,8 +120,8 @@ With this authenticator, the username and password can be set directly in the
 config file. If the username or password are not specified, the user is
 prompted via the terminal.
 
-- `username`: The username (Optional)
-- `password`: The password (Optional)
+- `username`: The username. (Optional)
+- `password`: The password. (Optional)
 
 ## Transformation rules
 
diff --git a/PFERD/conductor.py b/PFERD/conductor.py
index 5022a22..d50574e 100644
--- a/PFERD/conductor.py
+++ b/PFERD/conductor.py
@@ -17,6 +17,7 @@ class ProgressBar:
 
     def set_total(self, total: float) -> None:
         self._progress.update(self._taskid, total=total)
+        self._progress.start_task(self._taskid)
 
 
 class TerminalConductor:
diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py
index fb08cc9..1677ff0 100644
--- a/PFERD/crawlers/local.py
+++ b/PFERD/crawlers/local.py
@@ -1,6 +1,8 @@
 import asyncio
 import datetime
+import random
 from pathlib import Path, PurePath
+from typing import Optional
 
 from ..conductor import TerminalConductor
 from ..config import Config
@@ -14,6 +16,24 @@ class LocalCrawlerSection(CrawlerSection):
             self.missing_value("path")
         return Path(value).expanduser()
 
+    def crawl_delay(self) -> Optional[float]:
+        value = self.s.getfloat("crawl_delay")
+        if value <= 0:
+            self.invalid_value("crawl_delay", value)
+        return value
+
+    def download_delay(self) -> Optional[float]:
+        value = self.s.getfloat("download_delay")
+        if value <= 0:
+            self.invalid_value("download_delay", value)
+        return value
+
+    def download_speed(self) -> Optional[int]:
+        value = self.s.getint("download_speed")
+        if value <= 0:
+            self.invalid_value("download_speed", value)
+        return value
+
 
 class LocalCrawler(Crawler):
     def __init__(
@@ -26,6 +46,14 @@ class LocalCrawler(Crawler):
         super().__init__(name, section, config, conductor)
 
         self._path = config.working_dir / section.path()
+        self._crawl_delay = section.crawl_delay()
+        self._download_delay = section.download_delay()
+        self._download_speed = section.download_speed()
+
+        if self._download_speed:
+            self._block_size = self._download_speed // 10
+        else:
+            self._block_size = 1024**2  # 1 MiB
 
     async def crawl(self) -> None:
         await self._crawl_path(self._path, PurePath())
@@ -41,28 +69,49 @@ class LocalCrawler(Crawler):
     async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
         tasks = []
+
+        async with self.crawl_bar(pure):
+            if self._crawl_delay:
+                await asyncio.sleep(random.uniform(
+                    0.5 * self._crawl_delay,
+                    self._crawl_delay,
+                ))
+
         for child in path.iterdir():
             pure_child = pure / child.name
             tasks.append(self._crawl_path(child, pure_child))
+
         await asyncio.gather(*tasks)
 
     async def _crawl_file(self, path: Path, pure: PurePath) -> None:
+        stat = path.stat()
+        mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
+        dl = await self.download(pure, mtime=mtime)
+        if not dl:
+            return
+
         async with self.download_bar(path) as bar:
-            stat = path.stat()
-            mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
-            dl = await self.download(pure, mtime=mtime)
-            if not dl:
-                return
+            if self._download_delay:
+                await asyncio.sleep(random.uniform(
+                    0.5 * self._download_delay,
+                    self._download_delay,
+                ))
 
             bar.set_total(stat.st_size)
 
             async with dl as sink:
                 with open(path, "rb") as f:
                     while True:
-                        data = f.read(1024**2)
+                        data = f.read(self._block_size)
                         if len(data) == 0:
                             break
+
                         sink.file.write(data)
                         bar.advance(len(data))
+
+                        if self._download_speed:
+                            delay = self._block_size / self._download_speed
+                            delay = random.uniform(0.8 * delay, 1.2 * delay)
+                            await asyncio.sleep(delay)
+
                 sink.done()
 