mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Add options to slow down local crawler
These options are meant to make the local crawler behave more like a network-based crawler for purposes of testing and debugging other parts of the code base.
This commit is contained in:
parent
0c9167512c
commit
1591cb9197
12
CONFIG.md
12
CONFIG.md
@ -102,9 +102,15 @@ authenticators is `type`:
|
|||||||
### The `local` crawler
|
### The `local` crawler
|
||||||
|
|
||||||
This crawler crawls a local directory. It is really simple and mostly useful for
|
This crawler crawls a local directory. It is really simple and mostly useful for
|
||||||
testing different setups.
|
testing different setups. The various delay options are meant to make the
|
||||||
|
crawler simulate a slower, network-based crawler.
|
||||||
|
|
||||||
- `path`: Path to the local directory to crawl. (Required)
|
- `path`: Path to the local directory to crawl. (Required)
|
||||||
|
- `crawl_delay`: Maximum artificial delay (in seconds) to simulate for crawl
|
||||||
|
requests. (Optional)
|
||||||
|
- `download_delay`: Maximum artificial delay (in seconds) to simulate for
|
||||||
|
download requests. (Optional)
|
||||||
|
- `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
|
||||||
|
|
||||||
## Authenticator types
|
## Authenticator types
|
||||||
|
|
||||||
@ -114,8 +120,8 @@ With this authenticator, the username and password can be set directly in the
|
|||||||
config file. If the username or password are not specified, the user is prompted
|
config file. If the username or password are not specified, the user is prompted
|
||||||
via the terminal.
|
via the terminal.
|
||||||
|
|
||||||
- `username`: The username (Optional)
|
- `username`: The username. (Optional)
|
||||||
- `password`: The password (Optional)
|
- `password`: The password. (Optional)
|
||||||
|
|
||||||
## Transformation rules
|
## Transformation rules
|
||||||
|
|
||||||
|
@ -17,6 +17,7 @@ class ProgressBar:
|
|||||||
|
|
||||||
def set_total(self, total: float) -> None:
|
def set_total(self, total: float) -> None:
|
||||||
self._progress.update(self._taskid, total=total)
|
self._progress.update(self._taskid, total=total)
|
||||||
|
self._progress.start_task(self._taskid)
|
||||||
|
|
||||||
|
|
||||||
class TerminalConductor:
|
class TerminalConductor:
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import datetime
|
import datetime
|
||||||
|
import random
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from ..conductor import TerminalConductor
|
from ..conductor import TerminalConductor
|
||||||
from ..config import Config
|
from ..config import Config
|
||||||
@ -14,6 +16,24 @@ class LocalCrawlerSection(CrawlerSection):
|
|||||||
self.missing_value("path")
|
self.missing_value("path")
|
||||||
return Path(value).expanduser()
|
return Path(value).expanduser()
|
||||||
|
|
||||||
|
def crawl_delay(self) -> Optional[float]:
    """Return the ``crawl_delay`` option of this section.

    Returns:
        The configured value as a float, or ``None`` when the option is
        not set (assuming ``self.s`` is a ``configparser`` section proxy,
        whose ``getfloat`` yields ``None`` for a missing option).

    A value that is zero or negative is reported through
    ``self.invalid_value`` (presumably raises - confirm against the
    base class).
    """
    value = self.s.getfloat("crawl_delay")
    # The option is optional: an absent value must skip validation, because
    # comparing None with 0 would raise a TypeError.
    if value is not None and value <= 0:
        self.invalid_value("crawl_delay", value)
    return value
|
||||||
|
|
||||||
|
def download_delay(self) -> Optional[float]:
    """Return the ``download_delay`` option of this section.

    Returns:
        The configured value as a float, or ``None`` when the option is
        not set (assuming ``self.s`` is a ``configparser`` section proxy,
        whose ``getfloat`` yields ``None`` for a missing option).

    A value that is zero or negative is reported through
    ``self.invalid_value`` (presumably raises - confirm against the
    base class).
    """
    value = self.s.getfloat("download_delay")
    # The option is optional: an absent value must skip validation, because
    # comparing None with 0 would raise a TypeError.
    if value is not None and value <= 0:
        self.invalid_value("download_delay", value)
    return value
|
||||||
|
|
||||||
|
def download_speed(self) -> Optional[int]:
    """Return the ``download_speed`` option of this section.

    Returns:
        The configured value as an int, or ``None`` when the option is
        not set (assuming ``self.s`` is a ``configparser`` section proxy,
        whose ``getint`` yields ``None`` for a missing option).

    A value that is zero or negative is reported through
    ``self.invalid_value`` (presumably raises - confirm against the
    base class).
    """
    value = self.s.getint("download_speed")
    # The option is optional: an absent value must skip validation, because
    # comparing None with 0 would raise a TypeError.
    if value is not None and value <= 0:
        self.invalid_value("download_speed", value)
    return value
|
||||||
|
|
||||||
|
|
||||||
class LocalCrawler(Crawler):
|
class LocalCrawler(Crawler):
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -26,6 +46,14 @@ class LocalCrawler(Crawler):
|
|||||||
super().__init__(name, section, config, conductor)
|
super().__init__(name, section, config, conductor)
|
||||||
|
|
||||||
self._path = config.working_dir / section.path()
|
self._path = config.working_dir / section.path()
|
||||||
|
self._crawl_delay = section.crawl_delay()
|
||||||
|
self._download_delay = section.download_delay()
|
||||||
|
self._download_speed = section.download_speed()
|
||||||
|
|
||||||
|
if self._download_speed:
|
||||||
|
self._block_size = self._download_speed // 10
|
||||||
|
else:
|
||||||
|
self._block_size = 1024**2 # 1 MiB
|
||||||
|
|
||||||
async def crawl(self) -> None:
|
async def crawl(self) -> None:
|
||||||
await self._crawl_path(self._path, PurePath())
|
await self._crawl_path(self._path, PurePath())
|
||||||
@ -41,28 +69,49 @@ class LocalCrawler(Crawler):
|
|||||||
|
|
||||||
async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
|
async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
|
||||||
tasks = []
|
tasks = []
|
||||||
|
|
||||||
async with self.crawl_bar(pure):
|
async with self.crawl_bar(pure):
|
||||||
|
if self._crawl_delay:
|
||||||
|
await asyncio.sleep(random.uniform(
|
||||||
|
0.5 * self._crawl_delay,
|
||||||
|
self._crawl_delay,
|
||||||
|
))
|
||||||
|
|
||||||
for child in path.iterdir():
|
for child in path.iterdir():
|
||||||
pure_child = pure / child.name
|
pure_child = pure / child.name
|
||||||
tasks.append(self._crawl_path(child, pure_child))
|
tasks.append(self._crawl_path(child, pure_child))
|
||||||
|
|
||||||
await asyncio.gather(*tasks)
|
await asyncio.gather(*tasks)
|
||||||
|
|
||||||
async def _crawl_file(self, path: Path, pure: PurePath) -> None:
|
async def _crawl_file(self, path: Path, pure: PurePath) -> None:
|
||||||
|
stat = path.stat()
|
||||||
|
mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
|
||||||
|
dl = await self.download(pure, mtime=mtime)
|
||||||
|
if not dl:
|
||||||
|
return
|
||||||
|
|
||||||
async with self.download_bar(path) as bar:
|
async with self.download_bar(path) as bar:
|
||||||
stat = path.stat()
|
if self._download_delay:
|
||||||
mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
|
await asyncio.sleep(random.uniform(
|
||||||
dl = await self.download(pure, mtime=mtime)
|
0.5 * self._download_delay,
|
||||||
if not dl:
|
self._download_delay,
|
||||||
return
|
))
|
||||||
|
|
||||||
bar.set_total(stat.st_size)
|
bar.set_total(stat.st_size)
|
||||||
|
|
||||||
async with dl as sink:
|
async with dl as sink:
|
||||||
with open(path, "rb") as f:
|
with open(path, "rb") as f:
|
||||||
while True:
|
while True:
|
||||||
data = f.read(1024**2)
|
data = f.read(self._block_size)
|
||||||
if len(data) == 0:
|
if len(data) == 0:
|
||||||
break
|
break
|
||||||
|
|
||||||
sink.file.write(data)
|
sink.file.write(data)
|
||||||
bar.advance(len(data))
|
bar.advance(len(data))
|
||||||
|
|
||||||
|
if self._download_speed:
|
||||||
|
delay = self._block_size / self._download_speed
|
||||||
|
delay = random.uniform(0.8 * delay, 1.2 * delay)
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
|
||||||
sink.done()
|
sink.done()
|
||||||
|
Loading…
Reference in New Issue
Block a user