mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-11-04 06:32:52 +01:00 
			
		
		
		
	Add options to slow down local crawler
These options are meant to make the local crawler behave more like a network-based crawler for purposes of testing and debugging other parts of the code base.
This commit is contained in:
		
							
								
								
									
										12
									
								
								CONFIG.md
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								CONFIG.md
									
									
									
									
									
								
							@@ -102,9 +102,15 @@ authenticators is `type`:
 | 
			
		||||
### The `local` crawler
 | 
			
		||||
 | 
			
		||||
This crawler crawls a local directory. It is really simple and mostly useful for
 | 
			
		||||
testing different setups.
 | 
			
		||||
testing different setups. The various delay options are meant to make the
 | 
			
		||||
crawler simulate a slower, network-based crawler.
 | 
			
		||||
 | 
			
		||||
- `path`: Path to the local directory to crawl. (Required)
 | 
			
		||||
- `crawl_delay`: Maximum artificial delay (in seconds) to simulate for crawl
 | 
			
		||||
  requests. (Optional)
 | 
			
		||||
- `download_delay`: Maximum artificial delay (in seconds) to simulate for
 | 
			
		||||
  download requests. (Optional)
 | 
			
		||||
- `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
 | 
			
		||||
 | 
			
		||||
## Authenticator types
 | 
			
		||||
 | 
			
		||||
@@ -114,8 +120,8 @@ With this authenticator, the username and password can be set directly in the
 | 
			
		||||
config file. If the username or password is not specified, the user is prompted
 | 
			
		||||
via the terminal.
 | 
			
		||||
 | 
			
		||||
- `username`: The username (Optional)
 | 
			
		||||
- `password`: The password (Optional)
 | 
			
		||||
- `username`: The username. (Optional)
 | 
			
		||||
- `password`: The password. (Optional)
 | 
			
		||||
 | 
			
		||||
## Transformation rules
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -17,6 +17,7 @@ class ProgressBar:
 | 
			
		||||
 | 
			
		||||
    def set_total(self, total: float) -> None:
        """Set the total of this bar's task and then start the task.

        ``total`` is forwarded as the ``total`` keyword of the underlying
        progress object's ``update`` call; ``start_task`` is invoked
        afterwards for the same task id.
        """
        progress, task_id = self._progress, self._taskid
        progress.update(task_id, total=total)
        progress.start_task(task_id)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TerminalConductor:
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,8 @@
 | 
			
		||||
import asyncio
 | 
			
		||||
import datetime
 | 
			
		||||
import random
 | 
			
		||||
from pathlib import Path, PurePath
 | 
			
		||||
from typing import Optional
 | 
			
		||||
 | 
			
		||||
from ..conductor import TerminalConductor
 | 
			
		||||
from ..config import Config
 | 
			
		||||
@@ -14,6 +16,24 @@ class LocalCrawlerSection(CrawlerSection):
 | 
			
		||||
            self.missing_value("path")
 | 
			
		||||
        return Path(value).expanduser()
 | 
			
		||||
 | 
			
		||||
    def crawl_delay(self) -> Optional[float]:
        """Return the ``crawl_delay`` option of this section, if set.

        Returns:
            The configured delay as a float, or ``None`` when the option is
            absent (the option is optional).

        A value that is zero or negative is reported through
        ``self.invalid_value``.
        """
        # presumably self.s is a configparser section proxy, whose getfloat()
        # yields None for a missing key -- verify against CrawlerSection.
        value = self.s.getfloat("crawl_delay")
        # Only validate a value that is actually present: comparing None
        # against 0 would raise a TypeError for an unset option.
        if value is not None and value <= 0:
            self.invalid_value("crawl_delay", value)
        return value
 | 
			
		||||
 | 
			
		||||
    def download_delay(self) -> Optional[float]:
        """Return the ``download_delay`` option of this section, if set.

        Returns:
            The configured delay as a float, or ``None`` when the option is
            absent (the option is optional).

        A value that is zero or negative is reported through
        ``self.invalid_value``.
        """
        # presumably self.s is a configparser section proxy, whose getfloat()
        # yields None for a missing key -- verify against CrawlerSection.
        value = self.s.getfloat("download_delay")
        # Only validate a value that is actually present: comparing None
        # against 0 would raise a TypeError for an unset option.
        if value is not None and value <= 0:
            self.invalid_value("download_delay", value)
        return value
 | 
			
		||||
 | 
			
		||||
    def download_speed(self) -> Optional[int]:
        """Return the ``download_speed`` option of this section, if set.

        Returns:
            The configured speed as an int, or ``None`` when the option is
            absent (the option is optional).

        A value that is zero or negative is reported through
        ``self.invalid_value``.
        """
        # presumably self.s is a configparser section proxy, whose getint()
        # yields None for a missing key -- verify against CrawlerSection.
        value = self.s.getint("download_speed")
        # Only validate a value that is actually present: comparing None
        # against 0 would raise a TypeError for an unset option.
        if value is not None and value <= 0:
            self.invalid_value("download_speed", value)
        return value
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class LocalCrawler(Crawler):
 | 
			
		||||
    def __init__(
 | 
			
		||||
@@ -26,6 +46,14 @@ class LocalCrawler(Crawler):
 | 
			
		||||
        super().__init__(name, section, config, conductor)
 | 
			
		||||
 | 
			
		||||
        self._path = config.working_dir / section.path()
 | 
			
		||||
        self._crawl_delay = section.crawl_delay()
 | 
			
		||||
        self._download_delay = section.download_delay()
 | 
			
		||||
        self._download_speed = section.download_speed()
 | 
			
		||||
 | 
			
		||||
        if self._download_speed:
 | 
			
		||||
            self._block_size = self._download_speed // 10
 | 
			
		||||
        else:
 | 
			
		||||
            self._block_size = 1024**2  # 1 MiB
 | 
			
		||||
 | 
			
		||||
    async def crawl(self) -> None:
 | 
			
		||||
        await self._crawl_path(self._path, PurePath())
 | 
			
		||||
@@ -41,28 +69,49 @@ class LocalCrawler(Crawler):
 | 
			
		||||
 | 
			
		||||
    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
        """Crawl the directory ``path`` and then all of its children.

        While the crawl bar for ``pure`` is open, an optional artificial
        delay (random, between half of and the full configured crawl delay)
        is awaited and one ``_crawl_path`` coroutine is created per directory
        entry. The coroutines are awaited together once the bar is closed.
        """
        pending = []

        async with self.crawl_bar(pure):
            max_delay = self._crawl_delay
            if max_delay:
                pause = random.uniform(0.5 * max_delay, max_delay)
                await asyncio.sleep(pause)

            pending.extend(
                self._crawl_path(entry, pure / entry.name)
                for entry in path.iterdir()
            )

        await asyncio.gather(*pending)
 | 
			
		||||
 | 
			
		||||
    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        """Copy the local file ``path`` into the download sink for ``pure``.

        The file's modification time is passed to ``self.download``; if that
        returns a falsy value, nothing is copied. Otherwise the file is read
        block by block and written to the sink while the download bar is
        advanced. The optional ``download_delay`` and ``download_speed``
        settings insert artificial pauses before and during the copy.
        """
        stat = path.stat()
        mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
        dl = await self.download(pure, mtime=mtime)
        if not dl:
            return

        # A block size of 0 (download_speed below 10 gives `speed // 10 == 0`)
        # would make read() return b"" immediately and copy nothing.
        block_size = max(1, self._block_size)

        async with self.download_bar(path) as bar:
            if self._download_delay:
                # Random pause between half of and the full configured delay.
                await asyncio.sleep(random.uniform(
                    0.5 * self._download_delay,
                    self._download_delay,
                ))

            bar.set_total(stat.st_size)

            async with dl as sink:
                with open(path, "rb") as f:
                    while True:
                        data = f.read(block_size)
                        if not data:
                            break

                        sink.file.write(data)
                        bar.advance(len(data))

                        if self._download_speed:
                            # Throttle by the bytes actually read so a short
                            # final block is not delayed like a full one;
                            # +/-20% jitter keeps the pacing irregular.
                            delay = len(data) / self._download_speed
                            delay = random.uniform(0.8 * delay, 1.2 * delay)
                            await asyncio.sleep(delay)

                    sink.done()
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user