pferd/PFERD/crawlers/local.py

120 lines
3.8 KiB
Python
Raw Normal View History

2021-05-06 01:02:40 +02:00
import asyncio
2021-05-13 19:42:40 +02:00
import datetime
import random
2021-05-06 01:02:40 +02:00
from pathlib import Path, PurePath
from typing import Optional
2021-05-06 01:02:40 +02:00
from ..conductor import TerminalConductor
2021-05-06 01:02:40 +02:00
from ..config import Config
from ..crawler import Crawler, CrawlerSection, anoncritical
class LocalCrawlerSection(CrawlerSection):
    """Typed accessors for the options of a local crawler config section."""

    def path(self) -> Path:
        """Return the ``path`` option as a ``Path`` with ``~`` expanded.

        A missing option is reported through ``missing_value``.
        """
        raw_path = self.s.get("path")
        if raw_path is None:
            self.missing_value("path")
        return Path(raw_path).expanduser()

    def crawl_delay(self) -> float:
        """Return the ``crawl_delay`` option (defaults to 0.0)."""
        return self._non_negative_float("crawl_delay")

    def download_delay(self) -> float:
        """Return the ``download_delay`` option (defaults to 0.0)."""
        return self._non_negative_float("download_delay")

    def download_speed(self) -> Optional[int]:
        """Return the ``download_speed`` option, or ``None`` if it is unset.

        Values that are zero or negative are reported through
        ``invalid_value``.
        """
        speed = self.s.getint("download_speed")
        if speed is None:
            return None
        if speed <= 0:
            self.invalid_value("download_speed", speed,
                               "Must be greater than 0")
        return speed

    def _non_negative_float(self, key: str) -> float:
        """Read float option *key* (fallback 0.0) and reject negatives.

        Negative values are reported through ``invalid_value``.
        """
        number = self.s.getfloat(key, fallback=0.0)
        if number < 0:
            self.invalid_value(key, number,
                               "Must not be negative")
        return number
2021-05-06 01:02:40 +02:00
class LocalCrawler(Crawler):
    """Crawler that copies a local file or directory tree.

    Optional crawl/download delays and a download speed limit (all read
    from the crawler's config section) are applied with random jitter.
    """

    def __init__(
            self,
            name: str,
            section: LocalCrawlerSection,
            config: Config,
            conductor: TerminalConductor,
    ):
        """Read all crawler options from *section*.

        The source path is resolved relative to ``config.working_dir``.
        """
        super().__init__(name, section, config, conductor)

        self._path = config.working_dir / section.path()
        self._crawl_delay = section.crawl_delay()
        self._download_delay = section.download_delay()
        self._download_speed = section.download_speed()

        if self._download_speed:
            # One block is a tenth of the per-second budget. Clamp to at
            # least one byte: for speeds below 10 the integer division
            # yields 0, and read(0) returns b"", which the copy loop would
            # mistake for end of file and leave the target empty.
            self._block_size = max(1, self._download_speed // 10)
        else:
            self._block_size = 1024**2  # 1 MiB

    async def crawl(self) -> None:
        """Crawl the configured path and run cleanup if no error occurred."""
        await self._crawl_path(self._path, PurePath())
        if self.error_free:
            await self.cleanup()

    @anoncritical
    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
        """Dispatch *path* to the directory or file handler.

        *path* is the location on disk, *pure* the matching crawl-relative
        path. Anything that is neither a directory nor a file is ignored.
        """
        if path.is_dir():
            await self._crawl_dir(path, pure)
        elif path.is_file():
            await self._crawl_file(path, pure)

    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
        """List the directory *path* and crawl its children concurrently."""
        tasks = []

        async with self.crawl_bar(pure):
            # Sleep between 50% and 100% of the configured crawl delay.
            await asyncio.sleep(random.uniform(
                0.5 * self._crawl_delay,
                self._crawl_delay,
            ))

            for child in path.iterdir():
                pure_child = pure / child.name
                # NOTE(review): the on-disk path is passed here rather than
                # pure_child - confirm which one should_crawl() expects.
                if self.should_crawl(child):
                    tasks.append(self._crawl_path(child, pure_child))

        await asyncio.gather(*tasks)

    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        """Copy the file at *path* into the sink obtained for *pure*.

        The copy proceeds in blocks of ``self._block_size`` bytes. If a
        download speed is configured, each block is followed by a sleep of
        about ``block_size / download_speed`` seconds (+-20% jitter).
        """
        stat = path.stat()
        mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
        dl = await self.download(pure, mtime=mtime)
        if not dl:
            # download() handed back nothing to write to, so skip the file.
            return

        async with self.download_bar(path) as bar:
            # Sleep between 50% and 100% of the configured download delay.
            await asyncio.sleep(random.uniform(
                0.5 * self._download_delay,
                self._download_delay,
            ))

            bar.set_total(stat.st_size)

            async with dl as sink:
                with open(path, "rb") as f:
                    while True:
                        data = f.read(self._block_size)
                        if not data:
                            # Empty read: end of file reached.
                            break

                        sink.file.write(data)
                        bar.advance(len(data))

                        if self._download_speed:
                            delay = self._block_size / self._download_speed
                            delay = random.uniform(0.8 * delay, 1.2 * delay)
                            await asyncio.sleep(delay)

                    sink.done()