2021-05-06 01:02:40 +02:00
|
|
|
import asyncio
|
2021-05-13 19:42:40 +02:00
|
|
|
import datetime
|
2021-05-14 21:41:24 +02:00
|
|
|
import random
|
2021-05-06 01:02:40 +02:00
|
|
|
from pathlib import Path, PurePath
|
2021-05-14 21:41:24 +02:00
|
|
|
from typing import Optional
|
2021-05-06 01:02:40 +02:00
|
|
|
|
|
|
|
from ..config import Config
|
2021-05-23 19:16:42 +02:00
|
|
|
from .crawler import Crawler, CrawlerSection, anoncritical
|
2021-05-06 01:02:40 +02:00
|
|
|
|
|
|
|
|
|
|
|
class LocalCrawlerSection(CrawlerSection):
    """Validated accessors for the options of a local crawler config section.

    Every accessor reads its option from ``self.s`` and reports problems
    through the ``missing_value`` / ``invalid_value`` helpers of the base
    class (presumably these raise -- confirm in ``CrawlerSection``).
    """

    def _delay_option(self, key: str) -> float:
        """Read the float option ``key``, defaulting to 0.0, and reject negatives."""
        seconds = self.s.getfloat(key, fallback=0.0)
        if seconds < 0:
            self.invalid_value(key, seconds, "Must not be negative")
        return seconds

    def target(self) -> Path:
        """Return the mandatory ``target`` option as a path with ``~`` expanded."""
        raw = self.s.get("target")
        if raw is None:
            self.missing_value("target")
        target_dir = Path(raw)
        return target_dir.expanduser()

    def crawl_delay(self) -> float:
        """Return the non-negative ``crawl_delay`` option (0.0 when unset)."""
        return self._delay_option("crawl_delay")

    def download_delay(self) -> float:
        """Return the non-negative ``download_delay`` option (0.0 when unset)."""
        return self._delay_option("download_delay")

    def download_speed(self) -> Optional[int]:
        """Return the ``download_speed`` option, or ``None`` when it is not set.

        A configured value has to be strictly positive.
        """
        speed = self.s.getint("download_speed")
        if speed is None:
            return None
        if speed <= 0:
            self.invalid_value("download_speed", speed, "Must be greater than 0")
        return speed
|
|
|
|
|
2021-05-06 01:02:40 +02:00
|
|
|
|
|
|
|
class LocalCrawler(Crawler):
    """Crawler that walks a directory on the local file system.

    Directories are traversed recursively and every regular file is copied
    block by block into the sink handed out by ``self.download``. Optional
    randomized delays and a speed limit, all taken from the config section,
    slow the process down artificially.
    """

    def __init__(
            self,
            name: str,
            section: LocalCrawlerSection,
            config: Config,
    ):
        """Read target directory, delays and speed limit from ``section``.

        A relative target is resolved against the working directory of the
        default config section.
        """
        super().__init__(name, section, config)

        self._target = config.default_section.working_dir() / section.target()
        self._crawl_delay = section.crawl_delay()
        self._download_delay = section.download_delay()
        self._download_speed = section.download_speed()

        if self._download_speed:
            # Roughly ten blocks per second. The size must never drop to 0:
            # read(0) returns b"", which the copy loop would mistake for
            # end-of-file and leave every copied file empty (this happened
            # for speeds below 10).
            self._block_size = max(1, self._download_speed // 10)
        else:
            self._block_size = 1024**2  # 1 MiB

    async def _run(self) -> None:
        """Crawl the configured target, starting with an empty relative path."""
        await self._crawl_path(self._target, PurePath())

    @anoncritical
    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
        """Dispatch ``path`` to the directory or file handler.

        ``path`` is the real location on disk, ``pure`` the path passed on to
        ``self.crawl`` / ``self.download``. Anything that is neither a
        directory nor a regular file is ignored.
        """
        if path.is_dir():
            await self._crawl_dir(path, pure)
        elif path.is_file():
            await self._crawl_file(path, pure)

    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
        """Crawl all children of the directory ``path``, one after another."""
        cl = await self.crawl(pure)
        if not cl:
            # A falsy result from self.crawl() means there is nothing to do
            # for this directory.
            return

        async with cl:
            # Artificial delay between 50% and 100% of crawl_delay seconds.
            await asyncio.sleep(random.uniform(
                0.5 * self._crawl_delay,
                self._crawl_delay,
            ))

            for child in path.iterdir():
                pure_child = cl.path / child.name
                await self._crawl_path(child, pure_child)

    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        """Copy the file ``path`` into the sink provided by ``self.download``.

        The file's modification time is passed to ``self.download``; a falsy
        result from that call means the file is not copied.
        """
        stat = path.stat()
        mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
        dl = await self.download(pure, mtime=mtime)
        if not dl:
            return

        async with dl as (bar, sink):
            # Artificial delay between 50% and 100% of download_delay seconds.
            await asyncio.sleep(random.uniform(
                0.5 * self._download_delay,
                self._download_delay,
            ))

            bar.set_total(stat.st_size)

            # Time one block should take at the configured speed; it does not
            # change while copying, so compute it once up front.
            if self._download_speed:
                base_delay = self._block_size / self._download_speed
            else:
                base_delay = None

            with open(path, "rb") as f:
                while True:
                    data = f.read(self._block_size)
                    if not data:
                        break

                    sink.file.write(data)
                    bar.advance(len(data))

                    if base_delay is not None:
                        # Jitter of +-20% around the nominal per-block time.
                        delay = random.uniform(0.8 * base_delay, 1.2 * base_delay)
                        await asyncio.sleep(delay)

                sink.done()
|