2021-05-06 01:02:40 +02:00
|
|
|
import asyncio
|
2021-05-13 19:42:40 +02:00
|
|
|
import datetime
|
2021-05-06 01:02:40 +02:00
|
|
|
from pathlib import Path, PurePath
|
|
|
|
|
2021-05-10 23:50:16 +02:00
|
|
|
from ..conductor import TerminalConductor
|
2021-05-06 01:02:40 +02:00
|
|
|
from ..config import Config
|
|
|
|
from ..crawler import Crawler, CrawlerSection, anoncritical
|
|
|
|
|
|
|
|
|
|
|
|
class LocalCrawlerSection(CrawlerSection):
    """Configuration section for a local crawler, exposing its ``path`` option."""

    def path(self) -> Path:
        """Return the configured ``path`` option with a leading ``~`` expanded.

        The raw value is looked up in ``self.s``. When the option is absent,
        ``self.missing_value("path")`` is called before anything else happens.
        """
        configured = self.s.get("path")
        if configured is None:
            # NOTE(review): presumably missing_value() raises and never
            # returns -- confirm against CrawlerSection.
            self.missing_value("path")
        expanded = Path(configured).expanduser()
        return expanded
|
|
|
|
|
|
|
|
|
|
|
|
class LocalCrawler(Crawler):
    """Crawler that walks a local path and passes each regular file to ``self.download``."""

    def __init__(
        self,
        name: str,
        section: LocalCrawlerSection,
        config: Config,
        conductor: TerminalConductor,
    ):
        """Initialise the base crawler and remember the root path to crawl.

        The root is the section's ``path`` option joined onto
        ``config.working_dir``.
        """
        super().__init__(name, section, config, conductor)

        base_dir = config.working_dir
        self._path = base_dir / section.path()

    async def crawl(self) -> None:
        """Crawl the root path, then call ``cleanup()`` if ``error_free`` is truthy."""
        root = PurePath()
        await self._crawl_path(self._path, root)

        if not self.error_free:
            return
        await self.cleanup()

    @anoncritical
    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
        """Dispatch ``path`` to the directory or the file handler.

        ``path`` is the real location on disk; ``pure`` is the matching path
        relative to the crawl root. Entries that are neither a directory nor a
        regular file are ignored.
        """
        if path.is_dir():
            await self._crawl_dir(path, pure)
            return

        if path.is_file():
            await self._crawl_file(path, pure)

    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
        """Crawl every direct child of the directory ``path``.

        One ``_crawl_path`` coroutine is created per child while the crawl bar
        for ``pure`` is open; the coroutines are awaited together once the bar
        context has been left.
        """
        pending = []
        async with self.crawl_bar(pure):
            pending.extend(
                self._crawl_path(entry, pure / entry.name)
                for entry in path.iterdir()
            )
        await asyncio.gather(*pending)

    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        """Copy the file at ``path`` into the sink obtained for ``pure``.

        The file's modification time is handed to ``self.download``; if that
        call yields a falsy result nothing is copied. Otherwise the content is
        streamed in chunks of ``1024**2`` bytes, advancing the progress bar
        after every chunk, and the sink is marked as done at the end.
        """
        chunk_size = 1024**2

        async with self.download_bar(path) as bar:
            info = path.stat()
            # fromtimestamp() without a tz argument yields a naive datetime
            # in local time.
            modified = datetime.datetime.fromtimestamp(info.st_mtime)

            download = await self.download(pure, mtime=modified)
            if not download:
                return

            bar.set_total(info.st_size)

            async with download as sink:
                with open(path, "rb") as source:
                    # iter() with a sentinel stops at the first empty read,
                    # i.e. at end of file.
                    for chunk in iter(lambda: source.read(chunk_size), b""):
                        sink.file.write(chunk)
                        bar.advance(len(chunk))
                    sink.done()
|