diff --git a/PFERD/conductor.py b/PFERD/conductor.py
index 161a287..76d0e2a 100644
--- a/PFERD/conductor.py
+++ b/PFERD/conductor.py
@@ -14,6 +14,9 @@ class ProgressBar:
     def advance(self, amount: float = 1) -> None:
         self._progress.advance(self._taskid, advance=amount)
 
+    def set_total(self, total: float) -> None:
+        self._progress.update(self._taskid, total=total)
+
 
 class TerminalConductor:
     def __init__(self) -> None:
diff --git a/PFERD/crawler.py b/PFERD/crawler.py
index ff779ab..d088b21 100644
--- a/PFERD/crawler.py
+++ b/PFERD/crawler.py
@@ -1,16 +1,17 @@
 from abc import ABC, abstractmethod
 from contextlib import asynccontextmanager
+from datetime import datetime
 from pathlib import Path, PurePath
 # TODO In Python 3.9 and above, AsyncContextManager is deprecated
-from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
-                    Callable, Optional, Protocol, TypeVar)
+from typing import (Any, AsyncContextManager, AsyncIterator, Callable,
+                    Coroutine, Optional, Protocol, TypeVar)
 
 from rich.markup import escape
 
 from .conductor import ProgressBar, TerminalConductor
 from .config import Config, Section
 from .limiter import Limiter
-from .output_dir import OnConflict, OutputDirectory, Redownload
+from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload
 from .transformer import RuleParseException, Transformer
 
 
@@ -37,7 +38,7 @@ def noncritical(f: Wrapped) -> Wrapped:
             f(self, *args, **kwargs)
         except Exception as e:
             self.print(f"[red]Something went wrong: {escape(str(e))}")
-            self._error_free = False
+            self.error_free = False
 
     return wrapper  # type: ignore
 
@@ -61,7 +62,7 @@ class ACrawlerMemberFunction(Protocol):
             __self: "Crawler",
             *__args: Any,
             **__kwargs: Any,
-    ) -> Awaitable[None]:
+    ) -> Coroutine[Any, Any, None]:
         pass
 
 
@@ -74,7 +75,7 @@ def anoncritical(f: AWrapped) -> AWrapped:
             await f(self, *args, **kwargs)
         except Exception as e:
             self.print(f"[red]Something went wrong: {escape(str(e))}")
-            self._error_free = False
+            self.error_free = False
 
     return wrapper  # type: ignore
 
@@ -94,7 +95,7 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
 
 class CrawlerSection(Section):
     def output_dir(self, name: str) -> Path:
-        return Path(self.s.get("output_dir", name))
+        return Path(self.s.get("output_dir", name)).expanduser()
 
     def redownload(self) -> Redownload:
         value = self.s.get("redownload", "never-smart")
@@ -158,7 +159,7 @@ class Crawler(ABC):
             self._conductor,
         )
 
-        self._error_free = False
+        self.error_free = True
 
     def print(self, text: str) -> None:
         """
@@ -203,11 +204,24 @@
     def download_bar(
             self,
             path: PurePath,
-            size: int,
+            total: Optional[int] = None,
     ) -> AsyncContextManager[ProgressBar]:
         pathstr = escape(str(path))
         desc = f"[bold green]Downloading[/bold green] {pathstr}"
-        return self.progress_bar(desc, total=size)
+        return self.progress_bar(desc, total=total)
+
+    async def download(
+            self,
+            path: PurePath,
+            mtime: Optional[datetime] = None,
+            redownload: Optional[Redownload] = None,
+            on_conflict: Optional[OnConflict] = None,
+    ) -> Optional[AsyncContextManager[FileSink]]:
+        return await self._output_dir.download(
+            path, mtime, redownload, on_conflict)
+
+    async def cleanup(self) -> None:
+        await self._output_dir.cleanup()
 
     async def run(self) -> None:
         """
diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py
index 69dac39..15ef403 100644
--- a/PFERD/crawlers/__init__.py
+++ b/PFERD/crawlers/__init__.py
@@ -4,7 +4,9 @@ from typing import Callable, Dict
 from ..config import Config
 from ..crawler import Crawler, CrawlerSection
 from .dummy import DummyCrawler
+from .local import LocalCrawler, LocalCrawlerSection
 
 CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = {
     "dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)),
+    "local": lambda n, c, s: LocalCrawler(n, c, LocalCrawlerSection(s)),
 }
diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py
new file mode 100644
index 0000000..77ebf81
--- /dev/null
+++ b/PFERD/crawlers/local.py
@@ -0,0 +1,63 @@
+import asyncio
+from pathlib import Path, PurePath
+
+from ..config import Config
+from ..crawler import Crawler, CrawlerSection, anoncritical
+
+
+class LocalCrawlerSection(CrawlerSection):
+    def path(self) -> Path:
+        value = self.s.get("path")
+        if value is None:
+            self.missing_value("path")
+        return Path(value).expanduser()
+
+
+class LocalCrawler(Crawler):
+    def __init__(
+            self,
+            name: str,
+            config: Config,
+            section: LocalCrawlerSection,
+    ):
+        super().__init__(name, config, section)
+
+        self._path = section.path()
+
+    async def crawl(self) -> None:
+        await self._crawl_path(self._path, PurePath())
+        if self.error_free:
+            await self.cleanup()
+
+    @anoncritical
+    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
+        if path.is_dir():
+            await self._crawl_dir(path, pure)
+        elif path.is_file():
+            await self._crawl_file(path, pure)
+
+    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
+        tasks = []
+        async with self.crawl_bar(pure):
+            for child in path.iterdir():
+                pure_child = pure / child.name
+                tasks.append(self._crawl_path(child, pure_child))
+        await asyncio.gather(*tasks)
+
+    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
+        async with self.download_bar(path) as bar:
+            bar.set_total(path.stat().st_size)
+
+            dl = await self.download(pure)
+            if not dl:
+                return
+
+            async with dl as sink:
+                with open(path, "rb") as f:
+                    while True:
+                        data = f.read(1024**2)
+                        if len(data) == 0:
+                            break
+                        sink.file.write(data)
+                        bar.advance(len(data))
+                sink.done()
diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py
index 9276069..c875574 100644
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -294,7 +294,7 @@ class OutputDirectory:
         info = DownloadInfo(path, local_path, tmp_path,
                             heuristics, on_conflict)
         try:
-            file = open(tmp_path, "bx")
+            file = open(tmp_path, "xb")
             return self._sink_context_manager(file, info)
         except FileExistsError:
             pass  # Try again
diff --git a/PFERD/pferd.py b/PFERD/pferd.py
index 54356c1..7cdbfa0 100644
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -26,7 +26,7 @@ class Pferd:
             if crawler_constructor is None:
                 abort = True
                 t = escape(repr(crawler_type))
-                print(f"[red]Error: Unknown type {t}")
+                print(f"[red]Error: Unknown crawler type {t}")
                 continue
 
             crawler = crawler_constructor(name, self._config, section)
diff --git a/PFERD/report.py b/PFERD/report.py
index 38e8130..b98c90c 100644
--- a/PFERD/report.py
+++ b/PFERD/report.py
@@ -28,6 +28,15 @@ class MarkConflictException(Exception):
     collides_with: PurePath
 
 
+# TODO Use PurePath.is_relative_to when updating to 3.9
+def is_relative_to(a: PurePath, b: PurePath) -> bool:
+    try:
+        a.relative_to(b)
+        return True
+    except ValueError:
+        return False
+
+
 class Report:
     """
     A report of a synchronization. Includes all files found by the crawler, as
@@ -53,7 +62,7 @@
         if path == known_path:
             raise MarkDuplicateException(path)
 
-        if path.relative_to(known_path) or known_path.relative_to(path):
+        if is_relative_to(path, known_path) or is_relative_to(known_path, path):
             raise MarkConflictException(path, known_path)
 
         self.known_files.add(path)
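
A note on the is_relative_to helper in PFERD/report.py: it backports
Python 3.9's PurePath.is_relative_to by treating the ValueError that
relative_to raises for unrelated paths as "not relative". A quick
illustration (not part of the patch):

    >>> from pathlib import PurePath
    >>> is_relative_to(PurePath("a/b/c"), PurePath("a"))
    True
    >>> is_relative_to(PurePath("a/b"), PurePath("c"))
    False

This also fixes the old conflict check, which used relative_to's return
value directly and therefore raised ValueError for unrelated paths
instead of evaluating to False.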
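Usage sketch (not part of the patch): with this applied, a config
section for the new crawler could look roughly like the following.
The "[crawl:notes]" section name and the "type" key are assumptions
about how Config and Pferd name crawler sections; "path" and
"output_dir" come from LocalCrawlerSection and CrawlerSection above,
and "redownload" falls back to "never-smart" when omitted.

    [crawl:notes]
    type = local
    path = ~/Documents/notes
    output_dir = notes
    redownload = never-smart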