Add local file crawler

This commit is contained in:
Joscha 2021-05-06 01:02:40 +02:00
parent 273d56c39a
commit 60cd9873bc
7 changed files with 104 additions and 13 deletions

View File

@ -14,6 +14,9 @@ class ProgressBar:
def advance(self, amount: float = 1) -> None: def advance(self, amount: float = 1) -> None:
self._progress.advance(self._taskid, advance=amount) self._progress.advance(self._taskid, advance=amount)
def set_total(self, total: float) -> None:
    # Update the task's total amount of work, e.g. once a download's size
    # becomes known only after the bar was created.
    self._progress.update(self._taskid, total=total)
class TerminalConductor: class TerminalConductor:
def __init__(self) -> None: def __init__(self) -> None:

View File

@ -1,16 +1,17 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from datetime import datetime
from pathlib import Path, PurePath from pathlib import Path, PurePath
# TODO In Python 3.9 and above, AsyncContextManager is deprecated # TODO In Python 3.9 and above, AsyncContextManager is deprecated
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, from typing import (Any, AsyncContextManager, AsyncIterator, Callable,
Callable, Optional, Protocol, TypeVar) Coroutine, Optional, Protocol, TypeVar)
from rich.markup import escape from rich.markup import escape
from .conductor import ProgressBar, TerminalConductor from .conductor import ProgressBar, TerminalConductor
from .config import Config, Section from .config import Config, Section
from .limiter import Limiter from .limiter import Limiter
from .output_dir import OnConflict, OutputDirectory, Redownload from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload
from .transformer import RuleParseException, Transformer from .transformer import RuleParseException, Transformer
@ -37,7 +38,7 @@ def noncritical(f: Wrapped) -> Wrapped:
f(self, *args, **kwargs) f(self, *args, **kwargs)
except Exception as e: except Exception as e:
self.print(f"[red]Something went wrong: {escape(str(e))}") self.print(f"[red]Something went wrong: {escape(str(e))}")
self._error_free = False self.error_free = False
return wrapper # type: ignore return wrapper # type: ignore
@ -61,7 +62,7 @@ class ACrawlerMemberFunction(Protocol):
__self: "Crawler", __self: "Crawler",
*__args: Any, *__args: Any,
**__kwargs: Any, **__kwargs: Any,
) -> Awaitable[None]: ) -> Coroutine[Any, Any, None]:
pass pass
@ -74,7 +75,7 @@ def anoncritical(f: AWrapped) -> AWrapped:
await f(self, *args, **kwargs) await f(self, *args, **kwargs)
except Exception as e: except Exception as e:
self.print(f"[red]Something went wrong: {escape(str(e))}") self.print(f"[red]Something went wrong: {escape(str(e))}")
self._error_free = False self.error_free = False
return wrapper # type: ignore return wrapper # type: ignore
@ -94,7 +95,7 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
class CrawlerSection(Section): class CrawlerSection(Section):
def output_dir(self, name: str) -> Path: def output_dir(self, name: str) -> Path:
return Path(self.s.get("output_dir", name)) return Path(self.s.get("output_dir", name)).expanduser()
def redownload(self) -> Redownload: def redownload(self) -> Redownload:
value = self.s.get("redownload", "never-smart") value = self.s.get("redownload", "never-smart")
@ -158,7 +159,7 @@ class Crawler(ABC):
self._conductor, self._conductor,
) )
self._error_free = False self.error_free = False
def print(self, text: str) -> None: def print(self, text: str) -> None:
""" """
@ -203,11 +204,24 @@ class Crawler(ABC):
def download_bar( def download_bar(
self, self,
path: PurePath, path: PurePath,
size: int, total: Optional[int] = None,
) -> AsyncContextManager[ProgressBar]: ) -> AsyncContextManager[ProgressBar]:
pathstr = escape(str(path)) pathstr = escape(str(path))
desc = f"[bold green]Downloading[/bold green] {pathstr}" desc = f"[bold green]Downloading[/bold green] {pathstr}"
return self.progress_bar(desc, total=size) return self.progress_bar(desc, total=total)
async def download(
        self,
        path: PurePath,
        mtime: Optional[datetime] = None,
        redownload: Optional[Redownload] = None,
        on_conflict: Optional[OnConflict] = None,
) -> Optional[AsyncContextManager[FileSink]]:
    # Thin delegation to the crawler's output directory. Returns None when
    # the output dir decides the file should not be (re)downloaded;
    # otherwise an async context manager yielding a FileSink to write to.
    return await self._output_dir.download(
        path, mtime, redownload, on_conflict)
async def cleanup(self) -> None:
    # Delegate cleanup to the output directory.
    await self._output_dir.cleanup()
async def run(self) -> None: async def run(self) -> None:
""" """

View File

@ -4,7 +4,9 @@ from typing import Callable, Dict
from ..config import Config from ..config import Config
from ..crawler import Crawler, CrawlerSection from ..crawler import Crawler, CrawlerSection
from .dummy import DummyCrawler from .dummy import DummyCrawler
from .local import LocalCrawler, LocalCrawlerSection
CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = { CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = {
"dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)), "dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)),
"local": lambda n, c, s: LocalCrawler(n, c, LocalCrawlerSection(s)),
} }

63
PFERD/crawlers/local.py Normal file
View File

@ -0,0 +1,63 @@
import asyncio
from pathlib import Path, PurePath
from ..config import Config
from ..crawler import Crawler, CrawlerSection, anoncritical
class LocalCrawlerSection(CrawlerSection):
    """Config section for a crawler that reads from the local file system."""

    def path(self) -> Path:
        """Return the configured base path with a leading "~" expanded.

        Reports a missing-value error if the "path" option is absent.
        """
        raw = self.s.get("path")
        if raw is None:
            self.missing_value("path")
        return Path(raw).expanduser()
class LocalCrawler(Crawler):
    """A crawler that mirrors a local directory tree into the output dir."""

    def __init__(
            self,
            name: str,
            config: Config,
            section: LocalCrawlerSection,
    ):
        super().__init__(name, config, section)

        # Base directory to crawl, taken from this crawler's config section.
        self._path = section.path()

    async def crawl(self) -> None:
        await self._crawl_path(self._path, PurePath())
        if self.error_free:
            # Fix: cleanup() is a coroutine (see Crawler.cleanup) and was
            # previously called without "await", so it never actually ran.
            await self.cleanup()

    @anoncritical
    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
        # Dispatch on entry type. Anything that is neither a directory nor a
        # regular file (sockets, broken symlinks, ...) is silently skipped.
        if path.is_dir():
            await self._crawl_dir(path, pure)
        elif path.is_file():
            await self._crawl_file(path, pure)

    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
        # Collect one coroutine per child while the crawl bar is shown, then
        # run them all concurrently.
        tasks = []
        async with self.crawl_bar(pure):
            for child in path.iterdir():
                pure_child = pure / child.name
                tasks.append(self._crawl_path(child, pure_child))
        await asyncio.gather(*tasks)

    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        # NOTE(review): the download bar is labelled with the local path while
        # crawl_bar uses the pure output path — confirm which is intended.
        async with self.download_bar(path) as bar:
            # Size is only known after stat'ing, hence set_total instead of
            # passing total= to download_bar.
            bar.set_total(path.stat().st_size)

            dl = await self.download(pure)
            if not dl:
                # The output dir decided this file need not be (re)downloaded.
                return

            async with dl as sink:
                with open(path, "rb") as f:
                    # Copy in 1 MiB chunks so the bar can advance smoothly.
                    while True:
                        data = f.read(1024**2)
                        if len(data) == 0:
                            break
                        sink.file.write(data)
                        bar.advance(len(data))

                    sink.done()

View File

@ -294,7 +294,7 @@ class OutputDirectory:
info = DownloadInfo(path, local_path, tmp_path, info = DownloadInfo(path, local_path, tmp_path,
heuristics, on_conflict) heuristics, on_conflict)
try: try:
file = open(tmp_path, "bx") file = open(tmp_path, "xb")
return self._sink_context_manager(file, info) return self._sink_context_manager(file, info)
except FileExistsError: except FileExistsError:
pass # Try again pass # Try again

View File

@ -26,7 +26,7 @@ class Pferd:
if crawler_constructor is None: if crawler_constructor is None:
abort = True abort = True
t = escape(repr(crawler_type)) t = escape(repr(crawler_type))
print(f"[red]Error: Unknown type {t}") print(f"[red]Error: Unknown crawler type {t}")
continue continue
crawler = crawler_constructor(name, self._config, section) crawler = crawler_constructor(name, self._config, section)

View File

@ -28,6 +28,15 @@ class MarkConflictException(Exception):
collides_with: PurePath collides_with: PurePath
# TODO Use PurePath.is_relative_to when updating to 3.9
def is_relative_to(a: PurePath, b: PurePath) -> bool:
    """Backport of PurePath.is_relative_to: is *a* equal to or inside *b*?"""
    try:
        a.relative_to(b)
    except ValueError:
        return False
    return True
class Report: class Report:
""" """
A report of a synchronization. Includes all files found by the crawler, as A report of a synchronization. Includes all files found by the crawler, as
@ -53,7 +62,7 @@ class Report:
if path == known_path: if path == known_path:
raise MarkDuplicateException(path) raise MarkDuplicateException(path)
if path.relative_to(known_path) or known_path.relative_to(path): if is_relative_to(path, known_path) or is_relative_to(known_path, path):
raise MarkConflictException(path, known_path) raise MarkConflictException(path, known_path)
self.known_files.add(path) self.known_files.add(path)