Add local file crawler

This commit is contained in:
Joscha 2021-05-06 01:02:40 +02:00
parent 273d56c39a
commit 60cd9873bc
7 changed files with 104 additions and 13 deletions

View File

@@ -14,6 +14,9 @@ class ProgressBar:
def advance(self, amount: float = 1) -> None:
self._progress.advance(self._taskid, advance=amount)
def set_total(self, total) -> None:
self._progress.update(self._taskid, total=total)
class TerminalConductor:
def __init__(self) -> None:

View File

@ -1,16 +1,17 @@
from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from datetime import datetime
from pathlib import Path, PurePath
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
Callable, Optional, Protocol, TypeVar)
from typing import (Any, AsyncContextManager, AsyncIterator, Callable,
Coroutine, Optional, Protocol, TypeVar)
from rich.markup import escape
from .conductor import ProgressBar, TerminalConductor
from .config import Config, Section
from .limiter import Limiter
from .output_dir import OnConflict, OutputDirectory, Redownload
from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload
from .transformer import RuleParseException, Transformer
@ -37,7 +38,7 @@ def noncritical(f: Wrapped) -> Wrapped:
f(self, *args, **kwargs)
except Exception as e:
self.print(f"[red]Something went wrong: {escape(str(e))}")
self._error_free = False
self.error_free = False
return wrapper # type: ignore
@ -61,7 +62,7 @@ class ACrawlerMemberFunction(Protocol):
__self: "Crawler",
*__args: Any,
**__kwargs: Any,
) -> Awaitable[None]:
) -> Coroutine[Any, Any, None]:
pass
@ -74,7 +75,7 @@ def anoncritical(f: AWrapped) -> AWrapped:
await f(self, *args, **kwargs)
except Exception as e:
self.print(f"[red]Something went wrong: {escape(str(e))}")
self._error_free = False
self.error_free = False
return wrapper # type: ignore
@ -94,7 +95,7 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
class CrawlerSection(Section):
def output_dir(self, name: str) -> Path:
return Path(self.s.get("output_dir", name))
return Path(self.s.get("output_dir", name)).expanduser()
def redownload(self) -> Redownload:
value = self.s.get("redownload", "never-smart")
@ -158,7 +159,7 @@ class Crawler(ABC):
self._conductor,
)
self._error_free = False
self.error_free = False
def print(self, text: str) -> None:
"""
@ -203,11 +204,24 @@ class Crawler(ABC):
def download_bar(
self,
path: PurePath,
size: int,
total: Optional[int] = None,
) -> AsyncContextManager[ProgressBar]:
pathstr = escape(str(path))
desc = f"[bold green]Downloading[/bold green] {pathstr}"
return self.progress_bar(desc, total=size)
return self.progress_bar(desc, total=total)
async def download(
self,
path: PurePath,
mtime: Optional[datetime] = None,
redownload: Optional[Redownload] = None,
on_conflict: Optional[OnConflict] = None,
) -> Optional[AsyncContextManager[FileSink]]:
return await self._output_dir.download(
path, mtime, redownload, on_conflict)
async def cleanup(self) -> None:
await self._output_dir.cleanup()
async def run(self) -> None:
"""

View File

@ -4,7 +4,9 @@ from typing import Callable, Dict
from ..config import Config
from ..crawler import Crawler, CrawlerSection
from .dummy import DummyCrawler
from .local import LocalCrawler, LocalCrawlerSection
CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = {
"dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)),
"local": lambda n, c, s: LocalCrawler(n, c, LocalCrawlerSection(s)),
}

63
PFERD/crawlers/local.py Normal file
View File

@@ -0,0 +1,63 @@
import asyncio
from pathlib import Path, PurePath
from ..config import Config
from ..crawler import Crawler, CrawlerSection, anoncritical
class LocalCrawlerSection(CrawlerSection):
    """Config section for the local-filesystem crawler."""

    def path(self) -> Path:
        """
        Return the root directory to crawl, read from the "path" option.

        Aborts via missing_value() when the option is absent; "~" is
        expanded to the user's home directory.
        """
        raw = self.s.get("path")
        if raw is None:
            self.missing_value("path")
        return Path(raw).expanduser()
class LocalCrawler(Crawler):
    """Crawler that mirrors a directory tree from the local filesystem."""

    def __init__(
            self,
            name: str,
            config: Config,
            section: LocalCrawlerSection,
    ):
        """
        Args:
            name: Name of this crawler instance (used for the output dir).
            config: The global configuration.
            section: The crawler's own config section; supplies the root path.
        """
        super().__init__(name, config, section)

        self._path = section.path()

    async def crawl(self) -> None:
        """Crawl the configured root path, then clean up stale files."""
        await self._crawl_path(self._path, PurePath())
        if self.error_free:
            # BUG FIX: cleanup() is a coroutine and was previously called
            # without "await", so the coroutine was never executed and
            # stale files were never removed.
            await self.cleanup()

    @anoncritical
    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
        # Dispatch on filesystem entry type; anything that is neither a
        # directory nor a regular file (sockets, broken symlinks, ...) is
        # silently ignored.
        if path.is_dir():
            await self._crawl_dir(path, pure)
        elif path.is_file():
            await self._crawl_file(path, pure)

    async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
        """Recursively crawl all children of a directory, concurrently."""
        tasks = []
        async with self.crawl_bar(pure):
            for child in path.iterdir():
                pure_child = pure / child.name
                tasks.append(self._crawl_path(child, pure_child))
        await asyncio.gather(*tasks)

    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        """Copy a single file into the output directory, chunk by chunk."""
        async with self.download_bar(path) as bar:
            bar.set_total(path.stat().st_size)

            # download() returns None when the output dir decides the file
            # does not need to be (re)written.
            dl = await self.download(pure)
            if not dl:
                return

            async with dl as sink:
                with open(path, "rb") as f:
                    while True:
                        data = f.read(1024 ** 2)  # 1 MiB chunks
                        if len(data) == 0:
                            break
                        sink.file.write(data)
                        bar.advance(len(data))
                    sink.done()

View File

@ -294,7 +294,7 @@ class OutputDirectory:
info = DownloadInfo(path, local_path, tmp_path,
heuristics, on_conflict)
try:
file = open(tmp_path, "bx")
file = open(tmp_path, "xb")
return self._sink_context_manager(file, info)
except FileExistsError:
pass # Try again

View File

@ -26,7 +26,7 @@ class Pferd:
if crawler_constructor is None:
abort = True
t = escape(repr(crawler_type))
print(f"[red]Error: Unknown type {t}")
print(f"[red]Error: Unknown crawler type {t}")
continue
crawler = crawler_constructor(name, self._config, section)

View File

@ -28,6 +28,15 @@ class MarkConflictException(Exception):
collides_with: PurePath
# TODO Use PurePath.is_relative_to when updating to 3.9
def is_relative_to(a: PurePath, b: PurePath) -> bool:
    """
    Return True if *a* equals *b* or lies somewhere beneath it.

    Backport of PurePath.is_relative_to, which is only available from
    Python 3.9 onwards (see the TODO above this function).
    """
    try:
        a.relative_to(b)
    except ValueError:
        return False
    return True
class Report:
"""
A report of a synchronization. Includes all files found by the crawler, as
@ -53,7 +62,7 @@ class Report:
if path == known_path:
raise MarkDuplicateException(path)
if path.relative_to(known_path) or known_path.relative_to(path):
if is_relative_to(path, known_path) or is_relative_to(known_path, path):
raise MarkConflictException(path, known_path)
self.known_files.add(path)