Implement output directory

This commit is contained in:
Joscha 2021-05-05 18:08:34 +02:00
parent fde811ae5a
commit bbfdadc463
4 changed files with 381 additions and 12 deletions

View File

@ -10,6 +10,7 @@ from rich.markup import escape
from .conductor import ProgressBar, TerminalConductor from .conductor import ProgressBar, TerminalConductor
from .config import Config from .config import Config
from .limiter import Limiter from .limiter import Limiter
from .output_dir import OnConflict, OutputDirectory, Redownload
from .transformer import RuleParseException, Transformer from .transformer import RuleParseException, Transformer
@ -44,8 +45,11 @@ class Crawler(ABC):
e.pretty_print() e.pretty_print()
raise CrawlerLoadException() raise CrawlerLoadException()
# working_dir = Path(section.get("working_dir", "")) output_dir = config.working_dir / section.get("output_dir", name)
# output_dir = working_dir / section.get("output_dir", name) redownload = Redownload.NEVER_SMART
on_conflict = OnConflict.PROMPT
self._output_dir = OutputDirectory(
output_dir, redownload, on_conflict, self._conductor)
def print(self, text: str) -> None: def print(self, text: str) -> None:
""" """

365
PFERD/output_dir.py Normal file
View File

@ -0,0 +1,365 @@
import filecmp
import os
import random
import shutil
import string
from contextlib import asynccontextmanager
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from pathlib import Path, PurePath
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
from typing import AsyncContextManager, AsyncIterator, BinaryIO, Optional
from .conductor import TerminalConductor
from .report import MarkConflictException, MarkDuplicateException, Report
from .utils import prompt_yes_no
SUFFIX_CHARS = string.ascii_lowercase + string.digits
SUFFIX_LENGTH = 6
TRIES = 5
class OutputDirException(Exception):
pass
class Redownload(Enum):
NEVER = "never"
NEVER_SMART = "never-smart"
ALWAYS = "always"
ALWAYS_SMART = "always-smart"
class OnConflict(Enum):
PROMPT = "prompt"
LOCAL_FIRST = "local-first"
REMOTE_FIRST = "remote-first"
NO_DELETE = "no-delete"
@dataclass
class Heuristics:
mtime: Optional[datetime]
class FileSink:
def __init__(self, file: BinaryIO):
self._file = file
self._done = False
@property
def file(self) -> BinaryIO:
return self._file
def done(self) -> None:
self._done = True
def is_done(self) -> bool:
return self._done
@dataclass
class DownloadInfo:
path: PurePath
local_path: Path
tmp_path: Path
heuristics: Heuristics
on_conflict: OnConflict
success: bool = False
class OutputDirectory:
def __init__(
self,
root: Path,
redownload: Redownload,
on_conflict: OnConflict,
conductor: TerminalConductor,
):
self._root = root
self._redownload = redownload
self._on_conflict = on_conflict
self._conductor = conductor
self._report = Report()
def _mark(self, path: PurePath) -> None:
"""
May throw an OutputDirException
"""
try:
self._report.mark(path)
except MarkDuplicateException:
msg = "Another file has already been placed here."
raise OutputDirException(msg)
except MarkConflictException as e:
msg = f"Collides with other file: {e.collides_with}"
raise OutputDirException(msg)
def _resolve(self, path: PurePath) -> Path:
"""
May throw an OutputDirException.
"""
if ".." in path.parts:
msg = f"Path {path} contains forbidden '..'"
raise OutputDirException(msg)
return self._root / path
def _should_download(
self,
local_path: Path,
heuristics: Heuristics,
redownload: Redownload,
) -> bool:
# If we don't have a *file* at the local path, we'll always redownload
# since we know that the remote is different from the local files. This
# includes the case where no local file exists.
if not local_path.is_file():
return True
if redownload == Redownload.NEVER:
return False
elif redownload == Redownload.ALWAYS:
return True
stat = local_path.stat()
remote_newer = None
if mtime := heuristics.mtime:
remote_newer = mtime.timestamp() > stat.st_mtime
if redownload == Redownload.NEVER_SMART:
if remote_newer is None:
return False
else:
return remote_newer
elif redownload == Redownload.ALWAYS_SMART:
if remote_newer is None:
return True
else:
return not remote_newer
# This should never be reached
raise ValueError(f"{redownload!r} is not a valid redownload policy")
# The following conflict resolution functions all return False if the local
# file(s) should be kept and True if they should be replaced by the remote
# files.
async def _conflict_lfrf(
self,
on_conflict: OnConflict,
path: PurePath,
) -> bool:
if on_conflict == OnConflict.PROMPT:
async with self._conductor.exclusive_output():
prompt = f"Replace {path} with remote file?"
return await prompt_yes_no(prompt, default=False)
elif on_conflict == OnConflict.LOCAL_FIRST:
return False
elif on_conflict == OnConflict.REMOTE_FIRST:
return True
elif on_conflict == OnConflict.NO_DELETE:
return True
# This should never be reached
raise ValueError(f"{on_conflict!r} is not a valid conflict policy")
async def _conflict_ldrf(
self,
on_conflict: OnConflict,
path: PurePath,
) -> bool:
if on_conflict == OnConflict.PROMPT:
async with self._conductor.exclusive_output():
prompt = f"Recursively delete {path} and replace with remote file?"
return await prompt_yes_no(prompt, default=False)
elif on_conflict == OnConflict.LOCAL_FIRST:
return False
elif on_conflict == OnConflict.REMOTE_FIRST:
return True
elif on_conflict == OnConflict.NO_DELETE:
return False
# This should never be reached
raise ValueError(f"{on_conflict!r} is not a valid conflict policy")
async def _conflict_lfrd(
self,
on_conflict: OnConflict,
path: PurePath,
parent: PurePath,
) -> bool:
if on_conflict == OnConflict.PROMPT:
async with self._conductor.exclusive_output():
prompt = f"Delete {parent} so remote file {path} can be downloaded?"
return await prompt_yes_no(prompt, default=False)
elif on_conflict == OnConflict.LOCAL_FIRST:
return False
elif on_conflict == OnConflict.REMOTE_FIRST:
return True
elif on_conflict == OnConflict.NO_DELETE:
return False
# This should never be reached
raise ValueError(f"{on_conflict!r} is not a valid conflict policy")
async def _conflict_delete_lf(
self,
on_conflict: OnConflict,
path: PurePath,
) -> bool:
if on_conflict == OnConflict.PROMPT:
async with self._conductor.exclusive_output():
prompt = f"Delete {path}?"
return await prompt_yes_no(prompt, default=False)
elif on_conflict == OnConflict.LOCAL_FIRST:
return False
elif on_conflict == OnConflict.REMOTE_FIRST:
return True
elif on_conflict == OnConflict.NO_DELETE:
return False
# This should never be reached
raise ValueError(f"{on_conflict!r} is not a valid conflict policy")
def _tmp_path(self, base: Path, suffix_length: int) -> Path:
prefix = "" if base.name.startswith(".") else "."
suffix = random.choices(SUFFIX_CHARS, k=suffix_length)
name = f"{prefix}{base.name}.tmp.{suffix}"
return base.parent / name
@asynccontextmanager
async def _sink_context_manager(
self,
file: BinaryIO,
info: DownloadInfo,
) -> AsyncIterator[FileSink]:
sink = FileSink(file)
try:
with file:
yield sink
finally:
info.success = sink.is_done()
await self._after_download(info)
async def download(
self,
path: PurePath,
mtime: Optional[datetime] = None,
redownload: Optional[Redownload] = None,
on_conflict: Optional[OnConflict] = None,
) -> Optional[AsyncContextManager[FileSink]]:
"""
May throw an OutputDirException.
"""
heuristics = Heuristics(mtime)
redownload = self._redownload if redownload is None else redownload
on_conflict = self._on_conflict if on_conflict is None else on_conflict
local_path = self._resolve(path)
self._mark(path)
if not self._should_download(local_path, heuristics, redownload):
return None
# Detect and solve local-dir-remote-file conflict
if local_path.is_dir():
if await self._conflict_ldrf(on_conflict, path):
shutil.rmtree(local_path)
else:
return None
# Detect and solve local-file-remote-dir conflict
for parent in path.parents:
local_parent = self._resolve(parent)
if local_parent.exists() and not local_parent.is_dir():
if await self._conflict_lfrd(on_conflict, path, parent):
local_parent.unlink()
break
else:
return None
# Ensure parent directory exists
local_path.parent.mkdir(parents=True, exist_ok=True)
# Create tmp file
for attempt in range(TRIES):
suffix_length = SUFFIX_LENGTH + 2 * attempt
tmp_path = self._tmp_path(local_path, suffix_length)
info = DownloadInfo(path, local_path, tmp_path,
heuristics, on_conflict)
try:
file = open(tmp_path, "bx")
return self._sink_context_manager(file, info)
except FileExistsError:
pass # Try again
return None
async def _after_download(self, info: DownloadInfo) -> None:
changed = False
if not info.success:
info.tmp_path.unlink()
return
# Solve conflicts arising from existing local file
if info.local_path.exists():
changed = True
if filecmp.cmp(info.local_path, info.tmp_path):
info.tmp_path.unlink()
return
if not await self._conflict_lfrf(info.on_conflict, info.path):
info.tmp_path.unlink()
return
# Modify metadata if necessary
if mtime := info.heuristics.mtime:
# TODO Pick an implementation
# Rounding up to avoid inaccuracies in how the OS stores timestamps
# mtimestamp = math.ceil(mtime.timestamp())
mtimestamp = mtime.timestamp()
os.utime(info.tmp_path, times=(mtimestamp, mtimestamp))
info.tmp_path.replace(info.local_path)
if changed:
self._report.change_file(info.path)
else:
self._report.add_file(info.path)
def cleanup(self) -> None:
self._cleanup_dir(self._root, PurePath())
def _cleanup(self, path: Path, pure: PurePath) -> None:
if path.is_dir():
self._cleanup_dir(path, pure)
elif path.is_file():
self._cleanup_file(path, pure)
def _cleanup_dir(self, path: Path, pure: PurePath) -> None:
for child in path.iterdir():
pure_child = pure / child.name
self._cleanup(child, pure_child)
try:
path.rmdir()
except OSError:
pass
def _cleanup_file(self, path: Path, pure: PurePath) -> None:
if self._report.marked(pure):
return
if self._conflict_delete_lf(self._on_conflict, pure):
try:
path.unlink()
self._report.delete_file(pure)
except OSError:
pass

View File

@ -25,9 +25,6 @@ class Pferd:
crawler_constructor = CRAWLERS.get(crawler_type) crawler_constructor = CRAWLERS.get(crawler_type)
if crawler_constructor is None: if crawler_constructor is None:
abort = True abort = True
if crawler_type is None:
print("[red]Error: No type")
else:
t = escape(repr(crawler_type)) t = escape(repr(crawler_type))
print(f"[red]Error: Unknown type {t}") print(f"[red]Error: Unknown type {t}")
continue continue

View File

@ -34,14 +34,14 @@ class Report:
well as the set of changes made to local files. well as the set of changes made to local files.
""" """
def __init__(self): def __init__(self) -> None:
self.known_files: Set[PurePath] = set() self.known_files: Set[PurePath] = set()
self.new_files: Set[PurePath] = set() self.new_files: Set[PurePath] = set()
self.changed_files: Set[PurePath] = set() self.changed_files: Set[PurePath] = set()
self.deleted_files: Set[PurePath] = set() self.deleted_files: Set[PurePath] = set()
def mark(self, path: PurePath): def mark(self, path: PurePath) -> None:
""" """
Mark a previously unknown file as known. Mark a previously unknown file as known.
@ -58,21 +58,24 @@ class Report:
self.known_files.add(path) self.known_files.add(path)
def add_file(self, path: PurePath): def marked(self, path: PurePath) -> bool:
return path in self.known_files
def add_file(self, path: PurePath) -> None:
""" """
Unlike mark(), this function accepts any paths. Unlike mark(), this function accepts any paths.
""" """
self.new_files.add(path) self.new_files.add(path)
def change_file(self, path: PurePath): def change_file(self, path: PurePath) -> None:
""" """
Unlike mark(), this function accepts any paths. Unlike mark(), this function accepts any paths.
""" """
self.changed_files.add(path) self.changed_files.add(path)
def delete_file(self, path: PurePath): def delete_file(self, path: PurePath) -> None:
""" """
Unlike mark(), this function accepts any paths. Unlike mark(), this function accepts any paths.
""" """