From 6ca0ecdf05b85c1986de50724443aaabb5755506 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Sun, 23 May 2021 20:46:12 +0200
Subject: [PATCH] Load and store reports

Add Report.load() and Report.store(), which read and write a report as a
JSON object with the keys "reserved", "known", "added", "changed" and
"deleted", each holding a sorted list of path strings. Missing keys are
treated as empty lists; any other shape raises the new ReportLoadError.

OutputDirectory reserves the ".report" file inside the output directory,
loads the previous report before the crawler runs and stores the current
report after cleanup. Failing to load the previous report is only
explained in the log; failing to store the new report produces a
warning. Both reports are exposed via the report and prev_report
properties on OutputDirectory and Crawler.

Report files are always read and written as UTF-8. A report file that
cannot be decoded is handled like any other unreadable report instead of
aborting the crawl.

Report.new_files is renamed to Report.added_files.

---
 PFERD/crawl/crawler.py | 12 +++++++-
 PFERD/output_dir.py    | 35 ++++++++++++++++++++-
 PFERD/report.py        | 69 +++++++++++++++++++++++++++++++++++++++---
 3 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py
index a79e968..60ea708 100644
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@@ -11,7 +11,7 @@ from ..config import Config, Section
 from ..limiter import Limiter
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
-from ..report import MarkConflictError, MarkDuplicateError
+from ..report import MarkConflictError, MarkDuplicateError, Report
 from ..transformer import Transformer
 from ..utils import ReusableAsyncContextManager, fmt_path
 
@@ -229,6 +229,14 @@ class Crawler(ABC):
             section.on_conflict(),
         )
 
+    @property
+    def report(self) -> Report:
+        return self._output_dir.report
+
+    @property
+    def prev_report(self) -> Optional[Report]:
+        return self._output_dir.prev_report
+
     @staticmethod
     async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
         """
@@ -298,8 +306,10 @@ class Crawler(ABC):
 
         with log.show_progress():
             self._output_dir.prepare()
+            self._output_dir.load_prev_report()
             await self._run()
             await self._cleanup()
+            self._output_dir.store_report()
 
     @abstractmethod
     async def _run(self) -> None:
diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py
index 06cfe6b..d60a312 100644
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -1,4 +1,5 @@
 import filecmp
+import json
 import os
 import random
 import shutil
@@ -13,7 +14,7 @@ from typing import BinaryIO, Iterator, Optional, Tuple
 from rich.markup import escape
 
 from .logging import log
-from .report import Report
+from .report import Report, ReportLoadError
 from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no
 
 SUFFIX_CHARS = string.ascii_lowercase + string.digits
@@ -134,6 +135,8 @@ class FileSinkToken(ReusableAsyncContextManager[FileSink]):
 
 
 class OutputDirectory:
+    REPORT_FILE = PurePath(".report")
+
     def __init__(
         self,
         root: Path,
@@ -144,7 +147,19 @@ class OutputDirectory:
         self._redownload = redownload
         self._on_conflict = on_conflict
 
+        self._report_path = self.resolve(self.REPORT_FILE)
         self._report = Report()
+        self._prev_report: Optional[Report] = None
+
+        self.register_reserved(self.REPORT_FILE)
+
+    @property
+    def report(self) -> Report:
+        return self._report
+
+    @property
+    def prev_report(self) -> Optional[Report]:
+        return self._prev_report
 
     def prepare(self) -> None:
         log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}")
@@ -452,3 +467,21 @@ class OutputDirectory:
                 self._report.delete_file(pure)
             except OSError:
                 pass
+
+    def load_prev_report(self) -> None:
+        log.explain_topic(f"Loading previous report from {fmt_real_path(self._report_path)}")
+        try:
+            self._prev_report = Report.load(self._report_path)
+            log.explain("Loaded report successfully")
+        except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e:
+            log.explain("Failed to load report")
+            log.explain(str(e))
+
+    def store_report(self) -> None:
+        log.explain_topic(f"Storing report to {fmt_real_path(self._report_path)}")
+        try:
+            self._report.store(self._report_path)
+            log.explain("Stored report successfully")
+        except OSError as e:
+            log.warn(f"Failed to save report to {fmt_real_path(self._report_path)}")
+            log.warn_contd(str(e))
diff --git a/PFERD/report.py b/PFERD/report.py
index 7d8aa85..619c621 100644
--- a/PFERD/report.py
+++ b/PFERD/report.py
@@ -1,5 +1,10 @@
-from pathlib import PurePath
-from typing import Set
+import json
+from pathlib import Path, PurePath
+from typing import Any, Dict, List, Set
+
+
+class ReportLoadError(Exception):
+    pass
 
 
 class MarkDuplicateError(Exception):
@@ -48,10 +53,66 @@ class Report:
         self.reserved_files: Set[PurePath] = set()
         self.known_files: Set[PurePath] = set()
 
-        self.new_files: Set[PurePath] = set()
+        self.added_files: Set[PurePath] = set()
         self.changed_files: Set[PurePath] = set()
         self.deleted_files: Set[PurePath] = set()
 
+    @staticmethod
+    def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]:
+        result: Any = data.get(key, [])
+
+        if not isinstance(result, list):
+            raise ReportLoadError(f"Incorrect format: {key!r} is not a list")
+
+        for elem in result:
+            if not isinstance(elem, str):
+                raise ReportLoadError(f"Incorrect format: {key!r} must contain only strings")
+
+        return result
+
+    @classmethod
+    def load(cls, path: Path) -> "Report":
+        """
+        May raise OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError.
+        """
+
+        with open(path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        if not isinstance(data, dict):
+            raise ReportLoadError("Incorrect format: Root is not an object")
+
+        self = cls()
+        for elem in self._get_list_of_strs(data, "reserved"):
+            self.mark_reserved(PurePath(elem))
+        for elem in self._get_list_of_strs(data, "known"):
+            self.mark(PurePath(elem))
+        for elem in self._get_list_of_strs(data, "added"):
+            self.add_file(PurePath(elem))
+        for elem in self._get_list_of_strs(data, "changed"):
+            self.change_file(PurePath(elem))
+        for elem in self._get_list_of_strs(data, "deleted"):
+            self.delete_file(PurePath(elem))
+
+        return self
+
+    def store(self, path: Path) -> None:
+        """
+        May raise OSError.
+        """
+
+        data = {
+            "reserved": [str(p) for p in sorted(self.reserved_files)],
+            "known": [str(p) for p in sorted(self.known_files)],
+            "added": [str(p) for p in sorted(self.added_files)],
+            "changed": [str(p) for p in sorted(self.changed_files)],
+            "deleted": [str(p) for p in sorted(self.deleted_files)],
+        }
+
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2, sort_keys=True)
+            f.write("\n")  # json.dump doesn't do this
+
     def mark_reserved(self, path: PurePath) -> None:
         self.reserved_files.add(path)
 
@@ -84,7 +145,7 @@ class Report:
         Unlike mark(), this function accepts any paths.
         """
 
-        self.new_files.add(path)
+        self.added_files.add(path)
 
     def change_file(self, path: PurePath) -> None:
         """