Load and store reports

This commit is contained in:
Joscha 2021-05-23 20:46:12 +02:00
parent 6e9f8fd391
commit 6ca0ecdf05
3 changed files with 110 additions and 6 deletions

View File

@ -11,7 +11,7 @@ from ..config import Config, Section
from ..limiter import Limiter
from ..logging import ProgressBar, log
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
from ..report import MarkConflictError, MarkDuplicateError
from ..report import MarkConflictError, MarkDuplicateError, Report
from ..transformer import Transformer
from ..utils import ReusableAsyncContextManager, fmt_path
@ -229,6 +229,14 @@ class Crawler(ABC):
section.on_conflict(),
)
@property
def report(self) -> Report:
return self._output_dir.report
@property
def prev_report(self) -> Optional[Report]:
return self._output_dir.prev_report
@staticmethod
async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
"""
@ -298,8 +306,10 @@ class Crawler(ABC):
with log.show_progress():
self._output_dir.prepare()
self._output_dir.load_prev_report()
await self._run()
await self._cleanup()
self._output_dir.store_report()
@abstractmethod
async def _run(self) -> None:

View File

@ -1,4 +1,5 @@
import filecmp
import json
import os
import random
import shutil
@ -13,7 +14,7 @@ from typing import BinaryIO, Iterator, Optional, Tuple
from rich.markup import escape
from .logging import log
from .report import Report
from .report import Report, ReportLoadError
from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no
SUFFIX_CHARS = string.ascii_lowercase + string.digits
@ -134,6 +135,8 @@ class FileSinkToken(ReusableAsyncContextManager[FileSink]):
class OutputDirectory:
REPORT_FILE = PurePath(".report")
def __init__(
self,
root: Path,
@ -144,7 +147,19 @@ class OutputDirectory:
self._redownload = redownload
self._on_conflict = on_conflict
self._report_path = self.resolve(self.REPORT_FILE)
self._report = Report()
self._prev_report: Optional[Report] = None
self.register_reserved(self.REPORT_FILE)
@property
def report(self) -> Report:
return self._report
@property
def prev_report(self) -> Optional[Report]:
return self._prev_report
def prepare(self) -> None:
log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}")
@ -452,3 +467,21 @@ class OutputDirectory:
self._report.delete_file(pure)
except OSError:
pass
def load_prev_report(self) -> None:
    """
    Try to load the report stored at the report path into prev_report.

    Failing to load is not fatal: the reason is logged and prev_report is
    left unchanged.
    """

    log.explain_topic(f"Loading previous report from {fmt_real_path(self._report_path)}")
    try:
        self._prev_report = Report.load(self._report_path)
        log.explain("Loaded report successfully")
    except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError, so a report file
        # whose bytes cannot be decoded as text must be listed explicitly.
        # Otherwise a damaged report would abort the whole run instead of
        # merely being ignored.
        log.explain("Failed to load report")
        log.explain(str(e))
def store_report(self) -> None:
    """
    Write the current report to the report path.

    An OSError while storing is reported as a warning instead of being
    propagated.
    """

    target = self._report_path
    log.explain_topic(f"Storing report to {fmt_real_path(target)}")
    try:
        self._report.store(target)
        log.explain("Stored report successfully")
    except OSError as error:
        log.warn(f"Failed to save report to {fmt_real_path(target)}")
        log.warn_contd(str(error))

View File

@ -1,5 +1,10 @@
from pathlib import PurePath
from typing import Set
import json
from pathlib import Path, PurePath
from typing import Any, Dict, List, Set
class ReportLoadError(Exception):
    """Raised when a loaded report does not have the expected format."""
class MarkDuplicateError(Exception):
@ -48,10 +53,66 @@ class Report:
self.reserved_files: Set[PurePath] = set()
self.known_files: Set[PurePath] = set()
self.new_files: Set[PurePath] = set()
self.added_files: Set[PurePath] = set()
self.changed_files: Set[PurePath] = set()
self.deleted_files: Set[PurePath] = set()
@staticmethod
def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]:
result: Any = data.get(key, [])
if not isinstance(result, list):
raise ReportLoadError(f"Incorrect format: {key!r} is not a list")
for elem in result:
if not isinstance(elem, str):
raise ReportLoadError(f"Incorrect format: {key!r} must contain only strings")
return result
@classmethod
def load(cls, path: Path) -> "Report":
"""
May raise OSError, JsonDecodeError, ReportLoadError.
"""
with open(path) as f:
data = json.load(f)
if not isinstance(data, dict):
raise ReportLoadError("Incorrect format: Root is not an object")
self = cls()
for elem in self._get_list_of_strs(data, "reserved"):
self.mark_reserved(PurePath(elem))
for elem in self._get_list_of_strs(data, "known"):
self.mark(PurePath(elem))
for elem in self._get_list_of_strs(data, "added"):
self.add_file(PurePath(elem))
for elem in self._get_list_of_strs(data, "changed"):
self.change_file(PurePath(elem))
for elem in self._get_list_of_strs(data, "deleted"):
self.delete_file(PurePath(elem))
return self
def store(self, path: Path) -> None:
"""
May raise OSError.
"""
data = {
"reserved": [str(path) for path in sorted(self.reserved_files)],
"known": [str(path) for path in sorted(self.known_files)],
"added": [str(path) for path in sorted(self.added_files)],
"changed": [str(path) for path in sorted(self.changed_files)],
"deleted": [str(path) for path in sorted(self.deleted_files)],
}
with open(path, "w") as f:
json.dump(data, f, indent=2, sort_keys=True)
f.write("\n") # json.dump doesn't do this
def mark_reserved(self, path: PurePath) -> None:
self.reserved_files.add(path)
@ -84,7 +145,7 @@ class Report:
Unlike mark(), this function accepts any paths.
"""
self.new_files.add(path)
self.added_files.add(path)
def change_file(self, path: PurePath) -> None:
"""