Load and store reports

This commit is contained in:
Joscha 2021-05-23 20:46:12 +02:00
parent 6e9f8fd391
commit 6ca0ecdf05
3 changed files with 110 additions and 6 deletions

View File

@ -11,7 +11,7 @@ from ..config import Config, Section
from ..limiter import Limiter from ..limiter import Limiter
from ..logging import ProgressBar, log from ..logging import ProgressBar, log
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
from ..report import MarkConflictError, MarkDuplicateError from ..report import MarkConflictError, MarkDuplicateError, Report
from ..transformer import Transformer from ..transformer import Transformer
from ..utils import ReusableAsyncContextManager, fmt_path from ..utils import ReusableAsyncContextManager, fmt_path
@ -229,6 +229,14 @@ class Crawler(ABC):
section.on_conflict(), section.on_conflict(),
) )
@property
def report(self) -> Report:
    """
    The report of this crawler's output directory for the current run.
    """

    output_dir = self._output_dir
    return output_dir.report
@property
def prev_report(self) -> Optional[Report]:
    """
    The previous report of this crawler's output directory, or None if the
    output directory has none.
    """

    output_dir = self._output_dir
    return output_dir.prev_report
@staticmethod @staticmethod
async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]: async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
""" """
@ -298,8 +306,10 @@ class Crawler(ABC):
with log.show_progress(): with log.show_progress():
self._output_dir.prepare() self._output_dir.prepare()
self._output_dir.load_prev_report()
await self._run() await self._run()
await self._cleanup() await self._cleanup()
self._output_dir.store_report()
@abstractmethod @abstractmethod
async def _run(self) -> None: async def _run(self) -> None:

View File

@ -1,4 +1,5 @@
import filecmp import filecmp
import json
import os import os
import random import random
import shutil import shutil
@ -13,7 +14,7 @@ from typing import BinaryIO, Iterator, Optional, Tuple
from rich.markup import escape from rich.markup import escape
from .logging import log from .logging import log
from .report import Report from .report import Report, ReportLoadError
from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no
SUFFIX_CHARS = string.ascii_lowercase + string.digits SUFFIX_CHARS = string.ascii_lowercase + string.digits
@ -134,6 +135,8 @@ class FileSinkToken(ReusableAsyncContextManager[FileSink]):
class OutputDirectory: class OutputDirectory:
REPORT_FILE = PurePath(".report")
def __init__( def __init__(
self, self,
root: Path, root: Path,
@ -144,7 +147,19 @@ class OutputDirectory:
self._redownload = redownload self._redownload = redownload
self._on_conflict = on_conflict self._on_conflict = on_conflict
self._report_path = self.resolve(self.REPORT_FILE)
self._report = Report() self._report = Report()
self._prev_report: Optional[Report] = None
self.register_reserved(self.REPORT_FILE)
@property
def report(self) -> Report:
    """
    The report that is being filled in for this output directory.
    """

    current = self._report
    return current
@property
def prev_report(self) -> Optional[Report]:
    """
    The report loaded by load_prev_report(), or None if no report has been
    loaded successfully.
    """

    previous = self._prev_report
    return previous
def prepare(self) -> None: def prepare(self) -> None:
log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}") log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}")
@ -452,3 +467,21 @@ class OutputDirectory:
self._report.delete_file(pure) self._report.delete_file(pure)
except OSError: except OSError:
pass pass
def load_prev_report(self) -> None:
    """
    Try to load the previous report from the report file.

    On success, the loaded report becomes available via prev_report. If the
    report file can't be read or has an unexpected format, the failure is only
    explained in the log and prev_report is left unchanged.
    """

    log.explain_topic(f"Loading previous report from {fmt_real_path(self._report_path)}")
    try:
        self._prev_report = Report.load(self._report_path)
        log.explain("Loaded report successfully")
    except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e:
        # UnicodeDecodeError is listed explicitly: reading a report file that
        # is not valid text raises it, and as a ValueError it is covered by
        # neither OSError nor json.JSONDecodeError. A broken report must not
        # abort the whole run.
        log.explain("Failed to load report")
        log.explain(str(e))
def store_report(self) -> None:
    """
    Write the current report to the report file.

    An OSError raised while storing is not propagated, it is only logged as a
    warning.
    """

    destination = self._report_path
    log.explain_topic(f"Storing report to {fmt_real_path(destination)}")
    try:
        self._report.store(destination)
        log.explain("Stored report successfully")
    except OSError as error:
        log.warn(f"Failed to save report to {fmt_real_path(destination)}")
        log.warn_contd(str(error))

View File

@ -1,5 +1,10 @@
from pathlib import PurePath import json
from typing import Set from pathlib import Path, PurePath
from typing import Any, Dict, List, Set
class ReportLoadError(Exception):
    """
    Raised when the contents of a report file don't have the expected format.
    """
class MarkDuplicateError(Exception): class MarkDuplicateError(Exception):
@ -48,10 +53,66 @@ class Report:
self.reserved_files: Set[PurePath] = set() self.reserved_files: Set[PurePath] = set()
self.known_files: Set[PurePath] = set() self.known_files: Set[PurePath] = set()
self.new_files: Set[PurePath] = set() self.added_files: Set[PurePath] = set()
self.changed_files: Set[PurePath] = set() self.changed_files: Set[PurePath] = set()
self.deleted_files: Set[PurePath] = set() self.deleted_files: Set[PurePath] = set()
@staticmethod
def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]:
    """
    Return the list of strings stored under key in data. A missing key is
    treated like an empty list.

    May raise ReportLoadError.
    """

    value: Any = data.get(key, [])

    if not isinstance(value, list):
        raise ReportLoadError(f"Incorrect format: {key!r} is not a list")

    if not all(isinstance(item, str) for item in value):
        raise ReportLoadError(f"Incorrect format: {key!r} must contain only strings")

    return value
@classmethod
def load(cls, path: Path) -> "Report":
    """
    Load a report from the JSON file at path.

    The root of the file must be an object. Its "reserved", "known", "added",
    "changed" and "deleted" entries must be lists of strings; missing entries
    are treated as empty lists.

    May raise OSError, json.JSONDecodeError, ReportLoadError.
    """

    try:
        # Decode as UTF-8 explicitly instead of relying on the platform's
        # locale encoding, so the file is read the same way on every system.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except UnicodeDecodeError as e:
        # Reported as a format error so that callers which only handle the
        # documented exceptions don't crash on a corrupt file.
        raise ReportLoadError(f"Incorrect format: File is not valid UTF-8 ({e})") from e

    if not isinstance(data, dict):
        raise ReportLoadError("Incorrect format: Root is not an object")

    report = cls()
    try:
        for elem in report._get_list_of_strs(data, "reserved"):
            report.mark_reserved(PurePath(elem))
        for elem in report._get_list_of_strs(data, "known"):
            report.mark(PurePath(elem))
        for elem in report._get_list_of_strs(data, "added"):
            report.add_file(PurePath(elem))
        for elem in report._get_list_of_strs(data, "changed"):
            report.change_file(PurePath(elem))
        for elem in report._get_list_of_strs(data, "deleted"):
            report.delete_file(PurePath(elem))
    except (MarkDuplicateError, MarkConflictError) as e:
        # NOTE(review): presumably mark() raises these for duplicate or
        # conflicting paths, which a hand-edited or corrupt report file could
        # contain - confirm against mark(). Either way, they are translated so
        # that only the documented exceptions leave this function.
        raise ReportLoadError(f"Incorrect format: Inconsistent file entries ({e})") from e

    return report
def store(self, path: Path) -> None:
    """
    Write this report as JSON to the file at path, overwriting any existing
    file.

    The output is an object with the keys "reserved", "known", "added",
    "changed" and "deleted", each holding a sorted list of path strings.

    May raise OSError.
    """

    def as_strs(files: Set[PurePath]) -> List[str]:
        # Sorted so that the resulting file is deterministic. The loop
        # variable deliberately does not reuse the name of the path parameter.
        return [str(file) for file in sorted(files)]

    data = {
        "reserved": as_strs(self.reserved_files),
        "known": as_strs(self.known_files),
        "added": as_strs(self.added_files),
        "changed": as_strs(self.changed_files),
        "deleted": as_strs(self.deleted_files),
    }

    # Encode as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, so the file looks the same on every system.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, sort_keys=True)
        f.write("\n")  # json.dump doesn't do this
def mark_reserved(self, path: PurePath) -> None: def mark_reserved(self, path: PurePath) -> None:
self.reserved_files.add(path) self.reserved_files.add(path)
@ -84,7 +145,7 @@ class Report:
Unlike mark(), this function accepts any paths. Unlike mark(), this function accepts any paths.
""" """
self.new_files.add(path) self.added_files.add(path)
def change_file(self, path: PurePath) -> None: def change_file(self, path: PurePath) -> None:
""" """