mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Load and store reports
This commit is contained in:
parent
6e9f8fd391
commit
6ca0ecdf05
@ -11,7 +11,7 @@ from ..config import Config, Section
|
|||||||
from ..limiter import Limiter
|
from ..limiter import Limiter
|
||||||
from ..logging import ProgressBar, log
|
from ..logging import ProgressBar, log
|
||||||
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
|
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
|
||||||
from ..report import MarkConflictError, MarkDuplicateError
|
from ..report import MarkConflictError, MarkDuplicateError, Report
|
||||||
from ..transformer import Transformer
|
from ..transformer import Transformer
|
||||||
from ..utils import ReusableAsyncContextManager, fmt_path
|
from ..utils import ReusableAsyncContextManager, fmt_path
|
||||||
|
|
||||||
@ -229,6 +229,14 @@ class Crawler(ABC):
|
|||||||
section.on_conflict(),
|
section.on_conflict(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def report(self) -> Report:
|
||||||
|
return self._output_dir.report
|
||||||
|
|
||||||
|
@property
|
||||||
|
def prev_report(self) -> Optional[Report]:
|
||||||
|
return self._output_dir.prev_report
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
|
async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
|
||||||
"""
|
"""
|
||||||
@ -298,8 +306,10 @@ class Crawler(ABC):
|
|||||||
|
|
||||||
with log.show_progress():
|
with log.show_progress():
|
||||||
self._output_dir.prepare()
|
self._output_dir.prepare()
|
||||||
|
self._output_dir.load_prev_report()
|
||||||
await self._run()
|
await self._run()
|
||||||
await self._cleanup()
|
await self._cleanup()
|
||||||
|
self._output_dir.store_report()
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def _run(self) -> None:
|
async def _run(self) -> None:
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import filecmp
|
import filecmp
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import shutil
|
import shutil
|
||||||
@ -13,7 +14,7 @@ from typing import BinaryIO, Iterator, Optional, Tuple
|
|||||||
from rich.markup import escape
|
from rich.markup import escape
|
||||||
|
|
||||||
from .logging import log
|
from .logging import log
|
||||||
from .report import Report
|
from .report import Report, ReportLoadError
|
||||||
from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no
|
from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no
|
||||||
|
|
||||||
SUFFIX_CHARS = string.ascii_lowercase + string.digits
|
SUFFIX_CHARS = string.ascii_lowercase + string.digits
|
||||||
@ -134,6 +135,8 @@ class FileSinkToken(ReusableAsyncContextManager[FileSink]):
|
|||||||
|
|
||||||
|
|
||||||
class OutputDirectory:
|
class OutputDirectory:
|
||||||
|
REPORT_FILE = PurePath(".report")
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
root: Path,
|
root: Path,
|
||||||
@ -144,7 +147,19 @@ class OutputDirectory:
|
|||||||
self._redownload = redownload
|
self._redownload = redownload
|
||||||
self._on_conflict = on_conflict
|
self._on_conflict = on_conflict
|
||||||
|
|
||||||
|
self._report_path = self.resolve(self.REPORT_FILE)
|
||||||
self._report = Report()
|
self._report = Report()
|
||||||
|
self._prev_report: Optional[Report] = None
|
||||||
|
|
||||||
|
self.register_reserved(self.REPORT_FILE)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def report(self) -> Report:
|
||||||
|
return self._report
|
||||||
|
|
||||||
|
@property
|
||||||
|
def prev_report(self) -> Optional[Report]:
|
||||||
|
return self._prev_report
|
||||||
|
|
||||||
def prepare(self) -> None:
|
def prepare(self) -> None:
|
||||||
log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}")
|
log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}")
|
||||||
@ -452,3 +467,21 @@ class OutputDirectory:
|
|||||||
self._report.delete_file(pure)
|
self._report.delete_file(pure)
|
||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def load_prev_report(self) -> None:
|
||||||
|
log.explain_topic(f"Loading previous report from {fmt_real_path(self._report_path)}")
|
||||||
|
try:
|
||||||
|
self._prev_report = Report.load(self._report_path)
|
||||||
|
log.explain("Loaded report successfully")
|
||||||
|
except (OSError, json.JSONDecodeError, ReportLoadError) as e:
|
||||||
|
log.explain("Failed to load report")
|
||||||
|
log.explain(str(e))
|
||||||
|
|
||||||
|
def store_report(self) -> None:
|
||||||
|
log.explain_topic(f"Storing report to {fmt_real_path(self._report_path)}")
|
||||||
|
try:
|
||||||
|
self._report.store(self._report_path)
|
||||||
|
log.explain("Stored report successfully")
|
||||||
|
except OSError as e:
|
||||||
|
log.warn(f"Failed to save report to {fmt_real_path(self._report_path)}")
|
||||||
|
log.warn_contd(str(e))
|
||||||
|
@ -1,5 +1,10 @@
|
|||||||
from pathlib import PurePath
|
import json
|
||||||
from typing import Set
|
from pathlib import Path, PurePath
|
||||||
|
from typing import Any, Dict, List, Set
|
||||||
|
|
||||||
|
|
||||||
|
class ReportLoadError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class MarkDuplicateError(Exception):
|
class MarkDuplicateError(Exception):
|
||||||
@ -48,10 +53,66 @@ class Report:
|
|||||||
self.reserved_files: Set[PurePath] = set()
|
self.reserved_files: Set[PurePath] = set()
|
||||||
self.known_files: Set[PurePath] = set()
|
self.known_files: Set[PurePath] = set()
|
||||||
|
|
||||||
self.new_files: Set[PurePath] = set()
|
self.added_files: Set[PurePath] = set()
|
||||||
self.changed_files: Set[PurePath] = set()
|
self.changed_files: Set[PurePath] = set()
|
||||||
self.deleted_files: Set[PurePath] = set()
|
self.deleted_files: Set[PurePath] = set()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]:
|
||||||
|
result: Any = data.get(key, [])
|
||||||
|
|
||||||
|
if not isinstance(result, list):
|
||||||
|
raise ReportLoadError(f"Incorrect format: {key!r} is not a list")
|
||||||
|
|
||||||
|
for elem in result:
|
||||||
|
if not isinstance(elem, str):
|
||||||
|
raise ReportLoadError(f"Incorrect format: {key!r} must contain only strings")
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load(cls, path: Path) -> "Report":
|
||||||
|
"""
|
||||||
|
May raise OSError, json.JSONDecodeError, ReportLoadError.
|
||||||
|
"""
|
||||||
|
|
||||||
|
with open(path) as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
raise ReportLoadError("Incorrect format: Root is not an object")
|
||||||
|
|
||||||
|
self = cls()
|
||||||
|
for elem in self._get_list_of_strs(data, "reserved"):
|
||||||
|
self.mark_reserved(PurePath(elem))
|
||||||
|
for elem in self._get_list_of_strs(data, "known"):
|
||||||
|
self.mark(PurePath(elem))
|
||||||
|
for elem in self._get_list_of_strs(data, "added"):
|
||||||
|
self.add_file(PurePath(elem))
|
||||||
|
for elem in self._get_list_of_strs(data, "changed"):
|
||||||
|
self.change_file(PurePath(elem))
|
||||||
|
for elem in self._get_list_of_strs(data, "deleted"):
|
||||||
|
self.delete_file(PurePath(elem))
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def store(self, path: Path) -> None:
|
||||||
|
"""
|
||||||
|
May raise OSError.
|
||||||
|
"""
|
||||||
|
|
||||||
|
data = {
|
||||||
|
"reserved": [str(path) for path in sorted(self.reserved_files)],
|
||||||
|
"known": [str(path) for path in sorted(self.known_files)],
|
||||||
|
"added": [str(path) for path in sorted(self.added_files)],
|
||||||
|
"changed": [str(path) for path in sorted(self.changed_files)],
|
||||||
|
"deleted": [str(path) for path in sorted(self.deleted_files)],
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(path, "w") as f:
|
||||||
|
json.dump(data, f, indent=2, sort_keys=True)
|
||||||
|
f.write("\n") # json.dump doesn't do this
|
||||||
|
|
||||||
def mark_reserved(self, path: PurePath) -> None:
|
def mark_reserved(self, path: PurePath) -> None:
|
||||||
self.reserved_files.add(path)
|
self.reserved_files.add(path)
|
||||||
|
|
||||||
@ -84,7 +145,7 @@ class Report:
|
|||||||
Unlike mark(), this function accepts any paths.
|
Unlike mark(), this function accepts any paths.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self.new_files.add(path)
|
self.added_files.add(path)
|
||||||
|
|
||||||
def change_file(self, path: PurePath) -> None:
|
def change_file(self, path: PurePath) -> None:
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user