From 7b062883f619238b9992834c39484e9973a172f9 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Mon, 31 May 2021 12:28:11 +0200
Subject: [PATCH] Use raw paths for --debug-transforms

Previously, the already-transformed paths were used, which meant that
--debug-transforms was cumbersome to use (as you had to remove all
transforms and crawl once before getting useful results).
---
 PFERD/crawl/crawler.py |  4 +++-
 PFERD/report.py        | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py
index ce69967..e990f16 100644
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@@ -264,6 +264,7 @@ class Crawler(ABC):
     async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
         log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
         path = self._deduplicator.mark(path)
+        self._output_dir.report.found(path)
 
         if self._transformer.transform(path) is None:
             log.explain("Answer: No")
@@ -282,6 +283,7 @@
     ) -> Optional[DownloadToken]:
         log.explain_topic(f"Decision: Download {fmt_path(path)}")
         path = self._deduplicator.mark(path)
+        self._output_dir.report.found(path)
 
         transformed_path = self._transformer.transform(path)
         if transformed_path is None:
@@ -339,7 +341,7 @@
             return
 
         seen: Set[PurePath] = set()
-        for known in sorted(self.prev_report.known_files):
+        for known in sorted(self.prev_report.found_paths):
             looking_at = list(reversed(known.parents)) + [known]
             for path in looking_at:
                 if path in seen:
diff --git a/PFERD/report.py b/PFERD/report.py
index b47490f..919bb35 100644
--- a/PFERD/report.py
+++ b/PFERD/report.py
@@ -50,12 +50,22 @@
     """
 
     def __init__(self) -> None:
+        # Paths found by the crawler, untransformed
+        self.found_paths: Set[PurePath] = set()
+
+        # Files reserved for metadata files (e.g. the report file or cookies)
+        # that can't be overwritten by user transforms and won't be cleaned up
+        # at the end.
         self.reserved_files: Set[PurePath] = set()
+
+        # Files found by the crawler, transformed. Only includes files that
+        # were downloaded (or a download was attempted)
         self.known_files: Set[PurePath] = set()
 
         self.added_files: Set[PurePath] = set()
         self.changed_files: Set[PurePath] = set()
         self.deleted_files: Set[PurePath] = set()
+        # Files that should have been deleted by the cleanup but weren't
         self.not_deleted_files: Set[PurePath] = set()
 
     @staticmethod
@@ -84,6 +94,8 @@
             raise ReportLoadError("Incorrect format: Root is not an object")
 
         self = cls()
+        for elem in self._get_list_of_strs(data, "found"):
+            self.found(PurePath(elem))
         for elem in self._get_list_of_strs(data, "reserved"):
             self.mark_reserved(PurePath(elem))
         for elem in self._get_list_of_strs(data, "known"):
@@ -105,6 +117,7 @@
         """
 
         data = {
+            "found": [str(path) for path in sorted(self.found_paths)],
             "reserved": [str(path) for path in sorted(self.reserved_files)],
             "known": [str(path) for path in sorted(self.known_files)],
             "added": [str(path) for path in sorted(self.added_files)],
@@ -117,6 +130,9 @@
             json.dump(data, f, indent=2, sort_keys=True)
             f.write("\n")  # json.dump doesn't do this
 
+    def found(self, path: PurePath) -> None:
+        self.found_paths.add(path)
+
     def mark_reserved(self, path: PurePath) -> None:
         if path in self.marked:
             raise RuntimeError("Trying to reserve an already reserved file")