Use raw paths for --debug-transforms

Previously, --debug-transforms used the already-transformed paths from the
previous report, which made it cumbersome to use: you had to remove all
transforms and crawl once before getting useful results. It now uses the raw
(untransformed) paths found by the crawler instead.
This commit is contained in:
Joscha 2021-05-31 12:28:11 +02:00
parent 64a2960751
commit 7b062883f6
2 changed files with 19 additions and 1 deletions

View File

@ -264,6 +264,7 @@ class Crawler(ABC):
async def crawl(self, path: PurePath) -> Optional[CrawlToken]: async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
log.explain_topic(f"Decision: Crawl {fmt_path(path)}") log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
path = self._deduplicator.mark(path) path = self._deduplicator.mark(path)
self._output_dir.report.found(path)
if self._transformer.transform(path) is None: if self._transformer.transform(path) is None:
log.explain("Answer: No") log.explain("Answer: No")
@ -282,6 +283,7 @@ class Crawler(ABC):
) -> Optional[DownloadToken]: ) -> Optional[DownloadToken]:
log.explain_topic(f"Decision: Download {fmt_path(path)}") log.explain_topic(f"Decision: Download {fmt_path(path)}")
path = self._deduplicator.mark(path) path = self._deduplicator.mark(path)
self._output_dir.report.found(path)
transformed_path = self._transformer.transform(path) transformed_path = self._transformer.transform(path)
if transformed_path is None: if transformed_path is None:
@ -339,7 +341,7 @@ class Crawler(ABC):
return return
seen: Set[PurePath] = set() seen: Set[PurePath] = set()
for known in sorted(self.prev_report.known_files): for known in sorted(self.prev_report.found_paths):
looking_at = list(reversed(known.parents)) + [known] looking_at = list(reversed(known.parents)) + [known]
for path in looking_at: for path in looking_at:
if path in seen: if path in seen:

View File

@ -50,12 +50,22 @@ class Report:
""" """
def __init__(self) -> None: def __init__(self) -> None:
# Paths found by the crawler, untransformed
self.found_paths: Set[PurePath] = set()
# Files reserved for metadata files (e. g. the report file or cookies)
# that can't be overwritten by user transforms and won't be cleaned up
# at the end.
self.reserved_files: Set[PurePath] = set() self.reserved_files: Set[PurePath] = set()
# Files found by the crawler, transformed. Only includes files that
# were downloaded (or a download was attempted)
self.known_files: Set[PurePath] = set() self.known_files: Set[PurePath] = set()
self.added_files: Set[PurePath] = set() self.added_files: Set[PurePath] = set()
self.changed_files: Set[PurePath] = set() self.changed_files: Set[PurePath] = set()
self.deleted_files: Set[PurePath] = set() self.deleted_files: Set[PurePath] = set()
# Files that should have been deleted by the cleanup but weren't
self.not_deleted_files: Set[PurePath] = set() self.not_deleted_files: Set[PurePath] = set()
@staticmethod @staticmethod
@ -84,6 +94,8 @@ class Report:
raise ReportLoadError("Incorrect format: Root is not an object") raise ReportLoadError("Incorrect format: Root is not an object")
self = cls() self = cls()
for elem in self._get_list_of_strs(data, "found"):
self.found(PurePath(elem))
for elem in self._get_list_of_strs(data, "reserved"): for elem in self._get_list_of_strs(data, "reserved"):
self.mark_reserved(PurePath(elem)) self.mark_reserved(PurePath(elem))
for elem in self._get_list_of_strs(data, "known"): for elem in self._get_list_of_strs(data, "known"):
@ -105,6 +117,7 @@ class Report:
""" """
data = { data = {
"found": [str(path) for path in sorted(self.found_paths)],
"reserved": [str(path) for path in sorted(self.reserved_files)], "reserved": [str(path) for path in sorted(self.reserved_files)],
"known": [str(path) for path in sorted(self.known_files)], "known": [str(path) for path in sorted(self.known_files)],
"added": [str(path) for path in sorted(self.added_files)], "added": [str(path) for path in sorted(self.added_files)],
@ -117,6 +130,9 @@ class Report:
json.dump(data, f, indent=2, sort_keys=True) json.dump(data, f, indent=2, sort_keys=True)
f.write("\n") # json.dump doesn't do this f.write("\n") # json.dump doesn't do this
def found(self, path: PurePath) -> None:
self.found_paths.add(path)
def mark_reserved(self, path: PurePath) -> None: def mark_reserved(self, path: PurePath) -> None:
if path in self.marked: if path in self.marked:
raise RuntimeError("Trying to reserve an already reserved file") raise RuntimeError("Trying to reserve an already reserved file")