mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Use raw paths for --debug-transforms
Previously, --debug-transforms used the already-transformed paths from the previous report, which made it cumbersome to use: you had to remove all transforms and crawl once before getting useful results.
This commit is contained in:
parent
64a2960751
commit
7b062883f6
@ -264,6 +264,7 @@ class Crawler(ABC):
|
|||||||
async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
|
async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
|
||||||
log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
|
log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
|
||||||
path = self._deduplicator.mark(path)
|
path = self._deduplicator.mark(path)
|
||||||
|
self._output_dir.report.found(path)
|
||||||
|
|
||||||
if self._transformer.transform(path) is None:
|
if self._transformer.transform(path) is None:
|
||||||
log.explain("Answer: No")
|
log.explain("Answer: No")
|
||||||
@ -282,6 +283,7 @@ class Crawler(ABC):
|
|||||||
) -> Optional[DownloadToken]:
|
) -> Optional[DownloadToken]:
|
||||||
log.explain_topic(f"Decision: Download {fmt_path(path)}")
|
log.explain_topic(f"Decision: Download {fmt_path(path)}")
|
||||||
path = self._deduplicator.mark(path)
|
path = self._deduplicator.mark(path)
|
||||||
|
self._output_dir.report.found(path)
|
||||||
|
|
||||||
transformed_path = self._transformer.transform(path)
|
transformed_path = self._transformer.transform(path)
|
||||||
if transformed_path is None:
|
if transformed_path is None:
|
||||||
@ -339,7 +341,7 @@ class Crawler(ABC):
|
|||||||
return
|
return
|
||||||
|
|
||||||
seen: Set[PurePath] = set()
|
seen: Set[PurePath] = set()
|
||||||
for known in sorted(self.prev_report.known_files):
|
for known in sorted(self.prev_report.found_paths):
|
||||||
looking_at = list(reversed(known.parents)) + [known]
|
looking_at = list(reversed(known.parents)) + [known]
|
||||||
for path in looking_at:
|
for path in looking_at:
|
||||||
if path in seen:
|
if path in seen:
|
||||||
|
@ -50,12 +50,22 @@ class Report:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
|
# Paths found by the crawler, untransformed
|
||||||
|
self.found_paths: Set[PurePath] = set()
|
||||||
|
|
||||||
|
# Files reserved for metadata files (e. g. the report file or cookies)
|
||||||
|
# that can't be overwritten by user transforms and won't be cleaned up
|
||||||
|
# at the end.
|
||||||
self.reserved_files: Set[PurePath] = set()
|
self.reserved_files: Set[PurePath] = set()
|
||||||
|
|
||||||
|
# Files found by the crawler, transformed. Only includes files that
|
||||||
|
# were downloaded (or a download was attempted)
|
||||||
self.known_files: Set[PurePath] = set()
|
self.known_files: Set[PurePath] = set()
|
||||||
|
|
||||||
self.added_files: Set[PurePath] = set()
|
self.added_files: Set[PurePath] = set()
|
||||||
self.changed_files: Set[PurePath] = set()
|
self.changed_files: Set[PurePath] = set()
|
||||||
self.deleted_files: Set[PurePath] = set()
|
self.deleted_files: Set[PurePath] = set()
|
||||||
|
# Files that should have been deleted by the cleanup but weren't
|
||||||
self.not_deleted_files: Set[PurePath] = set()
|
self.not_deleted_files: Set[PurePath] = set()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -84,6 +94,8 @@ class Report:
|
|||||||
raise ReportLoadError("Incorrect format: Root is not an object")
|
raise ReportLoadError("Incorrect format: Root is not an object")
|
||||||
|
|
||||||
self = cls()
|
self = cls()
|
||||||
|
for elem in self._get_list_of_strs(data, "found"):
|
||||||
|
self.found(PurePath(elem))
|
||||||
for elem in self._get_list_of_strs(data, "reserved"):
|
for elem in self._get_list_of_strs(data, "reserved"):
|
||||||
self.mark_reserved(PurePath(elem))
|
self.mark_reserved(PurePath(elem))
|
||||||
for elem in self._get_list_of_strs(data, "known"):
|
for elem in self._get_list_of_strs(data, "known"):
|
||||||
@ -105,6 +117,7 @@ class Report:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
|
"found": [str(path) for path in sorted(self.found_paths)],
|
||||||
"reserved": [str(path) for path in sorted(self.reserved_files)],
|
"reserved": [str(path) for path in sorted(self.reserved_files)],
|
||||||
"known": [str(path) for path in sorted(self.known_files)],
|
"known": [str(path) for path in sorted(self.known_files)],
|
||||||
"added": [str(path) for path in sorted(self.added_files)],
|
"added": [str(path) for path in sorted(self.added_files)],
|
||||||
@ -117,6 +130,9 @@ class Report:
|
|||||||
json.dump(data, f, indent=2, sort_keys=True)
|
json.dump(data, f, indent=2, sort_keys=True)
|
||||||
f.write("\n") # json.dump doesn't do this
|
f.write("\n") # json.dump doesn't do this
|
||||||
|
|
||||||
|
def found(self, path: PurePath) -> None:
|
||||||
|
self.found_paths.add(path)
|
||||||
|
|
||||||
def mark_reserved(self, path: PurePath) -> None:
|
def mark_reserved(self, path: PurePath) -> None:
|
||||||
if path in self.marked:
|
if path in self.marked:
|
||||||
raise RuntimeError("Trying to reserve an already reserved file")
|
raise RuntimeError("Trying to reserve an already reserved file")
|
||||||
|
Loading…
Reference in New Issue
Block a user