mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-10-31 21:02:42 +01:00 
			
		
		
		
	Use raw paths for --debug-transforms
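Previously, --debug-transforms used the already-transformed paths from the previous report, which made it cumbersome to use: you had to remove all transforms and crawl once before getting useful results. The report now also records the raw (untransformed) paths found by the crawler, and --debug-transforms uses those instead.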
This commit is contained in:
		| @@ -264,6 +264,7 @@ class Crawler(ABC): | ||||
|     async def crawl(self, path: PurePath) -> Optional[CrawlToken]: | ||||
|         log.explain_topic(f"Decision: Crawl {fmt_path(path)}") | ||||
|         path = self._deduplicator.mark(path) | ||||
|         self._output_dir.report.found(path) | ||||
|  | ||||
|         if self._transformer.transform(path) is None: | ||||
|             log.explain("Answer: No") | ||||
| @@ -282,6 +283,7 @@ class Crawler(ABC): | ||||
|     ) -> Optional[DownloadToken]: | ||||
|         log.explain_topic(f"Decision: Download {fmt_path(path)}") | ||||
|         path = self._deduplicator.mark(path) | ||||
|         self._output_dir.report.found(path) | ||||
|  | ||||
|         transformed_path = self._transformer.transform(path) | ||||
|         if transformed_path is None: | ||||
| @@ -339,7 +341,7 @@ class Crawler(ABC): | ||||
|             return | ||||
|  | ||||
|         seen: Set[PurePath] = set() | ||||
|-        for known in sorted(self.prev_report.known_files): | ||||
|+        for known in sorted(self.prev_report.found_paths): | ||||
|             looking_at = list(reversed(known.parents)) + [known] | ||||
|             for path in looking_at: | ||||
|                 if path in seen: | ||||
|   | ||||
| @@ -50,12 +50,22 @@ class Report: | ||||
|     """ | ||||
|  | ||||
|     def __init__(self) -> None: | ||||
|         # Paths found by the crawler, untransformed | ||||
|         self.found_paths: Set[PurePath] = set() | ||||
|  | ||||
|         # Files reserved for metadata files (e. g. the report file or cookies) | ||||
|         # that can't be overwritten by user transforms and won't be cleaned up | ||||
|         # at the end. | ||||
|         self.reserved_files: Set[PurePath] = set() | ||||
|  | ||||
|         # Files found by the crawler, transformed. Only includes files that | ||||
|         # were downloaded (or a download was attempted) | ||||
|         self.known_files: Set[PurePath] = set() | ||||
|  | ||||
|         self.added_files: Set[PurePath] = set() | ||||
|         self.changed_files: Set[PurePath] = set() | ||||
|         self.deleted_files: Set[PurePath] = set() | ||||
|         # Files that should have been deleted by the cleanup but weren't | ||||
|         self.not_deleted_files: Set[PurePath] = set() | ||||
|  | ||||
|     @staticmethod | ||||
| @@ -84,6 +94,8 @@ class Report: | ||||
|             raise ReportLoadError("Incorrect format: Root is not an object") | ||||
|  | ||||
|         self = cls() | ||||
|         for elem in self._get_list_of_strs(data, "found"): | ||||
|             self.found(PurePath(elem)) | ||||
|         for elem in self._get_list_of_strs(data, "reserved"): | ||||
|             self.mark_reserved(PurePath(elem)) | ||||
|         for elem in self._get_list_of_strs(data, "known"): | ||||
| @@ -105,6 +117,7 @@ class Report: | ||||
|         """ | ||||
|  | ||||
|         data = { | ||||
|             "found": [str(path) for path in sorted(self.found_paths)], | ||||
|             "reserved": [str(path) for path in sorted(self.reserved_files)], | ||||
|             "known": [str(path) for path in sorted(self.known_files)], | ||||
|             "added": [str(path) for path in sorted(self.added_files)], | ||||
| @@ -117,6 +130,9 @@ class Report: | ||||
|             json.dump(data, f, indent=2, sort_keys=True) | ||||
|             f.write("\n")  # json.dump doesn't do this | ||||
|  | ||||
|     def found(self, path: PurePath) -> None: | ||||
|         self.found_paths.add(path) | ||||
|  | ||||
|     def mark_reserved(self, path: PurePath) -> None: | ||||
|         if path in self.marked: | ||||
|             raise RuntimeError("Trying to reserve an already reserved file") | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Joscha
					Joscha