From bce3dc384d82763f0836c5c236b930fb9d8ce75d Mon Sep 17 00:00:00 2001
From: Joscha
Date: Tue, 25 May 2021 11:58:01 +0200
Subject: [PATCH] Deduplicate path names in crawler

Also rename files so they follow the restrictions for windows file names if
we're on windows.
---
Review notes (everything between the "---" line and the first "diff --git"
line is ignored by `git am`):
* PFERD/deduplicator.py: the reserved-name check upper-cases the stem before
  the lookup, because Windows treats "CON", "con" and "Con.txt" alike.
* PFERD/output_dir.py: the "\\?\" prefix is applied to the absolute root,
  because extended-length paths cannot be relative.

 CONFIG.md                    |  2 +
 PFERD/crawl/crawler.py       | 18 ++++++++
 PFERD/crawl/local_crawler.py |  2 +-
 PFERD/deduplicator.py        | 79 ++++++++++++++++++++++++++++++++++++
 PFERD/output_dir.py          | 12 +++++-
 PFERD/report.py              |  3 ++
 6 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 PFERD/deduplicator.py

diff --git a/CONFIG.md b/CONFIG.md
index 7e8a717..982f4fc 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -83,6 +83,8 @@ crawlers:
 - `delay_between_tasks`: Time (in seconds) that the crawler should wait between
   subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary
   load for the crawl target. (Default: 0.0)
+- `windows_paths`: Whether PFERD should find alternative names for paths that
+  are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
 
 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py
index d6d4abc..8bd29ad 100644
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@@ -1,4 +1,5 @@
 import asyncio
+import os
 from abc import ABC, abstractmethod
 from datetime import datetime
 from pathlib import Path, PurePath
@@ -8,6 +9,7 @@ from rich.markup import escape
 
 from ..auth import Authenticator
 from ..config import Config, Section
+from ..deduplicator import Deduplicator
 from ..limiter import Limiter
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
@@ -97,6 +99,10 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
         self._limiter = limiter
         self._path = path
 
+    @property
+    def path(self) -> PurePath:
+        return self._path
+
     async def _on_aenter(self) -> ProgressBar:
         bar_desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(self._path))}"
         after_desc = f"[bold cyan]Crawled[/] {escape(fmt_path(self._path))}"
@@ -116,6 +122,10 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
         self._fs_token = fs_token
         self._path = path
 
+    @property
+    def path(self) -> PurePath:
+        return self._path
+
     async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
         bar_desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(self._path))}"
         # The "Downloaded ..." message is printed in the output dir, not here
@@ -195,6 +205,10 @@ class CrawlerSection(Section):
             self.invalid_value("auth", value, "No such auth section exists")
         return auth
 
+    def windows_paths(self) -> bool:
+        on_windows = os.name == "nt"
+        return self.s.getboolean("windows_paths", fallback=on_windows)
+
 
 class Crawler(ABC):
     def __init__(
@@ -221,12 +235,14 @@ class Crawler(ABC):
             task_delay=section.delay_between_tasks(),
         )
 
+        self._deduplicator = Deduplicator(section.windows_paths())
         self._transformer = Transformer(section.transform())
 
         self._output_dir = OutputDirectory(
             config.default_section.working_dir() / section.output_dir(name),
             section.redownload(),
             section.on_conflict(),
+            section.windows_paths(),
         )
 
     @property
@@ -258,6 +274,7 @@ class Crawler(ABC):
 
     async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
         log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
+        path = self._deduplicator.mark(path)
 
         if self._transformer.transform(path) is None:
             log.explain("Answer: No")
@@ -274,6 +291,7 @@ class Crawler(ABC):
             on_conflict: Optional[OnConflict] = None,
     ) -> Optional[DownloadToken]:
         log.explain_topic(f"Decision: Download {fmt_path(path)}")
+        path = self._deduplicator.mark(path)
 
         transformed_path = self._transformer.transform(path)
         if transformed_path is None:
diff --git a/PFERD/crawl/local_crawler.py b/PFERD/crawl/local_crawler.py
index 7958169..f102bc9 100644
--- a/PFERD/crawl/local_crawler.py
+++ b/PFERD/crawl/local_crawler.py
@@ -80,7 +80,7 @@ class LocalCrawler(Crawler):
             ))
 
             for child in path.iterdir():
-                pure_child = pure / child.name
+                pure_child = cl.path / child.name
                 tasks.append(self._crawl_path(child, pure_child))
 
         await self.gather(tasks)
diff --git a/PFERD/deduplicator.py b/PFERD/deduplicator.py
new file mode 100644
index 0000000..ef62dcb
--- /dev/null
+++ b/PFERD/deduplicator.py
@@ -0,0 +1,79 @@
+from pathlib import PurePath
+from typing import Iterator, Set
+
+from .logging import log
+from .utils import fmt_path
+
+
+def name_variants(path: PurePath) -> Iterator[PurePath]:
+    separator = " " if " " in path.stem else "_"
+    i = 1
+    while True:
+        yield path.parent / f"{path.stem}{separator}{i}{path.suffix}"
+        i += 1
+
+
+class Deduplicator:
+    FORBIDDEN_CHARS = '<>:"/\\|?*'
+    FORBIDDEN_NAMES = {
+        "CON", "PRN", "AUX", "NUL",
+        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
+        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
+    }
+
+    def __init__(self, windows_paths: bool) -> None:
+        self._windows_paths = windows_paths
+
+        self._known: Set[PurePath] = set()
+
+    def _add(self, path: PurePath) -> None:
+        self._known.add(path)
+
+        # The last parent is just "."
+        for parent in list(path.parents)[:-1]:
+            self._known.add(parent)
+
+    def _fixup_element(self, name: str) -> str:
+        # For historical reasons, windows paths have some odd restrictions that
+        # we're trying to avoid. See:
+        # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file
+
+        for char in self.FORBIDDEN_CHARS:
+            name = name.replace(char, "_")
+
+        path = PurePath(name)
+        if path.stem.upper() in self.FORBIDDEN_NAMES:  # reserved names are case-insensitive
+            name = f"{path.stem}_{path.suffix}"
+
+        if name.endswith(" ") or name.endswith("."):
+            name += "_"
+
+        return name
+
+    def _fixup_for_windows(self, path: PurePath) -> PurePath:
+        new_path = PurePath(*[self._fixup_element(elem) for elem in path.parts])
+        if new_path != path:
+            log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility")
+        return new_path
+
+    def mark(self, path: PurePath) -> PurePath:
+        if self._windows_paths:
+            path = self._fixup_for_windows(path)
+
+        if path not in self._known:
+            self._add(path)
+            return path
+
+        log.explain(f"Path {fmt_path(path)} is already taken, finding a new name")
+
+        for variant in name_variants(path):
+            if variant in self._known:
+                log.explain(f"Path {fmt_path(variant)} is taken as well")
+                continue
+
+            log.explain(f"Found unused path {fmt_path(variant)}")
+            self._add(variant)
+            return variant
+
+        # The "name_variants" iterator returns infinitely many paths
+        raise RuntimeError("Unreachable")
diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py
index 17cb772..5f65316 100644
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -142,8 +142,17 @@ class OutputDirectory:
             root: Path,
             redownload: Redownload,
             on_conflict: OnConflict,
+            windows_paths: bool,
     ):
-        self._root = root
+        if windows_paths:
+            # Windows limits the path length to 260 for some historical reason
+            # If you want longer paths, you will have to add the "\\?\" prefix
+            # in front of your path. See:
+            # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation
+            self._root = Path("\\\\?\\" + str(root.absolute()))
+        else:
+            self._root = root
+
         self._redownload = redownload
         self._on_conflict = on_conflict
 
@@ -181,6 +190,7 @@ class OutputDirectory:
             raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}")
         if "." in path.parts:
             raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}")
+
         return self._root / path
 
     def _should_download(
diff --git a/PFERD/report.py b/PFERD/report.py
index 619c621..4f15237 100644
--- a/PFERD/report.py
+++ b/PFERD/report.py
@@ -114,6 +114,9 @@ class Report:
             f.write("\n")  # json.dump doesn't do this
 
     def mark_reserved(self, path: PurePath) -> None:
+        if path in self.marked:
+            raise RuntimeError("Trying to reserve an already reserved file")
+
         self.reserved_files.add(path)
 
     def mark(self, path: PurePath) -> None: