From bce3dc384d82763f0836c5c236b930fb9d8ce75d Mon Sep 17 00:00:00 2001
From: Joscha <joscha@plugh.de>
Date: Tue, 25 May 2021 11:58:01 +0200
Subject: [PATCH] Deduplicate path names in crawler

Also rename files so they follow the restrictions for Windows file names if
we're on Windows or the `windows_paths` option is enabled.
---
 CONFIG.md                    |  2 +
 PFERD/crawl/crawler.py       | 18 ++++++++
 PFERD/crawl/local_crawler.py |  2 +-
 PFERD/deduplicator.py        | 79 ++++++++++++++++++++++++++++++++++++
 PFERD/output_dir.py          | 12 +++++-
 PFERD/report.py              |  3 ++
 6 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 PFERD/deduplicator.py

diff --git a/CONFIG.md b/CONFIG.md
index 7e8a717..982f4fc 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -83,6 +83,8 @@ crawlers:
 - `delay_between_tasks`: Time (in seconds) that the crawler should wait between
   subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary
   load for the crawl target. (Default: 0.0)
+- `windows_paths`: Whether PFERD should find alternative names for paths that
+  are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
 
 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py
index d6d4abc..8bd29ad 100644
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@@ -1,4 +1,5 @@
 import asyncio
+import os
 from abc import ABC, abstractmethod
 from datetime import datetime
 from pathlib import Path, PurePath
@@ -8,6 +9,7 @@ from rich.markup import escape
 
 from ..auth import Authenticator
 from ..config import Config, Section
+from ..deduplicator import Deduplicator
 from ..limiter import Limiter
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
@@ -97,6 +99,10 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
         self._limiter = limiter
         self._path = path
 
+    @property
+    def path(self) -> PurePath:
+        return self._path  # the path this token was constructed with
+
     async def _on_aenter(self) -> ProgressBar:
         bar_desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(self._path))}"
         after_desc = f"[bold cyan]Crawled[/] {escape(fmt_path(self._path))}"
@@ -116,6 +122,10 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
         self._fs_token = fs_token
         self._path = path
 
+    @property
+    def path(self) -> PurePath:
+        return self._path  # the path this token was constructed with
+
     async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
         bar_desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(self._path))}"
         # The "Downloaded ..." message is printed in the output dir, not here
@@ -195,6 +205,10 @@ class CrawlerSection(Section):
             self.invalid_value("auth", value, "No such auth section exists")
         return auth
 
+    def windows_paths(self) -> bool:
+        """Return the `windows_paths` option; if unset, default to whether os.name is "nt"."""
+        return self.s.getboolean("windows_paths", fallback=(os.name == "nt"))
+
 
 class Crawler(ABC):
     def __init__(
@@ -221,12 +235,14 @@ class Crawler(ABC):
             task_delay=section.delay_between_tasks(),
         )
 
+        self._deduplicator = Deduplicator(section.windows_paths())
         self._transformer = Transformer(section.transform())
 
         self._output_dir = OutputDirectory(
             config.default_section.working_dir() / section.output_dir(name),
             section.redownload(),
             section.on_conflict(),
+            section.windows_paths(),
         )
 
     @property
@@ -258,6 +274,7 @@ class Crawler(ABC):
 
     async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
         log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
+        path = self._deduplicator.mark(path)
 
         if self._transformer.transform(path) is None:
             log.explain("Answer: No")
@@ -274,6 +291,7 @@ class Crawler(ABC):
             on_conflict: Optional[OnConflict] = None,
     ) -> Optional[DownloadToken]:
         log.explain_topic(f"Decision: Download {fmt_path(path)}")
+        path = self._deduplicator.mark(path)
 
         transformed_path = self._transformer.transform(path)
         if transformed_path is None:
diff --git a/PFERD/crawl/local_crawler.py b/PFERD/crawl/local_crawler.py
index 7958169..f102bc9 100644
--- a/PFERD/crawl/local_crawler.py
+++ b/PFERD/crawl/local_crawler.py
@@ -80,7 +80,7 @@ class LocalCrawler(Crawler):
             ))
 
             for child in path.iterdir():
-                pure_child = pure / child.name
+                pure_child = cl.path / child.name
                 tasks.append(self._crawl_path(child, pure_child))
 
         await self.gather(tasks)
diff --git a/PFERD/deduplicator.py b/PFERD/deduplicator.py
new file mode 100644
index 0000000..ef62dcb
--- /dev/null
+++ b/PFERD/deduplicator.py
@@ -0,0 +1,79 @@
+from pathlib import PurePath
+from typing import Iterator, Set
+
+from .logging import log
+from .utils import fmt_path
+
+
+def name_variants(path: PurePath) -> Iterator[PurePath]:
+    """Yield "<stem><sep><N><suffix>" siblings of path forever (N = 1, 2, ...; sep is " " or "_")."""
+    sep, counter = (" " if " " in path.stem else "_"), 0
+    while True:
+        counter += 1
+        yield path.parent / f"{path.stem}{sep}{counter}{path.suffix}"
+
+
+class Deduplicator:
+    """Hands out unique (and optionally Windows-safe) names for crawled paths."""
+
+    FORBIDDEN_CHARS = '<>:"/\\|?*'
+    FORBIDDEN_NAMES = {
+        "CON", "PRN", "AUX", "NUL",
+        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
+        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
+    }
+
+    def __init__(self, windows_paths: bool) -> None:
+        self._windows_paths = windows_paths  # whether mark() sanitizes names first
+        self._known: Set[PurePath] = set()  # every path handed out, plus its parents
+
+    def _add(self, path: PurePath) -> None:
+        """Record path and all of its parent directories as taken."""
+        self._known.add(path)
+        # The last parent is just "."
+        for parent in list(path.parents)[:-1]:
+            self._known.add(parent)
+
+    def _fixup_element(self, name: str) -> str:
+        """Replace the characters and reserved names Windows forbids in a path element."""
+        # For historical reasons, windows paths have some odd restrictions that
+        # we're trying to avoid. See:
+        # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file
+        # Control characters (code points below 32) are forbidden as well.
+        name = "".join("_" if c in self.FORBIDDEN_CHARS or ord(c) < 32 else c for c in name)
+
+        # Reserved device names are matched case-insensitively and regardless of
+        # any extension(s), so "nul.tar.gz" is just as unusable as "NUL".
+        base, dot, rest = name.partition(".")
+        if base.upper() in self.FORBIDDEN_NAMES:
+            name = f"{base}_{dot}{rest}"
+
+        if name.endswith((" ", ".")):
+            name += "_"
+        return name
+
+    def _fixup_for_windows(self, path: PurePath) -> PurePath:
+        """Apply _fixup_element to every part of path, explaining any change."""
+        new_path = PurePath(*[self._fixup_element(elem) for elem in path.parts])
+        if new_path != path:
+            log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility")
+        return new_path
+
+    def mark(self, path: PurePath) -> PurePath:
+        """Reserve path, or a numbered variant if it is taken, and return the result."""
+        if self._windows_paths:
+            path = self._fixup_for_windows(path)
+        if path not in self._known:
+            self._add(path)
+            return path
+
+        log.explain(f"Path {fmt_path(path)} is already taken, finding a new name")
+        for variant in name_variants(path):
+            if variant in self._known:
+                log.explain(f"Path {fmt_path(variant)} is taken as well")
+                continue
+            log.explain(f"Found unused path {fmt_path(variant)}")
+            self._add(variant)
+            return variant
+        # The "name_variants" iterator returns infinitely many paths
+        raise RuntimeError("Unreachable")
diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py
index 17cb772..5f65316 100644
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -142,8 +142,17 @@ class OutputDirectory:
             root: Path,
             redownload: Redownload,
             on_conflict: OnConflict,
+            windows_paths: bool,
     ):
-        self._root = root
+        if windows_paths:
+            # Windows limits the path length to 260 for some historical reason.
+            # Longer paths need the "\\?\" prefix, which only works in front of an
+            # absolute path. NOTE(review): root is prefixed as given - confirm. See:
+            # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation
+            self._root = Path("\\\\?\\" + str(root))
+        else:
+            self._root = root
+
         self._redownload = redownload
         self._on_conflict = on_conflict
 
@@ -181,6 +190,7 @@ class OutputDirectory:
             raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}")
         if "." in path.parts:
             raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}")
+
         return self._root / path
 
     def _should_download(
diff --git a/PFERD/report.py b/PFERD/report.py
index 619c621..4f15237 100644
--- a/PFERD/report.py
+++ b/PFERD/report.py
@@ -114,6 +114,9 @@ class Report:
             f.write("\n")  # json.dump doesn't do this
 
     def mark_reserved(self, path: PurePath) -> None:
+        if path in self.marked:  # NOTE(review): confirm `marked` includes reserved_files
+            raise RuntimeError("Trying to reserve an already reserved file")
+
         self.reserved_files.add(path)
 
     def mark(self, path: PurePath) -> None: