Deduplicate path names in crawler

Also rename files so they follow the restrictions for Windows file names if
we're on Windows.
Joscha 2021-05-25 11:58:01 +02:00
parent c21ddf225b
commit bce3dc384d
6 changed files with 114 additions and 2 deletions
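As a rough illustration of the renaming mentioned above, here is what the fixups in the new PFERD/deduplicator.py (shown in full further down) produce for a few Windows-hostile names; the paths themselves are made up:

```python
from pathlib import PurePath

from PFERD.deduplicator import Deduplicator  # module added by this commit

dedup = Deduplicator(windows_paths=True)

dedup.mark(PurePath('notes/a<b>:c.txt'))   # notes/a_b__c.txt  (forbidden characters become "_")
dedup.mark(PurePath("notes/CON.pdf"))      # notes/CON_.pdf    (reserved device name)
dedup.mark(PurePath("notes/draft."))       # notes/draft._     (names may not end with "." or " ")
```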

View File

@ -83,6 +83,8 @@ crawlers:
- `delay_between_tasks`: Time (in seconds) that the crawler should wait between
subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary
load for the crawl target. (Default: 0.0)
- `windows_paths`: Whether PFERD should find alternative names for paths that
are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
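For example, the option can be set explicitly in a crawler's config section; the section name below is only a placeholder, and `windows_paths` is the only option documented here:

```ini
[crawl:example]
# ... other crawler options ...
windows_paths = yes
```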
Some crawlers may also require credentials for authentication. To configure how
the crawler obtains its credentials, the `auth` option is used. It is set to the

View File

@ -1,4 +1,5 @@
import asyncio
import os
from abc import ABC, abstractmethod
from datetime import datetime
from pathlib import Path, PurePath
@ -8,6 +9,7 @@ from rich.markup import escape
from ..auth import Authenticator
from ..config import Config, Section
from ..deduplicator import Deduplicator
from ..limiter import Limiter
from ..logging import ProgressBar, log
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
@ -97,6 +99,10 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
        self._limiter = limiter
        self._path = path

    @property
    def path(self) -> PurePath:
        return self._path

    async def _on_aenter(self) -> ProgressBar:
        bar_desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(self._path))}"
        after_desc = f"[bold cyan]Crawled[/] {escape(fmt_path(self._path))}"
@ -116,6 +122,10 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
        self._fs_token = fs_token
        self._path = path

    @property
    def path(self) -> PurePath:
        return self._path

    async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
        bar_desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(self._path))}"
        # The "Downloaded ..." message is printed in the output dir, not here
@ -195,6 +205,10 @@ class CrawlerSection(Section):
self.invalid_value("auth", value, "No such auth section exists") self.invalid_value("auth", value, "No such auth section exists")
return auth return auth
def windows_paths(self) -> bool:
on_windows = os.name == "nt"
return self.s.getboolean("windows_paths", fallback=on_windows)
class Crawler(ABC): class Crawler(ABC):
def __init__( def __init__(
@ -221,12 +235,14 @@ class Crawler(ABC):
            task_delay=section.delay_between_tasks(),
        )

        self._deduplicator = Deduplicator(section.windows_paths())
        self._transformer = Transformer(section.transform())

        self._output_dir = OutputDirectory(
            config.default_section.working_dir() / section.output_dir(name),
            section.redownload(),
            section.on_conflict(),
            section.windows_paths(),
        )

    @property
@ -258,6 +274,7 @@ class Crawler(ABC):
    async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
        log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
        path = self._deduplicator.mark(path)

        if self._transformer.transform(path) is None:
            log.explain("Answer: No")
@ -274,6 +291,7 @@ class Crawler(ABC):
            on_conflict: Optional[OnConflict] = None,
    ) -> Optional[DownloadToken]:
        log.explain_topic(f"Decision: Download {fmt_path(path)}")
        path = self._deduplicator.mark(path)

        transformed_path = self._transformer.transform(path)
        if transformed_path is None:

View File

@ -80,7 +80,7 @@ class LocalCrawler(Crawler):
            ))

            for child in path.iterdir():
                pure_child = cl.path / child.name
                tasks.append(self._crawl_path(child, pure_child))

        await self.gather(tasks)
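The switch to `cl.path` matters because the deduplicator may have adjusted the directory's name; deriving children from the token's path keeps them under the adjusted directory. A small self-contained sketch of the effect, using the new Deduplicator directly (the paths are made up for illustration):

```python
from pathlib import PurePath

from PFERD.deduplicator import Deduplicator

dedup = Deduplicator(windows_paths=True)

requested = PurePath("course/COM1")   # "COM1" is a reserved device name on Windows
marked = dedup.mark(requested)        # course/COM1_

# Children must be built from the marked path (what cl.path exposes),
# not from the originally requested one:
child = marked / "exercise.pdf"       # course/COM1_/exercise.pdf
```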

PFERD/deduplicator.py (new file, 79 lines)
View File

@ -0,0 +1,79 @@
from pathlib import PurePath
from typing import Iterator, Set

from .logging import log
from .utils import fmt_path


def name_variants(path: PurePath) -> Iterator[PurePath]:
    separator = " " if " " in path.stem else "_"

    i = 1
    while True:
        yield path.parent / f"{path.stem}{separator}{i}{path.suffix}"
        i += 1


class Deduplicator:
    FORBIDDEN_CHARS = '<>:"/\\|?*'
    FORBIDDEN_NAMES = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
    }

    def __init__(self, windows_paths: bool) -> None:
        self._windows_paths = windows_paths

        self._known: Set[PurePath] = set()

    def _add(self, path: PurePath) -> None:
        self._known.add(path)

        # The last parent is just "."
        for parent in list(path.parents)[:-1]:
            self._known.add(parent)

    def _fixup_element(self, name: str) -> str:
        # For historical reasons, windows paths have some odd restrictions that
        # we're trying to avoid. See:
        # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file
        for char in self.FORBIDDEN_CHARS:
            name = name.replace(char, "_")

        path = PurePath(name)
        if path.stem in self.FORBIDDEN_NAMES:
            name = f"{path.stem}_{path.suffix}"

        if name.endswith(" ") or name.endswith("."):
            name += "_"

        return name

    def _fixup_for_windows(self, path: PurePath) -> PurePath:
        new_path = PurePath(*[self._fixup_element(elem) for elem in path.parts])
        if new_path != path:
            log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility")
        return new_path

    def mark(self, path: PurePath) -> PurePath:
        if self._windows_paths:
            path = self._fixup_for_windows(path)

        if path not in self._known:
            self._add(path)
            return path

        log.explain(f"Path {fmt_path(path)} is already taken, finding a new name")

        for variant in name_variants(path):
            if variant in self._known:
                log.explain(f"Path {fmt_path(variant)} is taken as well")
                continue

            log.explain(f"Found unused path {fmt_path(variant)}")
            self._add(variant)
            return variant

        # The "name_variants" iterator returns infinitely many paths
        raise RuntimeError("Unreachable")

View File

@ -142,8 +142,17 @@ class OutputDirectory:
            root: Path,
            redownload: Redownload,
            on_conflict: OnConflict,
            windows_paths: bool,
    ):
        if windows_paths:
            # Windows limits the path length to 260 for some historical reason
            # If you want longer paths, you will have to add the "\\?\" prefix
            # in front of your path. See:
            # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation
            self._root = Path("\\\\?\\" + str(root))
        else:
            self._root = root

        self._redownload = redownload
        self._on_conflict = on_conflict
@ -181,6 +190,7 @@ class OutputDirectory:
raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}") raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}")
if "." in path.parts: if "." in path.parts:
raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}") raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}")
return self._root / path return self._root / path
def _should_download( def _should_download(
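The prefix handling above can be seen in isolation; a small sketch using PureWindowsPath so it also runs on other platforms, with a made-up directory:

```python
from pathlib import PureWindowsPath

root = PureWindowsPath(r"C:\Users\me\Downloads\PFERD")  # hypothetical output directory

# "\\?\" switches Windows to extended-length paths, lifting the 260-character
# MAX_PATH limit mentioned in the comment above.
extended = PureWindowsPath("\\\\?\\" + str(root))
print(extended)  # \\?\C:\Users\me\Downloads\PFERD
```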

View File

@ -114,6 +114,9 @@ class Report:
f.write("\n") # json.dump doesn't do this f.write("\n") # json.dump doesn't do this
def mark_reserved(self, path: PurePath) -> None: def mark_reserved(self, path: PurePath) -> None:
if path in self.marked:
raise RuntimeError("Trying to reserve an already reserved file")
self.reserved_files.add(path) self.reserved_files.add(path)
def mark(self, path: PurePath) -> None: def mark(self, path: PurePath) -> None: