mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Deduplicate path names in crawler
Also rename files so they follow the restrictions for windows file names if we're on windows.
This commit is contained in:
parent
c21ddf225b
commit
bce3dc384d
@ -83,6 +83,8 @@ crawlers:
|
|||||||
- `delay_between_tasks`: Time (in seconds) that the crawler should wait between
|
- `delay_between_tasks`: Time (in seconds) that the crawler should wait between
|
||||||
subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary
|
subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary
|
||||||
load for the crawl target. (Default: 0.0)
|
load for the crawl target. (Default: 0.0)
|
||||||
|
- `windows_paths`: Whether PFERD should find alternative names for paths that
|
||||||
|
are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
|
||||||
|
|
||||||
Some crawlers may also require credentials for authentication. To configure how
|
Some crawlers may also require credentials for authentication. To configure how
|
||||||
the crawler obtains its credentials, the `auth` option is used. It is set to the
|
the crawler obtains its credentials, the `auth` option is used. It is set to the
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
import os
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
@ -8,6 +9,7 @@ from rich.markup import escape
|
|||||||
|
|
||||||
from ..auth import Authenticator
|
from ..auth import Authenticator
|
||||||
from ..config import Config, Section
|
from ..config import Config, Section
|
||||||
|
from ..deduplicator import Deduplicator
|
||||||
from ..limiter import Limiter
|
from ..limiter import Limiter
|
||||||
from ..logging import ProgressBar, log
|
from ..logging import ProgressBar, log
|
||||||
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
|
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
|
||||||
@ -97,6 +99,10 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
|
|||||||
self._limiter = limiter
|
self._limiter = limiter
|
||||||
self._path = path
|
self._path = path
|
||||||
|
|
||||||
|
@property
|
||||||
|
def path(self) -> PurePath:
|
||||||
|
return self._path
|
||||||
|
|
||||||
async def _on_aenter(self) -> ProgressBar:
|
async def _on_aenter(self) -> ProgressBar:
|
||||||
bar_desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(self._path))}"
|
bar_desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(self._path))}"
|
||||||
after_desc = f"[bold cyan]Crawled[/] {escape(fmt_path(self._path))}"
|
after_desc = f"[bold cyan]Crawled[/] {escape(fmt_path(self._path))}"
|
||||||
@ -116,6 +122,10 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
|
|||||||
self._fs_token = fs_token
|
self._fs_token = fs_token
|
||||||
self._path = path
|
self._path = path
|
||||||
|
|
||||||
|
@property
|
||||||
|
def path(self) -> PurePath:
|
||||||
|
return self._path
|
||||||
|
|
||||||
async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
|
async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
|
||||||
bar_desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(self._path))}"
|
bar_desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(self._path))}"
|
||||||
# The "Downloaded ..." message is printed in the output dir, not here
|
# The "Downloaded ..." message is printed in the output dir, not here
|
||||||
@ -195,6 +205,10 @@ class CrawlerSection(Section):
|
|||||||
self.invalid_value("auth", value, "No such auth section exists")
|
self.invalid_value("auth", value, "No such auth section exists")
|
||||||
return auth
|
return auth
|
||||||
|
|
||||||
|
def windows_paths(self) -> bool:
|
||||||
|
on_windows = os.name == "nt"
|
||||||
|
return self.s.getboolean("windows_paths", fallback=on_windows)
|
||||||
|
|
||||||
|
|
||||||
class Crawler(ABC):
|
class Crawler(ABC):
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -221,12 +235,14 @@ class Crawler(ABC):
|
|||||||
task_delay=section.delay_between_tasks(),
|
task_delay=section.delay_between_tasks(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self._deduplicator = Deduplicator(section.windows_paths())
|
||||||
self._transformer = Transformer(section.transform())
|
self._transformer = Transformer(section.transform())
|
||||||
|
|
||||||
self._output_dir = OutputDirectory(
|
self._output_dir = OutputDirectory(
|
||||||
config.default_section.working_dir() / section.output_dir(name),
|
config.default_section.working_dir() / section.output_dir(name),
|
||||||
section.redownload(),
|
section.redownload(),
|
||||||
section.on_conflict(),
|
section.on_conflict(),
|
||||||
|
section.windows_paths(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -258,6 +274,7 @@ class Crawler(ABC):
|
|||||||
|
|
||||||
async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
|
async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
|
||||||
log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
|
log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
|
||||||
|
path = self._deduplicator.mark(path)
|
||||||
|
|
||||||
if self._transformer.transform(path) is None:
|
if self._transformer.transform(path) is None:
|
||||||
log.explain("Answer: No")
|
log.explain("Answer: No")
|
||||||
@ -274,6 +291,7 @@ class Crawler(ABC):
|
|||||||
on_conflict: Optional[OnConflict] = None,
|
on_conflict: Optional[OnConflict] = None,
|
||||||
) -> Optional[DownloadToken]:
|
) -> Optional[DownloadToken]:
|
||||||
log.explain_topic(f"Decision: Download {fmt_path(path)}")
|
log.explain_topic(f"Decision: Download {fmt_path(path)}")
|
||||||
|
path = self._deduplicator.mark(path)
|
||||||
|
|
||||||
transformed_path = self._transformer.transform(path)
|
transformed_path = self._transformer.transform(path)
|
||||||
if transformed_path is None:
|
if transformed_path is None:
|
||||||
|
@ -80,7 +80,7 @@ class LocalCrawler(Crawler):
|
|||||||
))
|
))
|
||||||
|
|
||||||
for child in path.iterdir():
|
for child in path.iterdir():
|
||||||
pure_child = pure / child.name
|
pure_child = cl.path / child.name
|
||||||
tasks.append(self._crawl_path(child, pure_child))
|
tasks.append(self._crawl_path(child, pure_child))
|
||||||
|
|
||||||
await self.gather(tasks)
|
await self.gather(tasks)
|
||||||
|
79
PFERD/deduplicator.py
Normal file
79
PFERD/deduplicator.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
from pathlib import PurePath
|
||||||
|
from typing import Iterator, Set
|
||||||
|
|
||||||
|
from .logging import log
|
||||||
|
from .utils import fmt_path
|
||||||
|
|
||||||
|
|
||||||
|
def name_variants(path: PurePath) -> Iterator[PurePath]:
    """
    Yield an endless sequence of alternative names for a path.

    Each variant lives in the same parent directory and keeps the original
    suffix; a running number (1, 2, 3, ...) is appended to the stem. The number
    is joined with a space if the stem already contains a space, otherwise with
    an underscore.
    """

    parent, stem, suffix = path.parent, path.stem, path.suffix
    joiner = "_" if " " not in stem else " "

    counter = 0
    while True:
        counter += 1
        yield parent / f"{stem}{joiner}{counter}{suffix}"
|
||||||
|
|
||||||
|
|
||||||
|
class Deduplicator:
    """
    Hands out unique paths and, optionally, makes them valid on Windows.

    Every path passed to mark() is remembered together with its parent
    directories. If a path was already handed out before, an unused numbered
    variant is returned instead.
    """

    # Characters that may not appear in a Windows file name. ASCII control
    # characters (0-31) are forbidden as well and are handled separately.
    FORBIDDEN_CHARS = '<>:"/\\|?*'
    _CONTROL_CHARS = "".join(map(chr, range(32)))

    # Reserved device names. Windows matches these case-insensitively and
    # regardless of any extension that follows.
    FORBIDDEN_NAMES = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
    }

    def __init__(self, windows_paths: bool) -> None:
        """
        windows_paths: Whether paths should be rewritten so that they follow
        the Windows file name restrictions before they are deduplicated.
        """

        self._windows_paths = windows_paths

        self._known: Set[PurePath] = set()

    def _add(self, path: PurePath) -> None:
        """
        Remember a path and all of its parent directories as taken.
        """

        self._known.add(path)

        # The last parent is just "."
        for parent in list(path.parents)[:-1]:
            self._known.add(parent)

    def _fixup_element(self, name: str) -> str:
        """
        Turn a single path element into a name that is valid on Windows.

        Forbidden and control characters are replaced by underscores, reserved
        device names get an underscore appended to their base name and names
        ending in a space or a dot get a trailing underscore.
        """

        # For historical reasons, windows paths have some odd restrictions that
        # we're trying to avoid. See:
        # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file

        table = str.maketrans({char: "_" for char in self.FORBIDDEN_CHARS + self._CONTROL_CHARS})
        name = name.translate(table)

        # A reserved name stays reserved no matter its case or what extensions
        # follow it ("nul.txt" and "NUL.tar.gz" are both equivalent to "NUL"),
        # so only the part in front of the first dot is compared, ignoring case.
        base, dot, rest = name.partition(".")
        if base.upper() in self.FORBIDDEN_NAMES:
            name = f"{base}_{dot}{rest}"

        if name.endswith((" ", ".")):
            name += "_"

        return name

    def _fixup_for_windows(self, path: PurePath) -> PurePath:
        """
        Apply _fixup_element() to every element of the path.
        """

        new_path = PurePath(*[self._fixup_element(elem) for elem in path.parts])
        if new_path != path:
            log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility")
        return new_path

    def mark(self, path: PurePath) -> PurePath:
        """
        Reserve a path and return the path that should be used in its place.

        The returned path equals the (possibly windows-fixed) input if it was
        not taken yet. Otherwise it is the first variant produced by
        name_variants() that is still unused.
        """

        if self._windows_paths:
            path = self._fixup_for_windows(path)

        if path not in self._known:
            self._add(path)
            return path

        log.explain(f"Path {fmt_path(path)} is already taken, finding a new name")

        for variant in name_variants(path):
            if variant in self._known:
                log.explain(f"Path {fmt_path(variant)} is taken as well")
                continue

            log.explain(f"Found unused path {fmt_path(variant)}")
            self._add(variant)
            return variant

        # The "name_variants" iterator returns infinitely many paths
        raise RuntimeError("Unreachable")
|
@ -142,8 +142,17 @@ class OutputDirectory:
|
|||||||
root: Path,
|
root: Path,
|
||||||
redownload: Redownload,
|
redownload: Redownload,
|
||||||
on_conflict: OnConflict,
|
on_conflict: OnConflict,
|
||||||
|
windows_paths: bool,
|
||||||
):
|
):
|
||||||
|
if windows_paths:
|
||||||
|
# Windows limits the path length to 260 for some historical reason
|
||||||
|
# If you want longer paths, you will have to add the "\\?\" prefix
|
||||||
|
# in front of your path. See:
|
||||||
|
# https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation
|
||||||
|
self._root = Path("\\\\?\\" + str(root))
|
||||||
|
else:
|
||||||
self._root = root
|
self._root = root
|
||||||
|
|
||||||
self._redownload = redownload
|
self._redownload = redownload
|
||||||
self._on_conflict = on_conflict
|
self._on_conflict = on_conflict
|
||||||
|
|
||||||
@ -181,6 +190,7 @@ class OutputDirectory:
|
|||||||
raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}")
|
raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}")
|
||||||
if "." in path.parts:
|
if "." in path.parts:
|
||||||
raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}")
|
raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}")
|
||||||
|
|
||||||
return self._root / path
|
return self._root / path
|
||||||
|
|
||||||
def _should_download(
|
def _should_download(
|
||||||
|
@ -114,6 +114,9 @@ class Report:
|
|||||||
f.write("\n") # json.dump doesn't do this
|
f.write("\n") # json.dump doesn't do this
|
||||||
|
|
||||||
def mark_reserved(self, path: PurePath) -> None:
|
def mark_reserved(self, path: PurePath) -> None:
|
||||||
|
if path in self.marked:
|
||||||
|
raise RuntimeError("Trying to reserve an already reserved file")
|
||||||
|
|
||||||
self.reserved_files.add(path)
|
self.reserved_files.add(path)
|
||||||
|
|
||||||
def mark(self, path: PurePath) -> None:
|
def mark(self, path: PurePath) -> None:
|
||||||
|
Loading…
Reference in New Issue
Block a user