mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Add local file crawler
This commit is contained in:
parent
273d56c39a
commit
60cd9873bc
@ -14,6 +14,9 @@ class ProgressBar:
|
|||||||
def advance(self, amount: float = 1) -> None:
|
def advance(self, amount: float = 1) -> None:
|
||||||
self._progress.advance(self._taskid, advance=amount)
|
self._progress.advance(self._taskid, advance=amount)
|
||||||
|
|
||||||
|
def set_total(self, total) -> None:
|
||||||
|
self._progress.update(self._taskid, total=total)
|
||||||
|
|
||||||
|
|
||||||
class TerminalConductor:
|
class TerminalConductor:
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
|
@ -1,16 +1,17 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
|
from datetime import datetime
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
|
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
|
||||||
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
|
from typing import (Any, AsyncContextManager, AsyncIterator, Callable,
|
||||||
Callable, Optional, Protocol, TypeVar)
|
Coroutine, Optional, Protocol, TypeVar)
|
||||||
|
|
||||||
from rich.markup import escape
|
from rich.markup import escape
|
||||||
|
|
||||||
from .conductor import ProgressBar, TerminalConductor
|
from .conductor import ProgressBar, TerminalConductor
|
||||||
from .config import Config, Section
|
from .config import Config, Section
|
||||||
from .limiter import Limiter
|
from .limiter import Limiter
|
||||||
from .output_dir import OnConflict, OutputDirectory, Redownload
|
from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload
|
||||||
from .transformer import RuleParseException, Transformer
|
from .transformer import RuleParseException, Transformer
|
||||||
|
|
||||||
|
|
||||||
@ -37,7 +38,7 @@ def noncritical(f: Wrapped) -> Wrapped:
|
|||||||
f(self, *args, **kwargs)
|
f(self, *args, **kwargs)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.print(f"[red]Something went wrong: {escape(str(e))}")
|
self.print(f"[red]Something went wrong: {escape(str(e))}")
|
||||||
self._error_free = False
|
self.error_free = False
|
||||||
return wrapper # type: ignore
|
return wrapper # type: ignore
|
||||||
|
|
||||||
|
|
||||||
@ -61,7 +62,7 @@ class ACrawlerMemberFunction(Protocol):
|
|||||||
__self: "Crawler",
|
__self: "Crawler",
|
||||||
*__args: Any,
|
*__args: Any,
|
||||||
**__kwargs: Any,
|
**__kwargs: Any,
|
||||||
) -> Awaitable[None]:
|
) -> Coroutine[Any, Any, None]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@ -74,7 +75,7 @@ def anoncritical(f: AWrapped) -> AWrapped:
|
|||||||
await f(self, *args, **kwargs)
|
await f(self, *args, **kwargs)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.print(f"[red]Something went wrong: {escape(str(e))}")
|
self.print(f"[red]Something went wrong: {escape(str(e))}")
|
||||||
self._error_free = False
|
self.error_free = False
|
||||||
return wrapper # type: ignore
|
return wrapper # type: ignore
|
||||||
|
|
||||||
|
|
||||||
@ -94,7 +95,7 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
|
|||||||
|
|
||||||
class CrawlerSection(Section):
|
class CrawlerSection(Section):
|
||||||
def output_dir(self, name: str) -> Path:
|
def output_dir(self, name: str) -> Path:
|
||||||
return Path(self.s.get("output_dir", name))
|
return Path(self.s.get("output_dir", name)).expanduser()
|
||||||
|
|
||||||
def redownload(self) -> Redownload:
|
def redownload(self) -> Redownload:
|
||||||
value = self.s.get("redownload", "never-smart")
|
value = self.s.get("redownload", "never-smart")
|
||||||
@ -158,7 +159,7 @@ class Crawler(ABC):
|
|||||||
self._conductor,
|
self._conductor,
|
||||||
)
|
)
|
||||||
|
|
||||||
self._error_free = False
|
self.error_free = False
|
||||||
|
|
||||||
def print(self, text: str) -> None:
|
def print(self, text: str) -> None:
|
||||||
"""
|
"""
|
||||||
@ -203,11 +204,24 @@ class Crawler(ABC):
|
|||||||
def download_bar(
|
def download_bar(
|
||||||
self,
|
self,
|
||||||
path: PurePath,
|
path: PurePath,
|
||||||
size: int,
|
total: Optional[int] = None,
|
||||||
) -> AsyncContextManager[ProgressBar]:
|
) -> AsyncContextManager[ProgressBar]:
|
||||||
pathstr = escape(str(path))
|
pathstr = escape(str(path))
|
||||||
desc = f"[bold green]Downloading[/bold green] {pathstr}"
|
desc = f"[bold green]Downloading[/bold green] {pathstr}"
|
||||||
return self.progress_bar(desc, total=size)
|
return self.progress_bar(desc, total=total)
|
||||||
|
|
||||||
|
async def download(
|
||||||
|
self,
|
||||||
|
path: PurePath,
|
||||||
|
mtime: Optional[datetime] = None,
|
||||||
|
redownload: Optional[Redownload] = None,
|
||||||
|
on_conflict: Optional[OnConflict] = None,
|
||||||
|
) -> Optional[AsyncContextManager[FileSink]]:
|
||||||
|
return await self._output_dir.download(
|
||||||
|
path, mtime, redownload, on_conflict)
|
||||||
|
|
||||||
|
async def cleanup(self) -> None:
|
||||||
|
await self._output_dir.cleanup()
|
||||||
|
|
||||||
async def run(self) -> None:
|
async def run(self) -> None:
|
||||||
"""
|
"""
|
||||||
|
@ -4,7 +4,9 @@ from typing import Callable, Dict
|
|||||||
from ..config import Config
|
from ..config import Config
|
||||||
from ..crawler import Crawler, CrawlerSection
|
from ..crawler import Crawler, CrawlerSection
|
||||||
from .dummy import DummyCrawler
|
from .dummy import DummyCrawler
|
||||||
|
from .local import LocalCrawler, LocalCrawlerSection
|
||||||
|
|
||||||
CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = {
|
CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = {
|
||||||
"dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)),
|
"dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)),
|
||||||
|
"local": lambda n, c, s: LocalCrawler(n, c, LocalCrawlerSection(s)),
|
||||||
}
|
}
|
||||||
|
63
PFERD/crawlers/local.py
Normal file
63
PFERD/crawlers/local.py
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
import asyncio
|
||||||
|
from pathlib import Path, PurePath
|
||||||
|
|
||||||
|
from ..config import Config
|
||||||
|
from ..crawler import Crawler, CrawlerSection, anoncritical
|
||||||
|
|
||||||
|
|
||||||
|
class LocalCrawlerSection(CrawlerSection):
|
||||||
|
def path(self) -> Path:
|
||||||
|
value = self.s.get("path")
|
||||||
|
if value is None:
|
||||||
|
self.missing_value("path")
|
||||||
|
return Path(value).expanduser()
|
||||||
|
|
||||||
|
|
||||||
|
class LocalCrawler(Crawler):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
config: Config,
|
||||||
|
section: LocalCrawlerSection,
|
||||||
|
):
|
||||||
|
super().__init__(name, config, section)
|
||||||
|
|
||||||
|
self._path = section.path()
|
||||||
|
|
||||||
|
async def crawl(self) -> None:
|
||||||
|
await self._crawl_path(self._path, PurePath())
|
||||||
|
if self.error_free:
|
||||||
|
self.cleanup()
|
||||||
|
|
||||||
|
@anoncritical
|
||||||
|
async def _crawl_path(self, path: Path, pure: PurePath) -> None:
|
||||||
|
if path.is_dir():
|
||||||
|
await self._crawl_dir(path, pure)
|
||||||
|
elif path.is_file():
|
||||||
|
await self._crawl_file(path, pure)
|
||||||
|
|
||||||
|
async def _crawl_dir(self, path: Path, pure: PurePath) -> None:
|
||||||
|
tasks = []
|
||||||
|
async with self.crawl_bar(pure):
|
||||||
|
for child in path.iterdir():
|
||||||
|
pure_child = pure / child.name
|
||||||
|
tasks.append(self._crawl_path(child, pure_child))
|
||||||
|
await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
async def _crawl_file(self, path: Path, pure: PurePath) -> None:
|
||||||
|
async with self.download_bar(path) as bar:
|
||||||
|
bar.set_total(path.stat().st_size)
|
||||||
|
|
||||||
|
dl = await self.download(pure)
|
||||||
|
if not dl:
|
||||||
|
return
|
||||||
|
|
||||||
|
async with dl as sink:
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
while True:
|
||||||
|
data = f.read(1024**2)
|
||||||
|
if len(data) == 0:
|
||||||
|
break
|
||||||
|
sink.file.write(data)
|
||||||
|
bar.advance(len(data))
|
||||||
|
sink.done()
|
@ -294,7 +294,7 @@ class OutputDirectory:
|
|||||||
info = DownloadInfo(path, local_path, tmp_path,
|
info = DownloadInfo(path, local_path, tmp_path,
|
||||||
heuristics, on_conflict)
|
heuristics, on_conflict)
|
||||||
try:
|
try:
|
||||||
file = open(tmp_path, "bx")
|
file = open(tmp_path, "xb")
|
||||||
return self._sink_context_manager(file, info)
|
return self._sink_context_manager(file, info)
|
||||||
except FileExistsError:
|
except FileExistsError:
|
||||||
pass # Try again
|
pass # Try again
|
||||||
|
@ -26,7 +26,7 @@ class Pferd:
|
|||||||
if crawler_constructor is None:
|
if crawler_constructor is None:
|
||||||
abort = True
|
abort = True
|
||||||
t = escape(repr(crawler_type))
|
t = escape(repr(crawler_type))
|
||||||
print(f"[red]Error: Unknown type {t}")
|
print(f"[red]Error: Unknown crawler type {t}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
crawler = crawler_constructor(name, self._config, section)
|
crawler = crawler_constructor(name, self._config, section)
|
||||||
|
@ -28,6 +28,15 @@ class MarkConflictException(Exception):
|
|||||||
collides_with: PurePath
|
collides_with: PurePath
|
||||||
|
|
||||||
|
|
||||||
|
# TODO Use PurePath.is_relative_to when updating to 3.9
|
||||||
|
def is_relative_to(a: PurePath, b: PurePath) -> bool:
|
||||||
|
try:
|
||||||
|
a.relative_to(b)
|
||||||
|
return True
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
class Report:
|
class Report:
|
||||||
"""
|
"""
|
||||||
A report of a synchronization. Includes all files found by the crawler, as
|
A report of a synchronization. Includes all files found by the crawler, as
|
||||||
@ -53,7 +62,7 @@ class Report:
|
|||||||
if path == known_path:
|
if path == known_path:
|
||||||
raise MarkDuplicateException(path)
|
raise MarkDuplicateException(path)
|
||||||
|
|
||||||
if path.relative_to(known_path) or known_path.relative_to(path):
|
if is_relative_to(path, known_path) or is_relative_to(known_path, path):
|
||||||
raise MarkConflictException(path, known_path)
|
raise MarkConflictException(path, known_path)
|
||||||
|
|
||||||
self.known_files.add(path)
|
self.known_files.add(path)
|
||||||
|
Loading…
Reference in New Issue
Block a user