mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Unify crawling and downloading steps
Now, the progress bar, limiter etc. for downloading and crawling are all handled via the reusable CrawlToken and DownloadToken context managers.
This commit is contained in:
parent
098ac45758
commit
ec95dda18f
@ -1,10 +1,8 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
|
from typing import Any, Awaitable, Callable, Dict, Optional, Tuple, TypeVar
|
||||||
from typing import Any, AsyncContextManager, AsyncIterator, Awaitable, Callable, Dict, Optional, TypeVar
|
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
from rich.markup import escape
|
from rich.markup import escape
|
||||||
@ -13,9 +11,10 @@ from .authenticator import Authenticator
|
|||||||
from .config import Config, Section
|
from .config import Config, Section
|
||||||
from .limiter import Limiter
|
from .limiter import Limiter
|
||||||
from .logging import ProgressBar, log
|
from .logging import ProgressBar, log
|
||||||
from .output_dir import FileSink, OnConflict, OutputDirectory, OutputDirError, Redownload
|
from .output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
|
||||||
from .report import MarkConflictError, MarkDuplicateError
|
from .report import MarkConflictError, MarkDuplicateError
|
||||||
from .transformer import Transformer
|
from .transformer import Transformer
|
||||||
|
from .utils import ReusableAsyncContextManager
|
||||||
from .version import NAME, VERSION
|
from .version import NAME, VERSION
|
||||||
|
|
||||||
|
|
||||||
@ -88,6 +87,36 @@ def anoncritical(f: AWrapped) -> AWrapped:
|
|||||||
return wrapper # type: ignore
|
return wrapper # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
    """
    Context manager token for a single crawl step.

    Entering the token first enters the limiter's crawl context and then
    opens a crawl progress bar labelled with the given description. The
    progress bar is the value produced on entry.

    Both contexts are registered on ``self._stack``, which is supplied by the
    base class (presumably an ``AsyncExitStack``-like object that unwinds them
    on exit; confirm against ``ReusableAsyncContextManager`` in ``.utils``).
    """

    def __init__(self, limiter: Limiter, desc: str):
        """
        Store the limiter and the (already markup-formatted) bar description.
        """

        super().__init__()

        self._desc = desc
        self._limiter = limiter

    async def _on_aenter(self) -> ProgressBar:
        """
        Enter the crawl limit, then open and return the crawl progress bar.
        """

        # The limiter is entered before the bar is created, so the bar is
        # only opened once the crawl limit context has been entered.
        crawl_limit = self._limiter.limit_crawl()
        await self._stack.enter_async_context(crawl_limit)

        return self._stack.enter_context(log.crawl_bar(self._desc))
|
||||||
|
|
||||||
|
|
||||||
|
class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
    """
    Context manager token for a single download step.

    Entering the token enters, in order:

    1. the limiter's *download* context,
    2. the file sink token handed out by the output directory, and
    3. a *download* progress bar labelled with the given description.

    The value produced on entry is the ``(progress bar, file sink)`` pair.

    All three contexts are registered on ``self._stack``, which is supplied
    by the base class (presumably an ``AsyncExitStack``-like object that
    unwinds them on exit; confirm against ``ReusableAsyncContextManager`` in
    ``.utils``).
    """

    def __init__(self, limiter: Limiter, fs_token: FileSinkToken, desc: str):
        """
        Store the limiter, the file sink token and the bar description.
        """

        super().__init__()

        self._limiter = limiter
        self._fs_token = fs_token
        self._desc = desc

    async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
        """
        Enter the download limit, open the file sink and the download bar.
        """

        # Downloads must go through the download limiter, not the crawl
        # limiter. Using limit_crawl() here would apply the crawl limit to
        # downloads instead of the download limit.
        await self._stack.enter_async_context(self._limiter.limit_download())

        sink = await self._stack.enter_async_context(self._fs_token)

        # Likewise, show a download bar rather than a crawl bar so that the
        # progress display matches what is actually happening.
        bar = self._stack.enter_context(log.download_bar(self._desc))

        return bar, sink
|
||||||
|
|
||||||
|
|
||||||
class CrawlerSection(Section):
|
class CrawlerSection(Section):
|
||||||
def output_dir(self, name: str) -> Path:
|
def output_dir(self, name: str) -> Path:
|
||||||
# TODO Use removeprefix() after switching to 3.9
|
# TODO Use removeprefix() after switching to 3.9
|
||||||
@ -190,30 +219,12 @@ class Crawler(ABC):
|
|||||||
section.on_conflict(),
|
section.on_conflict(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@asynccontextmanager
|
async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
|
||||||
async def crawl_bar(
|
if self._transformer.transform(path) is None:
|
||||||
self,
|
return None
|
||||||
path: PurePath,
|
|
||||||
total: Optional[int] = None,
|
|
||||||
) -> AsyncIterator[ProgressBar]:
|
|
||||||
desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}"
|
desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}"
|
||||||
async with self._limiter.limit_crawl():
|
return CrawlToken(self._limiter, desc)
|
||||||
with log.crawl_bar(desc, total=total) as bar:
|
|
||||||
yield bar
|
|
||||||
|
|
||||||
@asynccontextmanager
|
|
||||||
async def download_bar(
|
|
||||||
self,
|
|
||||||
path: PurePath,
|
|
||||||
total: Optional[int] = None,
|
|
||||||
) -> AsyncIterator[ProgressBar]:
|
|
||||||
desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}"
|
|
||||||
async with self._limiter.limit_download():
|
|
||||||
with log.download_bar(desc, total=total) as bar:
|
|
||||||
yield bar
|
|
||||||
|
|
||||||
def should_crawl(self, path: PurePath) -> bool:
|
|
||||||
return self._transformer.transform(path) is not None
|
|
||||||
|
|
||||||
async def download(
|
async def download(
|
||||||
self,
|
self,
|
||||||
@ -221,13 +232,17 @@ class Crawler(ABC):
|
|||||||
mtime: Optional[datetime] = None,
|
mtime: Optional[datetime] = None,
|
||||||
redownload: Optional[Redownload] = None,
|
redownload: Optional[Redownload] = None,
|
||||||
on_conflict: Optional[OnConflict] = None,
|
on_conflict: Optional[OnConflict] = None,
|
||||||
) -> Optional[AsyncContextManager[FileSink]]:
|
) -> Optional[DownloadToken]:
|
||||||
transformed_path = self._transformer.transform(path)
|
transformed_path = self._transformer.transform(path)
|
||||||
if transformed_path is None:
|
if transformed_path is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return await self._output_dir.download(
|
fs_token = await self._output_dir.download(transformed_path, mtime, redownload, on_conflict)
|
||||||
transformed_path, mtime, redownload, on_conflict)
|
if fs_token is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}"
|
||||||
|
return DownloadToken(self._limiter, fs_token, desc)
|
||||||
|
|
||||||
async def cleanup(self) -> None:
|
async def cleanup(self) -> None:
|
||||||
await self._output_dir.cleanup()
|
await self._output_dir.cleanup()
|
||||||
@ -239,10 +254,10 @@ class Crawler(ABC):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
with log.show_progress():
|
with log.show_progress():
|
||||||
await self.crawl()
|
await self._run()
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def crawl(self) -> None:
|
async def _run(self) -> None:
|
||||||
"""
|
"""
|
||||||
Overwrite this function if you are writing a crawler.
|
Overwrite this function if you are writing a crawler.
|
||||||
|
|
||||||
|
@ -8,8 +8,7 @@ from dataclasses import dataclass
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
|
from typing import BinaryIO, Iterator, Optional, Tuple
|
||||||
from typing import AsyncContextManager, BinaryIO, Iterator, Optional, Tuple
|
|
||||||
|
|
||||||
from rich.markup import escape
|
from rich.markup import escape
|
||||||
|
|
||||||
@ -307,7 +306,7 @@ class OutputDirectory:
|
|||||||
mtime: Optional[datetime] = None,
|
mtime: Optional[datetime] = None,
|
||||||
redownload: Optional[Redownload] = None,
|
redownload: Optional[Redownload] = None,
|
||||||
on_conflict: Optional[OnConflict] = None,
|
on_conflict: Optional[OnConflict] = None,
|
||||||
) -> Optional[AsyncContextManager[FileSink]]:
|
) -> Optional[FileSinkToken]:
|
||||||
"""
|
"""
|
||||||
May throw an OutputDirError, a MarkDuplicateError or a
|
May throw an OutputDirError, a MarkDuplicateError or a
|
||||||
MarkConflictError.
|
MarkConflictError.
|
||||||
|
Loading…
Reference in New Issue
Block a user