# pferd/PFERD/crawler.py
from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from datetime import datetime
from pathlib import Path, PurePath
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable,
                    Callable, Dict, Optional, TypeVar)

import aiohttp
from rich.markup import escape

from .authenticator import Authenticator
from .conductor import ProgressBar, TerminalConductor
from .config import Config, Section
from .limiter import Limiter
from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload
from .transformer import RuleParseException, Transformer
from .version import __version__


class CrawlerLoadException(Exception):
    pass


Wrapped = TypeVar("Wrapped", bound=Callable[..., None])


def noncritical(f: Wrapped) -> Wrapped:
    """
    Warning: Must only be applied to member functions of the Crawler class!

    Catches all exceptions occurring during the function call. If an exception
    occurs, the crawler's error_free variable is set to False.
    """

    def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
        try:
            f(self, *args, **kwargs)
        except Exception as e:
            self.print(f"[red]Something went wrong: {escape(str(e))}")
            self.error_free = False

    return wrapper  # type: ignore


def repeat(attempts: int) -> Callable[[Wrapped], Wrapped]:
    """
    Warning: Must only be applied to member functions of the Crawler class!

    If an exception occurs during the function call, retries the function call
    a set amount of times. Exceptions that occur during the last attempt are
    not caught and instead passed on upwards.
    """

    def decorator(f: Wrapped) -> Wrapped:
        def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
            for _ in range(attempts - 1):
                try:
                    f(self, *args, **kwargs)
                    return
                except Exception:
                    pass
            f(self, *args, **kwargs)

        return wrapper  # type: ignore

    return decorator
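

# Illustrative sketch (not part of this module): stacking @noncritical and
# @repeat on a Crawler method. With @noncritical outermost, the call is
# retried up to three times and only the final failure is printed and
# recorded in error_free. "MyCrawler" and "write_report" are hypothetical.
#
#     class MyCrawler(Crawler):
#         @noncritical
#         @repeat(3)
#         def write_report(self) -> None:
#             ...  # any exception raised here triggers a retry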


AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])


def anoncritical(f: AWrapped) -> AWrapped:
    """
    An async version of @noncritical.

    Warning: Must only be applied to member functions of the Crawler class!

    Catches all exceptions occurring during the function call. If an exception
    occurs, the crawler's error_free variable is set to False.
    """

    async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
        try:
            await f(self, *args, **kwargs)
        except Exception as e:
            self.print(f"[red]Something went wrong: {escape(str(e))}")
            self.error_free = False

    return wrapper  # type: ignore


def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
    """
    An async version of @repeat.

    Warning: Must only be applied to member functions of the Crawler class!

    If an exception occurs during the function call, retries the function call
    a set amount of times. Exceptions that occur during the last attempt are
    not caught and instead passed on upwards.
    """

    def decorator(f: AWrapped) -> AWrapped:
        async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None:
            for _ in range(attempts - 1):
                try:
                    await f(self, *args, **kwargs)
                    return
                except Exception:
                    pass
            await f(self, *args, **kwargs)

        return wrapper  # type: ignore

    return decorator
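

# The async variants stack the same way (illustrative sketch; "fetch_index"
# is a hypothetical coroutine method):
#
#     class MyCrawler(Crawler):
#         @anoncritical
#         @arepeat(3)
#         async def fetch_index(self) -> None:
#             ...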


class CrawlerSection(Section):
    def output_dir(self, name: str) -> Path:
        return Path(self.s.get("output_dir", name)).expanduser()

    def redownload(self) -> Redownload:
        value = self.s.get("redownload", "never-smart")
        if value == "never":
            return Redownload.NEVER
        elif value == "never-smart":
            return Redownload.NEVER_SMART
        elif value == "always":
            return Redownload.ALWAYS
        elif value == "always-smart":
            return Redownload.ALWAYS_SMART
        self.invalid_value(
            "redownload",
            value,
            "Must be 'never', 'never-smart', 'always' or 'always-smart'",
        )

    def on_conflict(self) -> OnConflict:
        value = self.s.get("on_conflict", "prompt")
        if value == "prompt":
            return OnConflict.PROMPT
        elif value == "local-first":
            return OnConflict.LOCAL_FIRST
        elif value == "remote-first":
            return OnConflict.REMOTE_FIRST
        elif value == "no-delete":
            return OnConflict.NO_DELETE
        self.invalid_value(
            "on_conflict",
            value,
            "Must be 'prompt', 'local-first', 'remote-first' or 'no-delete'",
        )

    def transform(self) -> str:
        return self.s.get("transform", "")

    def max_concurrent_crawls(self) -> int:
        value = self.s.getint("max_concurrent_crawls", fallback=1)
        if value <= 0:
            self.invalid_value("max_concurrent_crawls", value,
                               "Must be greater than 0")
        return value

    def max_concurrent_downloads(self) -> int:
        value = self.s.getint("max_concurrent_downloads", fallback=1)
        if value <= 0:
            self.invalid_value("max_concurrent_downloads", value,
                               "Must be greater than 0")
        return value

    def request_delay(self) -> float:
        value = self.s.getfloat("request_delay", fallback=0.0)
        if value < 0:
            self.invalid_value("request_delay", value,
                               "Must be greater than or equal to 0")
        return value

    def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
        value = self.s.get("auth")
        if value is None:
            self.missing_value("auth")
        auth = authenticators.get(f"auth:{value}")
        if auth is None:
            self.invalid_value("auth", value, "No such auth section exists")
        return auth
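

# Illustrative config sketch matching the accessors above (section name and
# values are hypothetical). Note that "auth = ilias" is resolved against the
# authenticators dict under the key "auth:ilias":
#
#     [crawler:example]
#     output_dir = example
#     redownload = never-smart
#     on_conflict = prompt
#     max_concurrent_crawls = 1
#     max_concurrent_downloads = 1
#     request_delay = 0.5
#     auth = ilias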


class Crawler(ABC):
    def __init__(
        self,
        name: str,
        section: CrawlerSection,
        config: Config,
        conductor: TerminalConductor,
    ) -> None:
        """
        Initialize a crawler from its name and its section in the config file.

        If you are writing your own constructor for your own crawler, make
        sure to call this constructor first (via super().__init__).

        May throw a CrawlerLoadException.
        """

        self.name = name
        self._conductor = conductor
        self.error_free = True

        self._limiter = Limiter(
            crawl_limit=section.max_concurrent_crawls(),
            download_limit=section.max_concurrent_downloads(),
            delay=section.request_delay(),
        )

        try:
            self._transformer = Transformer(section.transform())
        except RuleParseException as e:
            e.pretty_print()
            raise CrawlerLoadException()

        self._output_dir = OutputDirectory(
            config.working_dir / section.output_dir(name),
            section.redownload(),
            section.on_conflict(),
            self._conductor,
        )

    def print(self, text: str) -> None:
        """
        Print rich markup to the terminal. Crawlers *must* use this function
        to print things unless they are holding an exclusive output context
        manager! Be careful to escape all user-supplied strings.
        """
        self._conductor.print(text)

    def exclusive_output(self) -> AsyncContextManager[None]:
        """
        Acquire exclusive rights to the terminal output. While this context
        manager is held, output such as printing and progress bars from other
        threads is suspended and the current thread may do whatever it wants
        with the terminal. However, it must return the terminal to its
        original state before exiting the context manager.

        No two threads can hold this context manager at the same time.

        Useful for password or confirmation prompts as well as running other
        programs while crawling (e.g. to get certain credentials).
        """
        return self._conductor.exclusive_output()

    @asynccontextmanager
    async def crawl_bar(
        self,
        path: PurePath,
        total: Optional[int] = None,
    ) -> AsyncIterator[ProgressBar]:
        desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}"
        async with self._limiter.limit_crawl():
            with self._conductor.progress_bar(desc, total=total) as bar:
                yield bar

    @asynccontextmanager
    async def download_bar(
        self,
        path: PurePath,
        total: Optional[int] = None,
    ) -> AsyncIterator[ProgressBar]:
        desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}"
        async with self._limiter.limit_download():
            with self._conductor.progress_bar(desc, total=total) as bar:
                yield bar

    async def download(
        self,
        path: PurePath,
        mtime: Optional[datetime] = None,
        redownload: Optional[Redownload] = None,
        on_conflict: Optional[OnConflict] = None,
    ) -> Optional[AsyncContextManager[FileSink]]:
        return await self._output_dir.download(
            path, mtime, redownload, on_conflict)
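
    # Usage sketch (illustrative): download() returns None when the file
    # should not be (re)downloaded, otherwise an async context manager that
    # yields a FileSink. The FileSink "file" attribute is an assumption.
    #
    #     maybe_dl = await self.download(path, mtime=mtime)
    #     if maybe_dl is None:
    #         return  # file is up to date or the conflict policy keeps it
    #     async with maybe_dl as sink:
    #         sink.file.write(data)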

    async def cleanup(self) -> None:
        await self._output_dir.cleanup()

    async def run(self) -> None:
        """
        Start the crawling process. Call this function if you want to use a
        crawler.
        """
        async with self._conductor:
            await self.crawl()

    @abstractmethod
    async def crawl(self) -> None:
        """
        Override this function if you are writing a crawler.

        This function must not return before all crawling is complete. To
        crawl multiple things concurrently, asyncio.gather can be used.
        """
        pass
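

# Illustrative subclass sketch ("ExampleCrawler" and its helpers are
# hypothetical; requires "import asyncio"): crawl() drives the whole process
# and may use asyncio.gather for concurrency, as the docstring suggests.
#
#     class ExampleCrawler(Crawler):
#         async def crawl(self) -> None:
#             async with self.crawl_bar(PurePath(".")) as bar:
#                 paths = await self._list_remote_files()
#             await asyncio.gather(*(self._handle_file(p) for p in paths))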


class HttpCrawler(Crawler):
    COOKIE_FILE = PurePath(".cookies")

    def __init__(
        self,
        name: str,
        section: CrawlerSection,
        config: Config,
        conductor: TerminalConductor,
    ) -> None:
        super().__init__(name, section, config, conductor)

        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
        self._output_dir.register_reserved(self.COOKIE_FILE)

    async def run(self) -> None:
        cookie_jar = aiohttp.CookieJar()

        try:
            cookie_jar.load(self._cookie_jar_path)
        except Exception:
            pass

        async with aiohttp.ClientSession(
            headers={"User-Agent": f"pferd/{__version__}"},
            cookie_jar=cookie_jar,
        ) as session:
            self.session = session
            try:
                await super().run()
            finally:
                del self.session

        try:
            cookie_jar.save(self._cookie_jar_path)
        except Exception:
            self.print(
                "[bold red]Warning:[/] Failed to save cookies to "
                + escape(str(self.COOKIE_FILE))
            )
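

# Illustrative sketch of a subclass using the shared aiohttp session (names,
# URL and the FileSink "file" attribute are assumptions). self.session is
# only available while run() is executing, which is when crawl() is called.
#
#     class ExampleHttpCrawler(HttpCrawler):
#         async def crawl(self) -> None:
#             maybe_dl = await self.download(PurePath("file.pdf"))
#             if maybe_dl is None:
#                 return
#             url = "https://example.com/file.pdf"
#             async with maybe_dl as sink, self.session.get(url) as resp:
#                 sink.file.write(await resp.read())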