pferd/PFERD/diva.py

"""
Utility functions and a scraper/downloader for the KIT DIVA portal.
"""
import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, List, Optional

import requests

from .errors import FatalException
from .logging import PrettyLogger
from .organizer import Organizer
from .tmp_dir import TmpDir
from .transform import Transformable
from .utils import stream_to_path

# Module-level logger, plus the PrettyLogger wrapper around it that the code
# below uses for warnings and "ignored file" messages.
LOGGER = logging.getLogger(__name__)
PRETTY = PrettyLogger(LOGGER)


@dataclass
class DivaDownloadInfo(Transformable):
    """
    Information about a DIVA video.

    Instances are created in this module as ``DivaDownloadInfo(path, url)`` and
    their ``path`` attribute is read by the download code. ``path`` is not
    declared here, so it presumably comes from the ``Transformable`` base
    class (confirm there).
    """
    # Address the video file is fetched from by the downloader.
    url: str


# A download strategy receives the organizer and the info of a candidate video
# and returns True if the video should be downloaded, False if it is skipped.
DivaDownloadStrategy = Callable[[Organizer, DivaDownloadInfo], bool]


def diva_download_new(organizer: Organizer, info: DivaDownloadInfo) -> bool:
    """
    Download strategy that only accepts videos without a local copy.

    The video's path is resolved through the organizer. If the resolved file
    already exists, the video is reported as ignored and rejected (False);
    otherwise it is accepted (True).
    """
    if organizer.resolve(info.path).exists():
        PRETTY.ignored_file(info.path, "local file exists")
        return False
    return True


class DivaPlaylistCrawler:
    # pylint: disable=too-few-public-methods
    """
    A crawler for DIVA playlists.

    The crawler is created with a playlist (collection) id. Use fetch_id to
    turn a playlist link into such an id and crawl to list the videos of the
    playlist.
    """

    _PLAYLIST_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/detail/"
    _COLLECTION_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json"

    def __init__(self, playlist_id: str):
        """
        Creates a crawler for the playlist with the given id.
        """
        self._id = playlist_id

    @classmethod
    def fetch_id(cls, playlist_link: str) -> str:
        """
        Fetches the ID for a playlist, given the base link
        (e.g. https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271).

        Raises a FatalException, if the id can not be resolved: the link does
        not contain a "#/details/<name>" part, the server answers with a
        non-200 status code or the response body reports an error.
        """
        match = re.match(r".+#/details/(.+)", playlist_link)
        if match is None:
            raise FatalException(
                "DIVA: Invalid playlist link format, could not extract details."
            )
        base_name = match.group(1)

        response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json")

        if response.status_code != 200:
            # The two literals are concatenated, so the first one has to end
            # with a space to keep the message readable.
            raise FatalException(
                f"DIVA: Got non-200 status code ({response.status_code}) "
                f"when requesting {response.url!r}!"
            )

        body = response.json()

        if body["error"]:
            raise FatalException(f"DIVA: Server returned error {body['error']!r}.")

        return body["result"]["id"]

    def crawl(self) -> "List[DivaDownloadInfo]":
        """
        Crawls the playlist given in the constructor.

        Returns one DivaDownloadInfo per complete video, with the path
        "<collection title>/<video title>.mp4". Videos lacking a title, a
        collection title or an mp4 url are skipped with a warning. A warning
        is also logged if the server reports more results than fit on the
        returned page.

        Raises a FatalException if the server answers with a non-200 status
        code or the response body reports an error.
        """
        response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id})
        if response.status_code != 200:
            raise FatalException(f"Server returned status {response.status_code}.")

        body = response.json()

        if body["error"]:
            raise FatalException(f"Server returned error {body['error']!r}.")

        result = body["result"]

        if result["resultCount"] > result["pageSize"]:
            PRETTY.warning("Did not receive all results, some will be missing")

        download_infos: List[DivaDownloadInfo] = []

        for video in result["resultList"]:
            # Look the title up like the other fields, so an entry without a
            # title ends up in the "incomplete" warning below instead of
            # aborting the whole crawl with a KeyError.
            title = self._follow_path(["title"], video)
            collection_title = self._follow_path(["collection", "title"], video)
            url = self._follow_path(
                ["resourceList", "derivateList", "mp4", "url"],
                video
            )

            if url and collection_title and title:
                path = Path(collection_title, title + ".mp4")
                download_infos.append(DivaDownloadInfo(path, url))
            else:
                PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}")

        return download_infos

    @staticmethod
    def _follow_path(path: List[str], obj: Any) -> Optional[Any]:
        """
        Follows a property path through nested dicts.

        Returns the value found at the end of the path. Returns None as soon
        as a step is missing or the value reached so far is not a dict (e.g.
        None, a string or a list), instead of raising.
        """
        current = obj
        for path_step in path:
            # Only dicts can be descended into; anything else ends the walk.
            if not isinstance(current, dict) or path_step not in current:
                return None
            current = current[path_step]
        return current


class DivaDownloader:
    """
    Downloads DIVA videos, consulting a download strategy for every video and
    handing finished downloads to an organizer.
    """

    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy):
        """
        Stores the temporary directory, organizer and strategy and creates the
        HTTP session used for all downloads of this instance.
        """
        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._strategy = strategy
        self._session = requests.session()

    def download_all(self, infos: List[DivaDownloadInfo]) -> None:
        """
        Downloads the given videos sequentially, in list order.
        """
        for video_info in infos:
            self.download(video_info)

    def download(self, info: DivaDownloadInfo) -> None:
        """
        Downloads one video.

        If the strategy rejects the video, its path is only marked in the
        organizer. Otherwise the video is streamed into a new temporary file
        and passed to the organizer; a non-200 response is logged as a warning
        and nothing is stored.
        """
        wanted = self._strategy(self._organizer, info)
        if not wanted:
            self._organizer.mark(info.path)
            return

        with self._session.get(info.url, stream=True) as response:
            if response.status_code != 200:
                PRETTY.warning(f"Could not download file, got response {response.status_code}")
                return

            target = self._tmp_dir.new_path()
            stream_to_path(response, target, info.path.name)
            self._organizer.accept_file(target, info.path)
Warn when a marked file is added again 2020-05-10 21:37:48 +02:00			`"""`
			`Utility functions and a scraper/downloader for the KIT DIVA portal.`
			`"""`
Added a diva playlist downloader 2020-04-30 16:24:38 +02:00			`import logging`
Allow passing a playlist URL to diva instead of an id 2020-05-10 11:11:28 +02:00			`import re`
Added a diva playlist downloader 2020-04-30 16:24:38 +02:00			`from dataclasses import dataclass`
			`from pathlib import Path`
			`from typing import Any, Callable, List, Optional`

			`import requests`

Move FatalException to errors.py 2020-05-09 00:00:21 +02:00			`from .errors import FatalException`
			`from .logging import PrettyLogger`
Added a diva playlist downloader 2020-04-30 16:24:38 +02:00			`from .organizer import Organizer`
			`from .tmp_dir import TmpDir`
			`from .transform import Transformable`
			`from .utils import stream_to_path`

			`LOGGER = logging.getLogger(__name__)`
			`PRETTY = PrettyLogger(LOGGER)`


			`@dataclass`
			`class DivaDownloadInfo(Transformable):`
			`"""`
			`Information about a DIVA video`
			`"""`
			`url: str`


			`DivaDownloadStrategy = Callable[[Organizer, DivaDownloadInfo], bool]`


			`def diva_download_new(organizer: Organizer, info: DivaDownloadInfo) -> bool:`
			`"""`
			`Accepts only new files.`
			`"""`
			`resolved_file = organizer.resolve(info.path)`
			`if not resolved_file.exists():`
			`return True`
			`PRETTY.ignored_file(info.path, "local file exists")`
			`return False`


			`class DivaPlaylistCrawler:`
			`# pylint: disable=too-few-public-methods`
			`"""`
			`A crawler for DIVA playlists.`
			`"""`

Allow passing a playlist URL to diva instead of an id 2020-05-10 11:11:28 +02:00			`_PLAYLIST_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/detail/"`
			`_COLLECTION_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json"`
Added a diva playlist downloader 2020-04-30 16:24:38 +02:00
			`def __init__(self, playlist_id: str):`
			`self._id = playlist_id`

Allow passing a playlist URL to diva instead of an id 2020-05-10 11:11:28 +02:00			`@classmethod`
			`def fetch_id(cls, playlist_link: str) -> str:`
			`"""`
			`Fetches the ID for a playerlist, given the base link`
			`(e.g. https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271).`

			`Raises a FatalException, if the id can not be resolved`
			`"""`
			`match = re.match(r".+#/details/(.+)", playlist_link)`
			`if match is None:`
			`raise FatalException(`
			`"DIVA: Invalid playlist link format, could not extract details."`
			`)`
			`base_name = match.group(1)`

			`response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json")`

			`if response.status_code != 200:`
			`raise FatalException(`
			`f"DIVA: Got non-200 status code ({response.status_code}))"`
			`f"when requesting {response.url!r}!"`
			`)`

			`body = response.json()`

			`if body["error"]:`
			`raise FatalException(f"DIVA: Server returned error {body['error']!r}.")`

			`return body["result"]["id"]`

Added a diva playlist downloader 2020-04-30 16:24:38 +02:00			`def crawl(self) -> List[DivaDownloadInfo]:`
			`"""`
			`Crawls the playlist given in the constructor.`
			`"""`
Allow passing a playlist URL to diva instead of an id 2020-05-10 11:11:28 +02:00			`response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id})`
Added a diva playlist downloader 2020-04-30 16:24:38 +02:00			`if response.status_code != 200:`
			`raise FatalException(f"Server returned status {response.status_code}.")`

			`body = response.json()`

			`if body["error"]:`
			`raise FatalException(f"Server returned error {body['error']!r}.")`

			`result = body["result"]`

			`if result["resultCount"] > result["pageSize"]:`
			`PRETTY.warning("Did not receive all results, some will be missing")`

			`download_infos: List[DivaDownloadInfo] = []`

			`for video in result["resultList"]:`
			`title = video["title"]`
			`collection_title = self._follow_path(["collection", "title"], video)`
			`url = self._follow_path(`
			`["resourceList", "derivateList", "mp4", "url"],`
			`video`
			`)`

			`if url and collection_title and title:`
			`path = Path(collection_title, title + ".mp4")`
			`download_infos.append(DivaDownloadInfo(path, url))`
			`else:`
			`PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}")`

			`return download_infos`

			`@staticmethod`
			`def _follow_path(path: List[str], obj: Any) -> Optional[Any]:`
			`"""`
			`Follows a property path through an object, bailing at the first None.`
			`"""`
			`current = obj`
			`for path_step in path:`
			`if path_step in current:`
			`current = current[path_step]`
			`else:`
			`return None`
			`return current`


			`class DivaDownloader:`
			`"""`
			`A downloader for DIVA videos.`
			`"""`

			`def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy):`
			`self._tmp_dir = tmp_dir`
			`self._organizer = organizer`
			`self._strategy = strategy`
			`self._session = requests.session()`

			`def download_all(self, infos: List[DivaDownloadInfo]) -> None:`
			`"""`
			`Download multiple files one after the other.`
			`"""`
			`for info in infos:`
			`self.download(info)`

			`def download(self, info: DivaDownloadInfo) -> None:`
			`"""`
			`Download a single file.`
			`"""`
			`if not self._strategy(self._organizer, info):`
			`self._organizer.mark(info.path)`
			`return`

			`with self._session.get(info.url, stream=True) as response:`
			`if response.status_code == 200:`
			`tmp_file = self._tmp_dir.new_path()`
Add a download progress bar 2020-05-08 00:26:33 +02:00			`stream_to_path(response, tmp_file, info.path.name)`
Added a diva playlist downloader 2020-04-30 16:24:38 +02:00			`self._organizer.accept_file(tmp_file, info.path)`
			`else:`
			`PRETTY.warning(f"Could not download file, got response {response.status_code}")`