pferd/PFERD/diva.py

import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, List, Optional

import requests

from .logging import FatalException, PrettyLogger
from .organizer import Organizer
from .tmp_dir import TmpDir
from .transform import Transformable
from .utils import stream_to_path

LOGGER = logging.getLogger(__name__)
PRETTY = PrettyLogger(LOGGER)


@dataclass
class DivaDownloadInfo(Transformable):
    """
    Information about a DIVA video
    """
    url: str


DivaDownloadStrategy = Callable[[Organizer, DivaDownloadInfo], bool]


def diva_download_new(organizer: Organizer, info: DivaDownloadInfo) -> bool:
    """
    Accepts only new files.
    """
    resolved_file = organizer.resolve(info.path)
    if not resolved_file.exists():
        return True
    PRETTY.ignored_file(info.path, "local file exists")
    return False


class DivaPlaylistCrawler:
    # pylint: disable=too-few-public-methods
    """
    A crawler for DIVA playlists.
    """

    _BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json"

    def __init__(self, playlist_id: str):
        self._id = playlist_id

    def crawl(self) -> List[DivaDownloadInfo]:
        """
        Crawls the playlist given in the constructor.
        """
        response = requests.get(self._BASE_URL, params={"collection": self._id})
        if response.status_code != 200:
            raise FatalException(f"Server returned status {response.status_code}.")

        body = response.json()

        if body["error"]:
            raise FatalException(f"Server returned error {body['error']!r}.")

        result = body["result"]

        if result["resultCount"] > result["pageSize"]:
            PRETTY.warning("Did not receive all results, some will be missing")

        download_infos: List[DivaDownloadInfo] = []

        for video in result["resultList"]:
            title = video["title"]
            collection_title = self._follow_path(["collection", "title"], video)
            url = self._follow_path(
                ["resourceList", "derivateList", "mp4", "url"],
                video
            )

            if url and collection_title and title:
                path = Path(collection_title, title + ".mp4")
                download_infos.append(DivaDownloadInfo(path, url))
            else:
                PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}")

        return download_infos

    @staticmethod
    def _follow_path(path: List[str], obj: Any) -> Optional[Any]:
        """
        Follows a property path through an object, bailing at the first None.
        """
        current = obj
        for path_step in path:
            if path_step in current:
                current = current[path_step]
            else:
                return None
        return current


class DivaDownloader:
    """
    A downloader for DIVA videos.
    """

    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy):
        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._strategy = strategy
        self._session = requests.session()

    def download_all(self, infos: List[DivaDownloadInfo]) -> None:
        """
        Download multiple files one after the other.
        """
        for info in infos:
            self.download(info)

    def download(self, info: DivaDownloadInfo) -> None:
        """
        Download a single file.
        """
        if not self._strategy(self._organizer, info):
            self._organizer.mark(info.path)
            return

        with self._session.get(info.url, stream=True) as response:
            if response.status_code == 200:
                tmp_file = self._tmp_dir.new_path()
                stream_to_path(response, tmp_file, info.path.name)
                self._organizer.accept_file(tmp_file, info.path)
            else:
                PRETTY.warning(f"Could not download file, got response {response.status_code}")