""" Utility functions and a scraper/downloader for the KIT DIVA portal. """ import logging import re from dataclasses import dataclass from pathlib import Path from typing import Any, Callable, List, Optional import requests from .errors import FatalException from .logging import PrettyLogger from .organizer import Organizer from .tmp_dir import TmpDir from .transform import Transformable from .utils import stream_to_path LOGGER = logging.getLogger(__name__) PRETTY = PrettyLogger(LOGGER) @dataclass class DivaDownloadInfo(Transformable): """ Information about a DIVA video """ url: str DivaDownloadStrategy = Callable[[Organizer, DivaDownloadInfo], bool] def diva_download_new(organizer: Organizer, info: DivaDownloadInfo) -> bool: """ Accepts only new files. """ resolved_file = organizer.resolve(info.path) if not resolved_file.exists(): return True PRETTY.ignored_file(info.path, "local file exists") return False class DivaPlaylistCrawler: # pylint: disable=too-few-public-methods """ A crawler for DIVA playlists. """ _PLAYLIST_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/detail/" _COLLECTION_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json" def __init__(self, playlist_id: str): self._id = playlist_id @classmethod def fetch_id(cls, playlist_link: str) -> str: """ Fetches the ID for a playerlist, given the base link (e.g. https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271). Raises a FatalException, if the id can not be resolved """ match = re.match(r".+#/details/(.+)", playlist_link) if match is None: raise FatalException( "DIVA: Invalid playlist link format, could not extract details." ) base_name = match.group(1) response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json") if response.status_code != 200: raise FatalException( f"DIVA: Got non-200 status code ({response.status_code}))" f"when requesting {response.url!r}!" ) body = response.json() if body["error"]: raise FatalException(f"DIVA: Server returned error {body['error']!r}.") return body["result"]["collection"]["id"] def crawl(self) -> List[DivaDownloadInfo]: """ Crawls the playlist given in the constructor. """ response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id}) if response.status_code != 200: raise FatalException(f"Server returned status {response.status_code}.") body = response.json() if body["error"]: raise FatalException(f"Server returned error {body['error']!r}.") result = body["result"] if result["resultCount"] > result["pageSize"]: PRETTY.warning("Did not receive all results, some will be missing") download_infos: List[DivaDownloadInfo] = [] for video in result["resultList"]: title = video["title"] collection_title = self._follow_path(["collection", "title"], video) url = self._follow_path( ["resourceList", "derivateList", "mp4", "url"], video ) if url and collection_title and title: path = Path(collection_title, title + ".mp4") download_infos.append(DivaDownloadInfo(path, url)) else: PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}") return download_infos @staticmethod def _follow_path(path: List[str], obj: Any) -> Optional[Any]: """ Follows a property path through an object, bailing at the first None. """ current = obj for path_step in path: if path_step in current: current = current[path_step] else: return None return current class DivaDownloader: """ A downloader for DIVA videos. """ def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy): self._tmp_dir = tmp_dir self._organizer = organizer self._strategy = strategy self._session = requests.session() def download_all(self, infos: List[DivaDownloadInfo]) -> None: """ Download multiple files one after the other. """ for info in infos: self.download(info) def download(self, info: DivaDownloadInfo) -> None: """ Download a single file. """ if not self._strategy(self._organizer, info): self._organizer.mark(info.path) return with self._session.get(info.url, stream=True) as response: if response.status_code == 200: tmp_file = self._tmp_dir.new_path() stream_to_path(response, tmp_file, info.path.name) self._organizer.accept_file(tmp_file, info.path) else: PRETTY.warning(f"Could not download file, got response {response.status_code}")