pferd/PFERD/diva.py
2020-05-11 00:25:34 +02:00

170 lines
5.1 KiB
Python

"""
Utility functions and a scraper/downloader for the KIT DIVA portal.
"""
import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, List, Optional
import requests
from .errors import FatalException
from .logging import PrettyLogger
from .organizer import Organizer
from .tmp_dir import TmpDir
from .transform import Transformable
from .utils import stream_to_path
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# PrettyLogger built on top of LOGGER; used in this module to report
# warnings and ignored files.
PRETTY = PrettyLogger(LOGGER)
@dataclass
class DivaDownloadInfo(Transformable):
    """
    Describes a single DIVA video that can be downloaded.
    """

    # Address the video data is requested from when downloading.
    url: str
# Signature of a download strategy: it receives the organizer and the video's
# download info and returns True if the video should be downloaded.
DivaDownloadStrategy = Callable[[Organizer, DivaDownloadInfo], bool]
def diva_download_new(organizer: Organizer, info: DivaDownloadInfo) -> bool:
    """
    Download strategy that only accepts videos not present locally yet.

    The video's path is resolved through the organizer. If the resolved file
    already exists, the video is reported as ignored and False is returned;
    otherwise True is returned.
    """

    target = organizer.resolve(info.path)

    if target.exists():
        PRETTY.ignored_file(info.path, "local file exists")
        return False

    return True
class DivaPlaylistCrawler:
    # pylint: disable=too-few-public-methods
    """
    A crawler for DIVA playlists.
    """

    _PLAYLIST_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/detail/"
    _COLLECTION_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json"

    def __init__(self, playlist_id: str):
        """
        Creates a crawler for the playlist (collection) with the given id.
        """
        self._id = playlist_id

    @classmethod
    def fetch_id(cls, playlist_link: str) -> str:
        """
        Fetches the ID for a playlist, given the base link
        (e.g. https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271).

        Raises a FatalException if the link has an unexpected format, the
        server answers with a non-200 status code or the response body
        reports an error.
        """
        match = re.match(r".+#/details/(.+)", playlist_link)
        if match is None:
            raise FatalException(
                "DIVA: Invalid playlist link format, could not extract details."
            )

        base_name = match.group(1)

        response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json")
        if response.status_code != 200:
            # The two fragments are concatenated, so the first one has to end
            # with a space to keep the message readable.
            raise FatalException(
                f"DIVA: Got non-200 status code ({response.status_code}) "
                f"when requesting {response.url!r}!"
            )

        body = response.json()
        if body["error"]:
            raise FatalException(f"DIVA: Server returned error {body['error']!r}.")

        return body["result"]["collection"]["id"]

    def crawl(self) -> List[DivaDownloadInfo]:
        """
        Crawls the playlist given in the constructor.

        Returns one DivaDownloadInfo per video that has a title, a collection
        title and an mp4 url. Videos missing any of these are skipped with a
        warning.

        Raises a FatalException if the server answers with a non-200 status
        code or the response body reports an error.
        """
        response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id})
        if response.status_code != 200:
            raise FatalException(f"Server returned status {response.status_code}.")

        body = response.json()
        if body["error"]:
            raise FatalException(f"Server returned error {body['error']!r}.")

        result = body["result"]

        # Only a single page is requested, so anything beyond it is not seen.
        if result["resultCount"] > result["pageSize"]:
            PRETTY.warning("Did not receive all results, some will be missing")

        download_infos: List[DivaDownloadInfo] = []

        for video in result["resultList"]:
            # Look the title up defensively so that a video without one ends
            # up in the "incomplete" branch below instead of raising KeyError.
            title = self._follow_path(["title"], video)
            collection_title = self._follow_path(["collection", "title"], video)
            url = self._follow_path(
                ["resourceList", "derivateList", "mp4", "url"],
                video
            )

            if url and collection_title and title:
                path = Path(collection_title, title + ".mp4")
                download_infos.append(DivaDownloadInfo(path, url))
            else:
                PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}")

        return download_infos

    @staticmethod
    def _follow_path(path: List[str], obj: Any) -> Optional[Any]:
        """
        Follows a property path through nested dicts.

        Returns the value found at the end of the path, or None as soon as a
        step is missing or the current value is not a dict (e.g. None).
        """
        current = obj
        for path_step in path:
            # Anything that is not a dict cannot be descended into. Checking
            # this first avoids a TypeError on None and on string values.
            if not isinstance(current, dict) or path_step not in current:
                return None
            current = current[path_step]
        return current
class DivaDownloader:
    """
    Downloads DIVA videos into an organizer, using a strategy to decide
    which videos are actually fetched.
    """

    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy):
        """
        Stores the collaborators and opens one requests session that is
        reused for every download.
        """
        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._strategy = strategy
        self._session = requests.session()

    def download_all(self, infos: List[DivaDownloadInfo]) -> None:
        """
        Downloads the given videos sequentially, in list order.
        """
        for video_info in infos:
            self.download(video_info)

    def download(self, info: DivaDownloadInfo) -> None:
        """
        Downloads one video.

        If the strategy rejects the video, its path is only marked in the
        organizer. Otherwise the video is streamed into a fresh temporary
        file and handed to the organizer. A non-200 response results in a
        warning and nothing is stored.
        """
        wanted = self._strategy(self._organizer, info)
        if not wanted:
            self._organizer.mark(info.path)
            return

        with self._session.get(info.url, stream=True) as response:
            status = response.status_code
            if status != 200:
                PRETTY.warning(f"Could not download file, got response {response.status_code}")
                return

            destination = self._tmp_dir.new_path()
            stream_to_path(response, destination, info.path.name)
            self._organizer.accept_file(destination, info.path)