From bee3d7099839542be9a1a514144204cbe4cf56cf Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Thu, 30 Apr 2020 16:24:38 +0200
Subject: [PATCH] Added a diva playlist downloader

---
 PFERD/diva.py  | 133 +++++++++++++++++++++++++++++++++++++++++++++++++
 PFERD/pferd.py |  66 ++++++++++++++++++++++--
 2 files changed, 194 insertions(+), 5 deletions(-)
 create mode 100644 PFERD/diva.py

diff --git a/PFERD/diva.py b/PFERD/diva.py
new file mode 100644
index 0000000..bfd0299
--- /dev/null
+++ b/PFERD/diva.py
@@ -0,0 +1,133 @@
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Callable, List, Optional
+
+import requests
+
+from .logging import FatalException, PrettyLogger
+from .organizer import Organizer
+from .tmp_dir import TmpDir
+from .transform import Transformable
+from .utils import stream_to_path
+
+LOGGER = logging.getLogger(__name__)
+PRETTY = PrettyLogger(LOGGER)
+
+
+@dataclass
+class DivaDownloadInfo(Transformable):
+    """
+    Information about a DIVA video
+    """
+    url: str
+
+
+DivaDownloadStrategy = Callable[[Organizer, DivaDownloadInfo], bool]
+
+
+def diva_download_new(organizer: Organizer, info: DivaDownloadInfo) -> bool:
+    """
+    Accepts only new files.
+    """
+    resolved_file = organizer.resolve(info.path)
+    if not resolved_file.exists():
+        return True
+    PRETTY.ignored_file(info.path, "local file exists")
+    return False
+
+
+class DivaPlaylistCrawler:
+    # pylint: disable=too-few-public-methods
+    """
+    A crawler for DIVA playlists.
+    """
+
+    _BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json"
+
+    def __init__(self, playlist_id: str):
+        self._id = playlist_id
+
+    def crawl(self) -> List[DivaDownloadInfo]:
+        """
+        Crawls the playlist given in the constructor.
+        """
+        response = requests.get(self._BASE_URL, params={"collection": self._id})
+        if response.status_code != 200:
+            raise FatalException(f"Server returned status {response.status_code}.")
+
+        body = response.json()
+
+        if body["error"]:
+            raise FatalException(f"Server returned error {body['error']!r}.")
+
+        result = body["result"]
+
+        if result["resultCount"] > result["pageSize"]:
+            PRETTY.warning("Did not receive all results, some will be missing")
+
+        download_infos: List[DivaDownloadInfo] = []
+
+        for video in result["resultList"]:
+            title = video.get("title")  # a missing title is reported below, not a KeyError
+            collection_title = self._follow_path(["collection", "title"], video)
+            url = self._follow_path(
+                ["resourceList", "derivateList", "mp4", "url"],
+                video
+            )
+
+            if url and collection_title and title:
+                path = Path(collection_title, title + ".mp4")
+                download_infos.append(DivaDownloadInfo(path, url))
+            else:
+                PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}")
+
+        return download_infos
+
+    @staticmethod
+    def _follow_path(path: List[str], obj: Any) -> Optional[Any]:
+        """
+        Follows a property path through nested dicts, bailing at the first missing step.
+        """
+        current = obj
+        for path_step in path:
+            if isinstance(current, dict) and path_step in current:
+                current = current[path_step]
+            else:
+                return None
+        return current
+
+
+class DivaDownloader:
+    """
+    A downloader for DIVA videos.
+    """
+
+    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy):
+        self._tmp_dir = tmp_dir
+        self._organizer = organizer
+        self._strategy = strategy
+        self._session = requests.session()
+
+    def download_all(self, infos: List[DivaDownloadInfo]) -> None:
+        """
+        Download multiple files one after the other.
+        """
+        for info in infos:
+            self.download(info)
+
+    def download(self, info: DivaDownloadInfo) -> None:
+        """
+        Download a single file.
+        """
+        if not self._strategy(self._organizer, info):
+            self._organizer.mark(info.path)
+            return
+
+        with self._session.get(info.url, stream=True) as response:
+            if response.status_code == 200:
+                tmp_file = self._tmp_dir.new_path()
+                stream_to_path(response, tmp_file)
+                self._organizer.accept_file(tmp_file, info.path)
+            else:
+                PRETTY.warning(f"Could not download file, got response {response.status_code}")
diff --git a/PFERD/pferd.py b/PFERD/pferd.py
index 327164b..62e9dd2 100644
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -7,6 +7,8 @@ from pathlib import Path
 from typing import List, Optional, Union
 
 from .cookie_jar import CookieJar
+from .diva import (DivaDownloader, DivaDownloadStrategy, DivaPlaylistCrawler,
+                   diva_download_new)
 from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter,
                     IliasDownloader, IliasDownloadStrategy,
                     KitShibbolethAuthenticator, download_modified_or_new)
@@ -59,7 +61,8 @@ class Pferd(Location):
             dir_filter: IliasDirectoryFilter,
             transform: Transform,
             download_strategy: IliasDownloadStrategy,
-    ) -> None:
+            clean: bool = True
+    ) -> Organizer:
         # pylint: disable=too-many-locals
         cookie_jar = CookieJar(to_path(cookies) if cookies else None)
         session = cookie_jar.create_session()
@@ -76,11 +79,15 @@ class Pferd(Location):
         transformed = apply_transform(transform, info)
         if self._test_run:
             self._print_transformables(transformed)
-            return
+            return organizer
 
         downloader.download_all(transformed)
         cookie_jar.save_cookies()
-        organizer.cleanup()
+
+        if clean:
+            organizer.cleanup()
+
+        return organizer
 
     def ilias_kit(
             self,
@@ -92,7 +99,8 @@ class Pferd(Location):
             username: Optional[str] = None,
             password: Optional[str] = None,
             download_strategy: IliasDownloadStrategy = download_modified_or_new,
-    ) -> None:
+            clean: bool = True,
+    ) -> Organizer:
         """
         Synchronizes a folder with the ILIAS instance of the KIT.
 
@@ -116,11 +124,12 @@ class Pferd(Location):
             download_strategy {DownloadStrategy} -- A function to determine which files need to
                 be downloaded. Can save bandwidth and reduce the number of requests.
                 (default: {download_modified_or_new})
+            clean {bool} -- Whether to clean up when the method finishes.
         """
         # This authenticator only works with the KIT ilias instance.
         authenticator = KitShibbolethAuthenticator(username=username, password=password)
         PRETTY.starting_synchronizer(target, "ILIAS", course_id)
-        self._ilias(
+        return self._ilias(
             target=target,
             base_url="https://ilias.studium.kit.edu/",
             course_id=course_id,
@@ -129,4 +138,51 @@ class Pferd(Location):
             dir_filter=dir_filter,
             transform=transform,
             download_strategy=download_strategy,
+            clean=clean,
         )
+
+    def diva_kit(
+            self,
+            target: Union[PathLike, Organizer],
+            playlist_id: str,
+            transform: Transform = lambda x: x,
+            download_strategy: DivaDownloadStrategy = diva_download_new,
+            clean: bool = True
+    ) -> Organizer:
+        """
+        Synchronizes a folder with a DIVA playlist.
+
+        Arguments:
+            target {Union[PathLike, Organizer]} -- The target path or the organizer to use.
+            playlist_id {str} -- the playlist id
+
+        Keyword Arguments:
+            transform {Transform} -- A transformation function for the output paths. Return None
+                to ignore a file. (default: {lambda x: x})
+            download_strategy {DivaDownloadStrategy} -- A function to determine which files need to
+                be downloaded. Can save bandwidth and reduce the number of requests.
+                (default: {diva_download_new})
+            clean {bool} -- Whether to clean up when the method finishes.
+        """
+        tmp_dir = self._tmp_dir.new_subdir()
+        if isinstance(target, Organizer):
+            organizer = target
+        else:
+            organizer = Organizer(self.resolve(to_path(target)))
+
+        crawler = DivaPlaylistCrawler(playlist_id)
+        downloader = DivaDownloader(tmp_dir, organizer, download_strategy)
+
+        info = crawler.crawl()
+
+        transformed = apply_transform(transform, info)
+        if self._test_run:
+            self._print_transformables(transformed)
+            return organizer
+
+        downloader.download_all(transformed)
+
+        if clean:
+            organizer.cleanup()
+
+        return organizer