Move video url extraction logic to crawler

This commit is contained in:
I-Al-Istannen 2020-05-30 00:14:08 +02:00
parent b969a1854a
commit 821c7ade26
2 changed files with 53 additions and 52 deletions

View File

@ -3,6 +3,7 @@ Contains an ILIAS crawler alongside helper functions.
""" """
import datetime import datetime
import json
import logging import logging
import re import re
from enum import Enum from enum import Enum
@ -380,22 +381,44 @@ class IliasCrawler:
video_path: Path = Path(parent_path, title) video_path: Path = Path(parent_path, title)
video_url = self._abs_url_from_link(link)
# The video had a direct download button we can use instead # The video had a direct download button we can use instead
if direct_download: if direct_download:
LOGGER.debug("Using direct download for video %r", str(video_path)) LOGGER.debug("Using direct download for video %r", str(video_path))
return [IliasDownloadInfo( return [IliasDownloadInfo(video_path, video_url, modification_time)]
video_path,
self._abs_url_from_link(link),
modification_time
)]
return [IliasDownloadInfo( return [IliasDownloadInfo(
video_path, video_path,
self._abs_url_from_link(link), self._crawl_video_url_from_play_link(video_url),
modification_time, modification_time
unpack_video=True
)] )]
def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Optional[str]]:
    """
    Build a lazy resolver for the real video stream URL behind an ILIAS
    "play" page.

    The returned zero-argument callable fetches and parses the page only
    when invoked (i.e. at download time, not at crawl time). It returns
    the direct mp4 stream URL, or None if the stream info could not be
    located on the page.
    """
    def inner() -> Optional[str]:
        # Fetch the actual video page. This is a small wrapper page initializing a javascript
        # player. Sadly we can not execute that JS. The actual video stream url is nowhere
        # on the page, but defined in a JS object inside a script tag, passed to the player
        # library.
        # We do the impossible and RegEx the stream JSON object out of the page's HTML source
        video_page_soup = soupify(self._session.get(play_url))
        # Capture the JSON object starting at {"streams"... up to (but not
        # including) the trailing {"paella_config_file ... player argument.
        regex: re.Pattern = re.compile(
            r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE
        )
        json_match = regex.search(str(video_page_soup))
        if json_match is None:
            # Best-effort: warn and signal "no URL" instead of raising.
            PRETTY.warning(f"Could not find json stream info for {play_url!r}")
            return None
        json_str = json_match.group(1)
        # parse it
        json_object = json.loads(json_str)
        # and fetch the video url!
        video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"]
        return video_url
    return inner
def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasDownloadInfo]: def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasDownloadInfo]:
""" """
Crawl files offered for download in exercises. Crawl files offered for download in exercises.

View File

@ -1,12 +1,9 @@
"""Contains a downloader for ILIAS.""" """Contains a downloader for ILIAS."""
import datetime import datetime
import json
import logging import logging
import re from pathlib import Path, PurePath
from dataclasses import dataclass from typing import Callable, List, Optional, Union
from pathlib import Path
from typing import Callable, List, Optional
import bs4 import bs4
import requests import requests
@ -26,15 +23,24 @@ class ContentTypeException(Exception):
"""Thrown when the content type of the ilias element can not be handled.""" """Thrown when the content type of the ilias element can not be handled."""
@dataclass
class IliasDownloadInfo(Transformable): class IliasDownloadInfo(Transformable):
""" """
This class describes a single file to be downloaded. This class describes a single file to be downloaded.
""" """
url: str def __init__(
modification_date: Optional[datetime.datetime] self,
unpack_video: bool = False path: PurePath,
url: Union[str, Callable[[], Optional[str]]],
modifcation_date: Optional[datetime.datetime]
):
super().__init__(path)
if isinstance(url, str):
string_url = url
self.url: Callable[[], Optional[str]] = lambda: string_url
else:
self.url = url
self.modification_date = modifcation_date
IliasDownloadStrategy = Callable[[Organizer, IliasDownloadInfo], bool] IliasDownloadStrategy = Callable[[Organizer, IliasDownloadInfo], bool]
@ -109,47 +115,19 @@ class IliasDownloader:
tmp_file = self._tmp_dir.new_path() tmp_file = self._tmp_dir.new_path()
while not self._download_file(info, tmp_file): while not self._try_download(info, tmp_file):
LOGGER.info("Retrying download: %r", info) LOGGER.info("Retrying download: %r", info)
self._authenticator.authenticate(self._session) self._authenticator.authenticate(self._session)
self._organizer.accept_file(tmp_file, info.path) self._organizer.accept_file(tmp_file, info.path)
def _download_file(self, info: IliasDownloadInfo, target: Path) -> bool:
if info.unpack_video:
return self._download_unpack_video(info, target)
return self._try_download(info, target)
def _download_unpack_video(self, info: IliasDownloadInfo, target: Path) -> bool:
# Fetch the actual video page. This is a small wrapper page initializing a javascript
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
# on the page, but defined in a JS object inside a script tag, passed to the player
# library.
# We do the impossible and RegEx the stream JSON object out of the page's HTML source
video_page_soup = soupify(self._session.get(info.url))
regex: re.Pattern = re.compile(
r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE
)
json_match = regex.search(str(video_page_soup))
if json_match is None:
PRETTY.warning(f"Could not find json stream info for {info.url!r}")
return True
json_str = json_match.group(1)
# parse it
json_object = json.loads(json_str)
# and fetch the video url!
video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"]
return self._try_download(
IliasDownloadInfo(info.path, video_url, info.modification_date),
target
)
def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool: def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool:
with self._session.get(info.url, stream=True) as response: url = info.url()
if url is None:
PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
return True
with self._session.get(url, stream=True) as response:
content_type = response.headers["content-type"] content_type = response.headers["content-type"]
if content_type.startswith("text/html"): if content_type.startswith("text/html"):