mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-10-25 02:52:31 +02:00 
			
		
		
		
	Move video url extraction logic to crawler
This commit is contained in:
		| @@ -1,12 +1,9 @@ | ||||
| """Contains a downloader for ILIAS.""" | ||||
|  | ||||
| import datetime | ||||
| import json | ||||
| import logging | ||||
| import re | ||||
| from dataclasses import dataclass | ||||
| from pathlib import Path | ||||
| from typing import Callable, List, Optional | ||||
| from pathlib import Path, PurePath | ||||
| from typing import Callable, List, Optional, Union | ||||
|  | ||||
| import bs4 | ||||
| import requests | ||||
| @@ -26,15 +23,24 @@ class ContentTypeException(Exception): | ||||
|     """Thrown when the content type of the ilias element can not be handled.""" | ||||
|  | ||||
|  | ||||
| @dataclass | ||||
| class IliasDownloadInfo(Transformable): | ||||
|     """ | ||||
|     This class describes a single file to be downloaded. | ||||
|     """ | ||||
|  | ||||
|     url: str | ||||
|     modification_date: Optional[datetime.datetime] | ||||
|     unpack_video: bool = False | ||||
|     def __init__( | ||||
|             self, | ||||
|             path: PurePath, | ||||
|             url: Union[str, Callable[[], Optional[str]]], | ||||
|             modifcation_date: Optional[datetime.datetime] | ||||
|     ): | ||||
|         super().__init__(path) | ||||
|         if isinstance(url, str): | ||||
|             string_url = url | ||||
|             self.url: Callable[[], Optional[str]] = lambda: string_url | ||||
|         else: | ||||
|             self.url = url | ||||
|         self.modification_date = modifcation_date | ||||
|  | ||||
|  | ||||
| IliasDownloadStrategy = Callable[[Organizer, IliasDownloadInfo], bool] | ||||
| @@ -109,47 +115,19 @@ class IliasDownloader: | ||||
|  | ||||
|         tmp_file = self._tmp_dir.new_path() | ||||
|  | ||||
|         while not self._download_file(info, tmp_file): | ||||
|         while not self._try_download(info, tmp_file): | ||||
|             LOGGER.info("Retrying download: %r", info) | ||||
|             self._authenticator.authenticate(self._session) | ||||
|  | ||||
|         self._organizer.accept_file(tmp_file, info.path) | ||||
|  | ||||
|     def _download_file(self, info: IliasDownloadInfo, target: Path) -> bool: | ||||
|         if info.unpack_video: | ||||
|             return self._download_unpack_video(info, target) | ||||
|  | ||||
|         return self._try_download(info, target) | ||||
|  | ||||
|     def _download_unpack_video(self, info: IliasDownloadInfo, target: Path) -> bool: | ||||
|         # Fetch the actual video page. This is a small wrapper page initializing a javascript | ||||
|         # player. Sadly we can not execute that JS. The actual video stream url is nowhere | ||||
|         # on the page, but defined in a JS object inside a script tag, passed to the player | ||||
|         # library. | ||||
|         # We do the impossible and RegEx the stream JSON object out of the page's HTML source | ||||
|         video_page_soup = soupify(self._session.get(info.url)) | ||||
|         regex: re.Pattern = re.compile( | ||||
|             r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE | ||||
|         ) | ||||
|         json_match = regex.search(str(video_page_soup)) | ||||
|  | ||||
|         if json_match is None: | ||||
|             PRETTY.warning(f"Could not find json stream info for {info.url!r}") | ||||
|             return True | ||||
|         json_str = json_match.group(1) | ||||
|  | ||||
|         # parse it | ||||
|         json_object = json.loads(json_str) | ||||
|         # and fetch the video url! | ||||
|         video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"] | ||||
|  | ||||
|         return self._try_download( | ||||
|             IliasDownloadInfo(info.path, video_url, info.modification_date), | ||||
|             target | ||||
|         ) | ||||
|  | ||||
|     def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool: | ||||
|         with self._session.get(info.url, stream=True) as response: | ||||
|         url = info.url() | ||||
|         if url is None: | ||||
|             PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/") | ||||
|             return True | ||||
|  | ||||
|         with self._session.get(url, stream=True) as response: | ||||
|             content_type = response.headers["content-type"] | ||||
|  | ||||
|             if content_type.startswith("text/html"): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 I-Al-Istannen
					I-Al-Istannen