From fa71a9f44fe11a367a396b0cd80b745fe7ef6fe8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 28 Oct 2024 20:15:55 +0100 Subject: [PATCH] Add support for mob videos in page descriptions --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/ilias_web_crawler.py | 16 ++++++++++--- PFERD/crawl/ilias/kit_ilias_html.py | 33 ++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e14f785..d9431bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Added +- Support for MOB videos in page descriptions + ### Changed - Remove videos from description pages diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index 08add07..73fed9c 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -389,6 +389,8 @@ instance's greatest bottleneck. return await self._handle_opencast_video(element, element_path) elif element.type == IliasElementType.MEDIACAST_VIDEO: return await self._handle_file(element, element_path) + elif element.type == IliasElementType.MOB_VIDEO: + return await self._handle_file(element, element_path, is_video=True) elif element.type in _DIRECTORY_PAGES: return await self._handle_ilias_page(element.url, element, element_path) else: @@ -631,18 +633,19 @@ instance's greatest bottleneck. self, element: IliasPageElement, element_path: PurePath, + is_video: bool = False, ) -> Optional[Coroutine[Any, Any, None]]: maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: return None - return self._download_file(element, maybe_dl) + return self._download_file(element, maybe_dl, is_video) @_iorepeat(3, "downloading file") @anoncritical - async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: + async def _download_file(self, element: IliasPageElement, dl: DownloadToken, is_video: bool) -> None: assert dl # The function is only reached when dl is not None async with dl as (bar, sink): - await self._stream_from_url(element.url, sink, bar, is_video=False) + await self._stream_from_url(element.url, sink, bar, is_video) async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: async def try_stream() -> bool: @@ -671,6 +674,13 @@ instance's greatest bottleneck. if is_video and "html" in resp.content_type: return False + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Range + if content_range := resp.headers.get(hdrs.CONTENT_RANGE, default=None): + parts = content_range.split("/") + if len(parts) == 2 and parts[1].isdigit(): + bar.set_total(int(parts[1])) + + # Prefer the content length header if resp.content_length: bar.set_total(resp.content_length) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 31107cf..e0c87ad 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -28,6 +28,7 @@ class IliasElementType(Enum): MEDIACAST_VIDEO = "mediacast_video" MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" MEETING = "meeting" + MOB_VIDEO = "mob_video" OPENCAST_VIDEO = "opencast_video" OPENCAST_VIDEO_FOLDER = "opencast_video_folder" OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED = "opencast_video_folder_maybe_paginated" @@ -745,6 +746,7 @@ class IliasPage: result += self._find_cards() result += self._find_mediacast_videos() + result += self._find_mob_videos() return result @@ -773,6 +775,37 @@ class IliasPage: return videos + def _find_mob_videos(self) -> List[IliasPageElement]: + videos: List[IliasPageElement] = [] + + for figure in self._soup.select("figure.ilc_media_cont_MediaContainerHighlighted"): + title = figure.select_one("figcaption").getText().strip() + ".mp4" + video_element = figure.select_one("video") + if not video_element: + _unexpected_html_warning() + log.warn_contd(f"No