From d7a2b6e019a994a9e18e00cffe14da2db763e025 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 28 Oct 2024 19:32:16 +0100 Subject: [PATCH] Delete videos from course descriptions --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/ilias_html_cleaner.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bc6f06..f635719 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Changed +- Remove videos from description pages + ### Fixed - Personal desktop/dashboard/favorites crawling - Crawling of nested courses diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py index 5495304..0075784 100644 --- a/PFERD/crawl/ilias/ilias_html_cleaner.py +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -85,6 +85,11 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: if isinstance(type(children[0]), Comment): dummy.decompose() + # Delete video figures, as they cannot be internalized anyway + for video in soup.select(".ilc_media_cont_MediaContainerHighlighted .ilPageVideo"): + if figure := video.find_parent("figure"): + figure.decompose() + for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): hrule_imposter.insert(0, soup.new_tag("hr"))