diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bc6f06..f635719 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Changed +- Remove videos from description pages + ### Fixed - Personal desktop/dashboard/favorites crawling - Crawling of nested courses diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py index 5495304..0075784 100644 --- a/PFERD/crawl/ilias/ilias_html_cleaner.py +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -85,6 +85,11 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: if isinstance(type(children[0]), Comment): dummy.decompose() + # Delete video figures, as they cannot be internalized anyway + for video in soup.select(".ilc_media_cont_MediaContainerHighlighted .ilPageVideo"): + if figure := video.find_parent("figure"): + figure.decompose() + for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): hrule_imposter.insert(0, soup.new_tag("hr"))