From b54b3b979c41204a51f0d7f02de7f55a0031ba3e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 27 Aug 2023 11:42:25 +0200 Subject: [PATCH] Remove size suffix for content pages --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d58ea18..0e93f01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ ambiguous situations. - Crawling of file and custom opencast cards - Crawling of button cards without descriptions - Abort crawling when encountering an unexpected ilias root page redirect +- Remove size suffix for files in content pages ### Added - `no-delete-prompt-override` conflict resolution strategy diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d5ea76d..c0807d3 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -377,7 +377,8 @@ class IliasPage: for link in links: url = self._abs_url_from_link(link) - name = _sanitize_path_name(link.getText().strip().replace("\t", "")) + name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.getText()).strip().replace("\t", "") + name = _sanitize_path_name(name) if "file_id" not in url: _unexpected_html_warning()