From df3514cd0350fd6ef9231cadb236c930c99b89db Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 29 Aug 2023 12:30:54 +0200 Subject: [PATCH] Crawl paginated past meetings --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae809e3..3f318b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ ambiguous situations. - Abort crawling when encountering an unexpected ilias root page redirect - Remove size suffix for files in content pages - Sanitize ascii control characters on Windows +- Crawling of paginated past meetings ### Added - `no-delete-prompt-override` conflict resolution strategy diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index a8fcecb..5a94a0b 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -293,7 +293,10 @@ class IliasPage: return self._uncollapse_future_meetings_url() is not None def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: - element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x}) + element = self._soup.find( + "a", + attrs={"href": lambda x: x and ("crs_next_sess=1" in x or "crs_prev_sess=1" in x)} + ) if not element: return None link = self._abs_url_from_link(element) @@ -991,7 +994,11 @@ class IliasPage: if img_tag is None: img_tag = found_parent.select_one("img.icon") - if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}): + is_session_expansion_button = found_parent.find( + "a", + attrs={"href": lambda x: x and ("crs_next_sess=" in x or "crs_prev_sess=" in x)} + ) + if img_tag is None and is_session_expansion_button: log.explain("Found session expansion button, skipping it as it has no content") return None