Crawl paginated past meetings

This commit is contained in:
I-Al-Istannen 2023-08-29 12:30:54 +02:00
parent ad53185247
commit df3514cd03
2 changed files with 10 additions and 2 deletions

View File

@ -29,6 +29,7 @@ ambiguous situations.
- Abort crawling when encountering an unexpected ilias root page redirect
- Remove size suffix for files in content pages
- Sanitize ascii control characters on Windows
- Crawling of paginated past meetings
### Added
- `no-delete-prompt-override` conflict resolution strategy

View File

@ -293,7 +293,10 @@ class IliasPage:
return self._uncollapse_future_meetings_url() is not None return self._uncollapse_future_meetings_url() is not None
def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]:
element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x}) element = self._soup.find(
"a",
attrs={"href": lambda x: x and ("crs_next_sess=1" in x or "crs_prev_sess=1" in x)}
)
if not element: if not element:
return None return None
link = self._abs_url_from_link(element) link = self._abs_url_from_link(element)
@ -991,7 +994,11 @@ class IliasPage:
if img_tag is None: if img_tag is None:
img_tag = found_parent.select_one("img.icon") img_tag = found_parent.select_one("img.icon")
if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}): is_session_expansion_button = found_parent.find(
"a",
attrs={"href": lambda x: x and ("crs_next_sess=" in x or "crs_prev_sess=" in x)}
)
if img_tag is None and is_session_expansion_button:
log.explain("Found session expansion button, skipping it as it has no content") log.explain("Found session expansion button, skipping it as it has no content")
return None return None