From 266812f90ea7b33e2cd195ee6d34dc2ba53c4926 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 16 Nov 2023 10:34:49 +0100 Subject: [PATCH] Move is_logged_in helper to kit_ilias_html --- PFERD/crawl/ilias/kit_ilias_html.py | 28 +++++++++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 35 +++------------------- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 2c37816..d23141f 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -1067,6 +1067,34 @@ class IliasPage: rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name + @staticmethod + def is_logged_in(soup: BeautifulSoup) -> bool: + # Normal ILIAS pages + mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") + if mainbar is not None: + login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) + shib_login = soup.find(id="button_shib_login") + return not login_button and not shib_login + + # Personal Desktop + if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): + return True + + # Video listing embeds do not have complete ILIAS html. Try to match them by + # their video listing table + video_table = soup.find( + recursive=True, + name="table", + attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} + ) + if video_table is not None: + return True + # The individual video player wrapper page has nothing of the above. + # Match it by its playerContainer. + if soup.select_one("#playerContainer") is not None: + return True + return False + def _abs_url_from_link(self, link_tag: Tag) -> str: """ Create an absolute url from an tag. diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index d5f6809..94b7b9e 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -894,7 +894,7 @@ instance's greatest bottleneck. auth_id = await self._current_auth_id() async with self.session.get(url) as request: soup = soupify(await request.read()) - if self._is_logged_in(soup): + if IliasPage.is_logged_in(soup): return self._verify_page(soup, url, root_page_allowed) # We weren't authenticated, so try to do that @@ -903,11 +903,12 @@ instance's greatest bottleneck. # Retry once after authenticating. If this fails, we will die. async with self.session.get(url) as request: soup = soupify(await request.read()) - if self._is_logged_in(soup): + if IliasPage.is_logged_in(soup): return self._verify_page(soup, url, root_page_allowed) raise CrawlError(f"get_page failed even after authenticating on {url!r}") - def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: + @staticmethod + def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: if IliasPage.is_root_page(soup) and not root_page_allowed: raise CrawlError( "Unexpectedly encountered ILIAS root page. " @@ -965,34 +966,6 @@ instance's greatest bottleneck. async def _authenticate(self) -> None: await self._shibboleth_login.login(self.session) - @ staticmethod - def _is_logged_in(soup: BeautifulSoup) -> bool: - # Normal ILIAS pages - mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") - if mainbar is not None: - login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) - shib_login = soup.find(id="button_shib_login") - return not login_button and not shib_login - - # Personal Desktop - if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): - return True - - # Video listing embeds do not have complete ILIAS html. Try to match them by - # their video listing table - video_table = soup.find( - recursive=True, - name="table", - attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} - ) - if video_table is not None: - return True - # The individual video player wrapper page has nothing of the above. - # Match it by its playerContainer. - if soup.select_one("#playerContainer") is not None: - return True - return False - class KitShibbolethLogin: """