From d204dac8ced63534ca2b4596e9a63c880b2077a3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 2 Jun 2023 18:19:39 +0200 Subject: [PATCH] Detect unexpected root page redirects and abort operation --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 10 ++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 20 ++++++++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22522e2..ee55659 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ ambiguous situations. - Crawling of courses with the timeline view as the default tab - Crawling of file and custom opencast cards - Crawling of button cards without descriptions +- Abort crawling when encountering an unexpected ILIAS root page redirect ### Added - `no-delete-prompt-override` conflict resolution strategy diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index efe6757..aed2069 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -79,6 +79,16 @@ class IliasPage: self._page_type = source_element.type if source_element else None self._source_name = source_element.name if source_element else "" + @staticmethod + def is_root_page(soup: BeautifulSoup) -> bool: + permalink = soup.find(id="current_perma_link") + if permalink is None: + return False + value = permalink.attrs.get("value") + if value is None: + return False + return "goto.php?target=root_" in value + def get_child_elements(self) -> List[IliasPageElement]: """ Return all child page elements you can find here. diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index e3719b8..ae49edc 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -239,7 +239,7 @@ instance's greatest bottleneck. # Duplicated code, but the root page is special - we want to avoid fetching it twice! 
while next_stage_url: - soup = await self._get_page(next_stage_url) + soup = await self._get_page(next_stage_url, root_page_allowed=True) if current_parent is None and expected_id is not None: perma_link_element: Tag = soup.find(id="current_perma_link") @@ -739,12 +739,12 @@ instance's greatest bottleneck. sink.file.write(content.encode("utf-8")) sink.done() - async def _get_page(self, url: str) -> BeautifulSoup: + async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: auth_id = await self._current_auth_id() async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): - return soup + return self._verify_page(soup, url, root_page_allowed) # We weren't authenticated, so try to do that await self.authenticate(auth_id) @@ -753,9 +753,21 @@ instance's greatest bottleneck. async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): - return soup + return self._verify_page(soup, url, root_page_allowed) raise CrawlError("get_page failed even after authenticating") + def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: + if IliasPage.is_root_page(soup) and not root_page_allowed: + raise CrawlError( + "Unexpectedly encountered ILIAS root page. " + "This usually happens because the ILIAS instance is broken. " + "If so, wait a day or two and try again. " + "It could also happen because a crawled element links to the ILIAS root page. " + "If so, use a transform with a ! as target to ignore the particular element. " + f"The redirect came from {url}" + ) + return soup + async def _post_authenticated( self, url: str,