Detect unexpected root page redirects and abort operation

2025-07-20 01:42:37 +02:00 · 2023-06-02 18:19:39 +02:00
parent 443f7fe839
commit d204dac8ce
3 changed files with 27 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@ ambiguous situations.
 - Crawling of courses with the timeline view as the default tab
 - Crawling of file and custom opencast cards
 - Crawling of button cards without descriptions
+- Abort crawling when encountering an unexpected ILIAS root page redirect

 ### Added
 - `no-delete-prompt-override` conflict resolution strategy
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -79,6 +79,16 @@ class IliasPage:
        self._page_type = source_element.type if source_element else None
        self._source_name = source_element.name if source_element else ""

+    @staticmethod
+    def is_root_page(soup: BeautifulSoup) -> bool:
+        permalink = soup.find(id="current_perma_link")
+        if permalink is None:
+            return False
+        value = permalink.attrs.get("value")
+        if value is None:
+            return False
+        return "goto.php?target=root_" in value
+
    def get_child_elements(self) -> List[IliasPageElement]:
        """
        Return all child page elements you can find here.
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -239,7 +239,7 @@ instance's greatest bottleneck.

                # Duplicated code, but the root page is special - we want to avoid fetching it twice!
                while next_stage_url:
-                    soup = await self._get_page(next_stage_url)
+                    soup = await self._get_page(next_stage_url, root_page_allowed=True)

                    if current_parent is None and expected_id is not None:
                        perma_link_element: Tag = soup.find(id="current_perma_link")
@@ -739,12 +739,12 @@ instance's greatest bottleneck.
            sink.file.write(content.encode("utf-8"))
            sink.done()

-    async def _get_page(self, url: str) -> BeautifulSoup:
+    async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
        auth_id = await self._current_auth_id()
        async with self.session.get(url) as request:
            soup = soupify(await request.read())
            if self._is_logged_in(soup):
-                return soup
+                return self._verify_page(soup, url, root_page_allowed)

        # We weren't authenticated, so try to do that
        await self.authenticate(auth_id)
@@ -753,9 +753,21 @@ instance's greatest bottleneck.
        async with self.session.get(url) as request:
            soup = soupify(await request.read())
            if self._is_logged_in(soup):
-                return soup
+                return self._verify_page(soup, url, root_page_allowed)
        raise CrawlError("get_page failed even after authenticating")

+    def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
+        if IliasPage.is_root_page(soup) and not root_page_allowed:
+            raise CrawlError(
+                "Unexpectedly encountered ILIAS root page. "
+                "This usually happens because the ILIAS instance is broken. "
+                "If so, wait a day or two and try again. "
+                "It could also happen because a crawled element links to the ILIAS root page. "
+                "If so, use a transform with a ! as target to ignore the particular element. "
+                f"The redirect came from {url}"
+            )
+        return soup
+
    async def _post_authenticated(
        self,
        url: str,