From d204dac8ced63534ca2b4596e9a63c880b2077a3 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen <i-al-istannen@users.noreply.github.com>
Date: Fri, 2 Jun 2023 18:19:39 +0200
Subject: [PATCH] Detect unexpected root page redirects and abort operation

---
 CHANGELOG.md                               |  1 +
 PFERD/crawl/ilias/kit_ilias_html.py        | 10 ++++++++++
 PFERD/crawl/ilias/kit_ilias_web_crawler.py | 20 ++++++++++++++++----
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 22522e2..ee55659 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@ ambiguous situations.
 - Crawling of courses with the timeline view as the default tab
 - Crawling of file and custom opencast cards
 - Crawling of button cards without descriptions
+- Abort crawling when encountering an unexpected ilias root page redirect
 
 ### Added
 - `no-delete-prompt-override` conflict resolution strategy
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index efe6757..aed2069 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -79,6 +79,16 @@ class IliasPage:
         self._page_type = source_element.type if source_element else None
         self._source_name = source_element.name if source_element else ""
 
+    @staticmethod
+    def is_root_page(soup: BeautifulSoup) -> bool:
+        permalink = soup.find(id="current_perma_link")
+        if permalink is None:
+            return False
+        value = permalink.attrs.get("value")
+        if value is None:
+            return False
+        return "goto.php?target=root_" in value
+
     def get_child_elements(self) -> List[IliasPageElement]:
         """
         Return all child page elements you can find here.
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index e3719b8..ae49edc 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -239,7 +239,7 @@ instance's greatest bottleneck.
 
                 # Duplicated code, but the root page is special - we want to avoid fetching it twice!
                 while next_stage_url:
-                    soup = await self._get_page(next_stage_url)
+                    soup = await self._get_page(next_stage_url, root_page_allowed=True)
 
                     if current_parent is None and expected_id is not None:
                         perma_link_element: Tag = soup.find(id="current_perma_link")
@@ -739,12 +739,12 @@ instance's greatest bottleneck.
             sink.file.write(content.encode("utf-8"))
             sink.done()
 
-    async def _get_page(self, url: str) -> BeautifulSoup:
+    async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
         auth_id = await self._current_auth_id()
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
-                return soup
+                return self._verify_page(soup, url, root_page_allowed)
 
         # We weren't authenticated, so try to do that
         await self.authenticate(auth_id)
@@ -753,9 +753,21 @@ instance's greatest bottleneck.
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
-                return soup
+                return self._verify_page(soup, url, root_page_allowed)
         raise CrawlError("get_page failed even after authenticating")
 
+    def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
+        if IliasPage.is_root_page(soup) and not root_page_allowed:
+            raise CrawlError(
+                "Unexpectedly encountered ILIAS root page. "
+                "This usually happens because the ILIAS instance is broken. "
+                "If so, wait a day or two and try again. "
+                "It could also happen because a crawled element links to the ILIAS root page. "
+                "If so, use a transform with a ! as target to ignore the particular element. "
+                f"The redirect came from {url}"
+            )
+        return soup
+
     async def _post_authenticated(
         self,
         url: str,