Add compatibility with ILIAS 8

2026-01-31 06:32:24 +01:00 · 2024-04-05 19:06:54 +02:00
parent ab0cb2d956
commit eb0c956d32
3 changed files with 46 additions and 40 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ ambiguous situations.

 ### Fixed
 - Video name deduplication
+- Compatibility with ILIAS 8

 ## 3.5.0 - 2023-09-13

--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -95,13 +95,9 @@ class IliasPage:

    @staticmethod
    def is_root_page(soup: BeautifulSoup) -> bool:
-        permalink = soup.find(id="current_perma_link")
-        if permalink is None:
+        if permalink := IliasPage.get_soup_permalink(soup):
+            return "goto.php?target=root_" in permalink
        return False
-        value = permalink.attrs.get("value")
-        if value is None:
-            return False
-        return "goto.php?target=root_" in value

    def get_child_elements(self) -> List[IliasPageElement]:
        """
@@ -279,16 +275,14 @@ class IliasPage:
        return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})

    def _is_content_page(self) -> bool:
-        link = self._soup.find(id="current_perma_link")
-        if not link:
+        if link := self.get_permalink():
+            return "target=copa_" in link
        return False
-        return "target=copa_" in link.get("value")

    def _is_learning_module_page(self) -> bool:
-        link = self._soup.find(id="current_perma_link")
-        if not link:
+        if link := self.get_permalink():
+            return "target=pg_" in link
        return False
-        return "target=pg_" in link.get("value")

    def _contains_collapsed_future_meetings(self) -> bool:
        return self._uncollapse_future_meetings_url() is not None
@@ -513,8 +507,8 @@ class IliasPage:
            modification_string = link.parent.parent.parent.select_one(
                f"td.std:nth-child({index})"
            ).getText().strip()
-            if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
-                modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
+            if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
+                modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
                break

        if modification_time is None:
@@ -613,7 +607,7 @@ class IliasPage:
            file_listings: List[Tag] = container.findAll(
                name="a",
                # download links contain the given command class
-                attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
+                attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
            )

            # Add each listing as a new
@@ -1095,6 +1089,9 @@ class IliasPage:
            return True
        return False

+    def get_permalink(self) -> Optional[str]:
+        return IliasPage.get_soup_permalink(self._soup)
+
    def _abs_url_from_link(self, link_tag: Tag) -> str:
        """
        Create an absolute url from an <a> tag.
@@ -1107,6 +1104,13 @@ class IliasPage:
        """
        return urljoin(self._page_url, relative_url)

+    @staticmethod
+    def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
+        perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
+        if not perma_link_element or not perma_link_element.get("href"):
+            return None
+        return perma_link_element.get("href")
+

 def _unexpected_html_warning() -> None:
    log.warn("Encountered unexpected HTML structure, ignoring element.")
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -12,17 +12,17 @@ import yarl
 from aiohttp import hdrs
 from bs4 import BeautifulSoup, Tag

+from .file_templates import Links, learning_module_template
+from .ilias_html_cleaner import clean, insert_base_markup
+from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
+                             IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)
+from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
+from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from ...auth import Authenticator, TfaAuthenticator
 from ...config import Config
 from ...logging import ProgressBar, log
 from ...output_dir import FileSink, Redownload
 from ...utils import fmt_path, soupify, url_set_query_param
-from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
-from ..http_crawler import HttpCrawler, HttpCrawlerSection
-from .file_templates import Links, learning_module_template
-from .ilias_html_cleaner import clean, insert_base_markup
-from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
-                             IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)

 TargetType = Union[str, int]

@@ -130,6 +130,7 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
            raise CrawlError("Impossible return in ilias _iorepeat")

        return wrapper  # type: ignore
+
    return decorator


@@ -253,8 +254,8 @@ instance's greatest bottleneck.
                    soup = await self._get_page(next_stage_url, root_page_allowed=True)

                    if current_parent is None and expected_id is not None:
-                        perma_link_element: Tag = soup.find(id="current_perma_link")
-                        if not perma_link_element or "crs_" not in perma_link_element.get("value"):
+                        perma_link = IliasPage.get_soup_permalink(soup)
+                        if not perma_link or "crs_" not in perma_link:
                            raise CrawlError("Invalid course id? Didn't find anything looking like a course")

                    log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
@@ -677,7 +678,7 @@ instance's greatest bottleneck.
            async with self.session.get(url, allow_redirects=is_video) as resp:
                if not is_video:
                    # Redirect means we weren't authenticated
-                    if hdrs.LOCATION in resp.headers:
+                    if hdrs.LOCATION in resp.headers and "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
                        return False
                # we wanted a video but got HTML
                if is_video and "html" in resp.content_type: