From f6bdeb6b9db6252681aa7c0468f49bcdd6392697 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen <i-al-istannen@users.noreply.github.com>
Date: Sat, 12 Apr 2025 14:54:58 +0200
Subject: [PATCH] Support ILIAS 9

---
 CHANGELOG.md                           |   3 +
 PFERD/crawl/ilias/ilias_web_crawler.py | 126 +++--
 PFERD/crawl/ilias/kit_ilias_html.py    | 666 ++++++++++++++++---------
 3 files changed, 511 insertions(+), 284 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae82e4f..0625a0e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,9 @@ ambiguous situations.
 
 ## Unreleased
 
+### Added
+- Support for ILIAS 9
+
 ### Changed
 - Added prettier CSS to forum threads
 
diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py
index add49ee..43c8a0b 100644
--- a/PFERD/crawl/ilias/ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/ilias_web_crawler.py
@@ -22,7 +22,7 @@ from .async_helper import _iorepeat
 from .file_templates import Links, forum_thread_template, learning_module_template
 from .ilias_html_cleaner import clean, insert_base_markup
 from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
-                             IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)
+                             IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
 from .shibboleth_login import ShibbolethLogin
 
 TargetType = Union[str, int]
@@ -105,7 +105,6 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
 
 
 _DIRECTORY_PAGES: Set[IliasElementType] = {
-    IliasElementType.COURSE,
     IliasElementType.EXERCISE,
     IliasElementType.EXERCISE_FILES,
     IliasElementType.FOLDER,
@@ -267,12 +266,12 @@ instance's greatest bottleneck.
                     # If we expect to find a root course, enforce it
                     if current_parent is None and expected_course_id is not None:
                         perma_link = IliasPage.get_soup_permalink(soup)
-                        if not perma_link or "crs_" not in perma_link:
+                        if not perma_link or "crs/" not in perma_link:
                             raise CrawlError("Invalid course id? Didn't find anything looking like a course")
                         if str(expected_course_id) not in perma_link:
                             raise CrawlError(f"Expected course id {expected_course_id} but got {perma_link}")
 
-                    page = IliasPage(soup, next_stage_url, current_parent)
+                    page = IliasPage(soup, current_parent)
                     if next_element := page.get_next_stage_element():
                         current_parent = next_element
                         next_stage_url = next_element.url
@@ -362,6 +361,54 @@ instance's greatest bottleneck.
                 "[bright_black](scorm learning modules are not supported)"
             )
             return None
+        elif element.type == IliasElementType.LITERATURE_LIST:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](literature lists are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.LEARNING_MODULE_HTML:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](HTML learning modules are not supported)"
+            )
+            return None
+        elif element.type == IliasElementType.BLOG:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](blogs are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.DCL_RECORD_LIST:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](dcl record lists are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.MEDIA_POOL:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](media pools are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.COURSE:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](not descending into linked course, download it separately)"
+            )
+            return None
         elif element.type == IliasElementType.LEARNING_MODULE:
             return await self._handle_learning_module(element, element_path)
         elif element.type == IliasElementType.LINK:
@@ -590,7 +637,7 @@ instance's greatest bottleneck.
             )
 
         async with dl as (bar, sink):
-            page = IliasPage(await self._get_page(element.url), element.url, element)
+            page = IliasPage(await self._get_page(element.url), element)
             stream_elements = page.get_child_elements()
 
             if len(stream_elements) > 1:
@@ -600,7 +647,7 @@ instance's greatest bottleneck.
                 stream_element = stream_elements[0]
 
                 # We do not have a local cache yet
-                await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
+                await self._stream_from_url(stream_element, sink, bar, is_video=True)
                 add_to_report([str(self._transformer.transform(dl.path))])
                 return
 
@@ -615,7 +662,7 @@ instance's greatest bottleneck.
             async with maybe_dl as (bar, sink):
                 log.explain(f"Streaming video from real url {stream_element.url}")
                 contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
-                await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
+                await self._stream_from_url(stream_element, sink, bar, is_video=True)
 
         add_to_report(contained_video_paths)
 
@@ -637,12 +684,19 @@ instance's greatest bottleneck.
     async def _download_file(self, element: IliasPageElement, dl: DownloadToken, is_video: bool) -> None:
         assert dl  # The function is only reached when dl is not None
         async with dl as (bar, sink):
-            await self._stream_from_url(element.url, sink, bar, is_video)
+            await self._stream_from_url(element, sink, bar, is_video)
+
+    async def _stream_from_url(
+        self,
+        element: IliasPageElement,
+        sink: FileSink,
+        bar: ProgressBar,
+        is_video: bool
+    ) -> None:
+        url = element.url
 
-    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
         async def try_stream() -> bool:
             next_url = url
-
             # Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
             # we can not match on the content type here. Instead, we disallow redirects and inspect the
             # new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
@@ -690,7 +744,7 @@ instance's greatest bottleneck.
         await self.authenticate(auth_id)
 
         if not await try_stream():
-            raise CrawlError("File streaming failed after authenticate()")
+            raise CrawlError(f"File streaming failed after authenticate() {element!r}")
 
     async def _handle_forum(
         self,
@@ -716,7 +770,7 @@ instance's greatest bottleneck.
                 log.explain(f"URL: {next_stage_url}")
 
                 soup = await self._get_page(next_stage_url)
-                page = IliasPage(soup, next_stage_url, element)
+                page = IliasPage(soup, element)
 
                 if next := page.get_next_stage_element():
                     next_stage_url = next.url
@@ -817,7 +871,7 @@ instance's greatest bottleneck.
             log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
             log.explain(f"URL: {element.url}")
             soup = await self._get_page(element.url)
-            page = IliasPage(soup, element.url, element)
+            page = IliasPage(soup, element)
             if next := page.get_learning_module_data():
                 elements.extend(await self._crawl_learning_module_direction(
                     cl.path, next.previous_url, "left", element
@@ -860,7 +914,7 @@ instance's greatest bottleneck.
             log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
             log.explain(f"URL: {next_element_url}")
             soup = await self._get_page(next_element_url)
-            page = IliasPage(soup, next_element_url, parent_element)
+            page = IliasPage(soup, parent_element)
             if next := page.get_learning_module_data():
                 elements.append(next)
                 if dir == "left":
@@ -891,13 +945,13 @@ instance's greatest bottleneck.
         if prev:
             prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
             if prev_p:
-                prev = os.path.relpath(prev_p, my_path.parent)
+                prev = cast(str, os.path.relpath(prev_p, my_path.parent))
             else:
                 prev = None
         if next:
             next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
             if next_p:
-                next = os.path.relpath(next_p, my_path.parent)
+                next = cast(str, os.path.relpath(next_p, my_path.parent))
             else:
                 next = None
 
@@ -937,10 +991,10 @@ instance's greatest bottleneck.
             )
         self._visited_urls[element.url] = parent_path
 
-    async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
+    async def _get_page(self, url: str, root_page_allowed: bool = False) -> IliasSoup:
         auth_id = await self._current_auth_id()
         async with self.session.get(url) as request:
-            soup = soupify(await request.read())
+            soup = IliasSoup(soupify(await request.read()), str(request.url))
             if IliasPage.is_logged_in(soup):
                 return self._verify_page(soup, url, root_page_allowed)
 
@@ -949,13 +1003,13 @@ instance's greatest bottleneck.
 
         # Retry once after authenticating. If this fails, we will die.
         async with self.session.get(url) as request:
-            soup = soupify(await request.read())
+            soup = IliasSoup(soupify(await request.read()), str(request.url))
             if IliasPage.is_logged_in(soup):
                 return self._verify_page(soup, url, root_page_allowed)
         raise CrawlError(f"get_page failed even after authenticating on {url!r}")
 
     @staticmethod
-    def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
+    def _verify_page(soup: IliasSoup, url: str, root_page_allowed: bool) -> IliasSoup:
         if IliasPage.is_root_page(soup) and not root_page_allowed:
             raise CrawlError(
                 "Unexpectedly encountered ILIAS root page. "
@@ -1037,34 +1091,6 @@ instance's greatest bottleneck.
 
             # do the actual login
             async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request:
-                soup = soupify(await request.read())
-                if not self._is_logged_in(soup):
+                soup = IliasSoup(soupify(await request.read()), str(request.url))
+                if not IliasPage.is_logged_in(soup):
                     self._auth.invalidate_credentials()
-
-    @staticmethod
-    def _is_logged_in(soup: BeautifulSoup) -> bool:
-        # Normal ILIAS pages
-        mainbar = cast(Optional[Tag], soup.find(class_="il-maincontrols-metabar"))
-        if mainbar is not None:
-            login_button = mainbar.find(attrs={"href": lambda x: x is not None and "login.php" in x})
-            shib_login = soup.find(id="button_shib_login")
-            return not login_button and not shib_login
-
-        # Personal Desktop
-        if soup.find("a", attrs={"href": lambda x: x is not None and "block_type=pditems" in x}):
-            return True
-
-        # Video listing embeds do not have complete ILIAS html. Try to match them by
-        # their video listing table
-        video_table = soup.find(
-            recursive=True,
-            name="table",
-            attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
-        )
-        if video_table is not None:
-            return True
-        # The individual video player wrapper page has nothing of the above.
-        # Match it by its playerContainer.
-        if soup.select_one("#playerContainer") is not None:
-            return True
-        return False
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index 963ab05..0f985b2 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -3,20 +3,100 @@ import re
 from dataclasses import dataclass
 from datetime import date, datetime, timedelta
 from enum import Enum
-from typing import Dict, Optional, Union, cast
+from typing import Callable, Dict, Optional, Union, cast
 from urllib.parse import urljoin, urlparse
 
 from bs4 import BeautifulSoup, Tag
 
+from PFERD.crawl import CrawlError
+from PFERD.crawl.crawler import CrawlWarning
 from PFERD.logging import log
 from PFERD.utils import url_set_query_params
 
 TargetType = Union[str, int]
 
 
+class TypeMatcher:
+    class UrlPath:
+        path: str
+
+        def __init__(self, path: str):
+            self.path = path
+
+    class UrlParameter:
+        query: str
+
+        def __init__(self, query: str):
+            self.query = query
+
+    class ImgSrc:
+        src: str
+
+        def __init__(self, src: str):
+            self.src = src
+
+    class ImgAlt:
+        alt: str
+
+        def __init__(self, alt: str):
+            self.alt = alt
+
+    class All:
+        matchers: list['IliasElementMatcher']
+
+        def __init__(self, matchers: list['IliasElementMatcher']):
+            self.matchers = matchers
+
+    class Any:
+        matchers: list['IliasElementMatcher']
+
+        def __init__(self, matchers: list['IliasElementMatcher']):
+            self.matchers = matchers
+
+    @staticmethod
+    def path(path: str) -> UrlPath:
+        return TypeMatcher.UrlPath(path)
+
+    @staticmethod
+    def query(query: str) -> UrlParameter:
+        return TypeMatcher.UrlParameter(query)
+
+    @staticmethod
+    def img_src(src: str) -> ImgSrc:
+        return TypeMatcher.ImgSrc(src)
+
+    @staticmethod
+    def img_alt(alt: str) -> ImgAlt:
+        return TypeMatcher.ImgAlt(alt)
+
+    @staticmethod
+    def all(*matchers: 'IliasElementMatcher') -> All:
+        return TypeMatcher.All(list(matchers))
+
+    @staticmethod
+    def any(*matchers: 'IliasElementMatcher') -> Any:
+        return TypeMatcher.Any(list(matchers))
+
+    @staticmethod
+    def never() -> Any:
+        return TypeMatcher.Any([])
+
+
+IliasElementMatcher = (
+    TypeMatcher.UrlPath
+    | TypeMatcher.UrlParameter
+    | TypeMatcher.ImgSrc
+    | TypeMatcher.ImgAlt
+    | TypeMatcher.All
+    | TypeMatcher.Any
+)
+
+
 class IliasElementType(Enum):
+    BLOG = "blog"
     BOOKING = "booking"
     COURSE = "course"
+    DCL_RECORD_LIST = "dcl_record_list"
     EXERCISE = "exercise"
     EXERCISE_FILES = "exercise_files"  # own submitted files
     FILE = "file"
@@ -25,7 +105,10 @@ class IliasElementType(Enum):
     FORUM_THREAD = "forum_thread"
     INFO_TAB = "info_tab"
     LEARNING_MODULE = "learning_module"
+    LEARNING_MODULE_HTML = "learning_module_html"
+    LITERATURE_LIST = "literature_list"
     LINK = "link"
+    MEDIA_POOL = "media_pool"
     MEDIACAST_VIDEO = "mediacast_video"
     MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
     MEETING = "meeting"
@@ -38,6 +121,131 @@ class IliasElementType(Enum):
     SURVEY = "survey"
     TEST = "test"  # an online test. Will be ignored currently.
 
+    def matcher(self) -> IliasElementMatcher:
+        match self:
+            case IliasElementType.BLOG:
+                return TypeMatcher.any(
+                    TypeMatcher.img_src("_blog.svg")
+                )
+            case IliasElementType.BOOKING:
+                return TypeMatcher.any(
+                    TypeMatcher.path("/book/"),
+                    TypeMatcher.img_src("_book.svg")
+                )
+            case IliasElementType.COURSE:
+                return TypeMatcher.any(TypeMatcher.path("/crs/"), TypeMatcher.img_src("_crsr.svg"))
+            case IliasElementType.DCL_RECORD_LIST:
+                return TypeMatcher.any(
+                    TypeMatcher.img_src("_dcl.svg"),
+                    TypeMatcher.query("cmdclass=ildclrecordlistgui")
+                )
+            case IliasElementType.EXERCISE:
+                return TypeMatcher.any(
+                    TypeMatcher.path("/exc/"),
+                    TypeMatcher.path("_exc_"),
+                    TypeMatcher.img_src("_exc.svg"),
+                )
+            case IliasElementType.EXERCISE_FILES:
+                return TypeMatcher.never()
+            case IliasElementType.FILE:
+                return TypeMatcher.any(
+                    TypeMatcher.query("cmd=sendfile"),
+                    TypeMatcher.path("_file_"),
+                    TypeMatcher.img_src("/filedelivery/"),
+                )
+            case IliasElementType.FOLDER:
+                return TypeMatcher.any(
+                    TypeMatcher.path("/fold/"),
+                    TypeMatcher.img_src("_fold.svg"),
+
+                    TypeMatcher.path("/grp/"),
+                    TypeMatcher.img_src("_grp.svg"),
+
+                    TypeMatcher.path("/copa/"),
+                    TypeMatcher.path("_copa_"),
+                    TypeMatcher.img_src("_copa.svg"),
+
+                    # Not supported right now but warn users
+                    # TypeMatcher.query("baseclass=ilmediapoolpresentationgui"),
+                    # TypeMatcher.img_alt("medienpool"),
+                    # TypeMatcher.img_src("_mep.svg"),
+                )
+            case IliasElementType.FORUM:
+                return TypeMatcher.any(
+                    TypeMatcher.path("/frm/"),
+                    TypeMatcher.path("_frm_"),
+                    TypeMatcher.img_src("_frm.svg"),
+                )
+            case IliasElementType.FORUM_THREAD:
+                return TypeMatcher.never()
+            case IliasElementType.INFO_TAB:
+                return TypeMatcher.never()
+            case IliasElementType.LITERATURE_LIST:
+                return TypeMatcher.img_src("_bibl.svg")
+            case IliasElementType.LEARNING_MODULE:
+                return TypeMatcher.any(
+                    TypeMatcher.path("/lm/"),
+                    TypeMatcher.img_src("_lm.svg")
+                )
+            case IliasElementType.LEARNING_MODULE_HTML:
+                return TypeMatcher.any(
+                    TypeMatcher.query("baseclass=ilhtlmpresentationgui"),
+                    TypeMatcher.img_src("_htlm.svg")
+                )
+            case IliasElementType.LINK:
+                return TypeMatcher.any(
+                    TypeMatcher.all(
+                        TypeMatcher.query("baseclass=illinkresourcehandlergui"),
+                        TypeMatcher.query("calldirectlink"),
+                    ),
+                    TypeMatcher.img_src("_webr.svg")
+                )
+            case IliasElementType.MEDIA_POOL:
+                return TypeMatcher.any(
+                    TypeMatcher.query("baseclass=ilmediapoolpresentationgui"),
+                    TypeMatcher.img_src("_mep.svg")
+                )
+            case IliasElementType.MEDIACAST_VIDEO:
+                return TypeMatcher.never()
+            case IliasElementType.MEDIACAST_VIDEO_FOLDER:
+                return TypeMatcher.any(
+                    TypeMatcher.path("/mcst/"),
+                    TypeMatcher.query("baseclass=ilmediacasthandlergui"),
+                    TypeMatcher.img_src("_mcst.svg")
+                )
+            case IliasElementType.MEETING:
+                return TypeMatcher.any(
+                    TypeMatcher.img_src("_sess.svg")
+                )
+            case IliasElementType.MOB_VIDEO:
+                return TypeMatcher.never()
+            case IliasElementType.OPENCAST_VIDEO:
+                return TypeMatcher.never()
+            case IliasElementType.OPENCAST_VIDEO_FOLDER:
+                return TypeMatcher.never()
+            case IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED:
+                return TypeMatcher.img_alt("opencast")
+            case IliasElementType.OPENCAST_VIDEO_PLAYER:
+                return TypeMatcher.never()
+            case IliasElementType.SCORM_LEARNING_MODULE:
+                return TypeMatcher.any(
+                    TypeMatcher.query("baseclass=ilsahspresentationgui"),
+                    TypeMatcher.img_src("_sahs.svg")
+                )
+            case IliasElementType.SURVEY:
+                return TypeMatcher.any(
+                    TypeMatcher.path("/svy/"),
+                    TypeMatcher.img_src("svy.svg")
+                )
+            case IliasElementType.TEST:
+                return TypeMatcher.any(
+                    TypeMatcher.query("cmdclass=ilobjtestgui"),
+                    TypeMatcher.query("cmdclass=iltestscreengui"),
+                    TypeMatcher.img_src("_tst.svg")
+                )
+
+        raise CrawlWarning(f"Unknown matcher {self}")
+
 
 @dataclass
 class IliasPageElement:
@@ -50,11 +258,20 @@ class IliasPageElement:
     def id(self) -> str:
         regexes = [
             r"eid=(?P<id>[0-9a-z\-]+)",
-            r"file_(?P<id>\d+)",
-            r"copa_(?P<id>\d+)",
-            r"fold_(?P<id>\d+)",
-            r"frm_(?P<id>\d+)",
-            r"exc_(?P<id>\d+)",
+            r"book/(?P<id>\d+)",  # booking
+            r"cat/(?P<id>\d+)",
+            r"copa/(?P<id>\d+)",  # content page
+            r"crs/(?P<id>\d+)",  # course
+            r"exc/(?P<id>\d+)",  # exercise
+            r"file/(?P<id>\d+)",  # file
+            r"fold/(?P<id>\d+)",  # folder
+            r"frm/(?P<id>\d+)",  # forum
+            r"grp/(?P<id>\d+)",  # group
+            r"lm/(?P<id>\d+)",  # learning module
+            r"mcst/(?P<id>\d+)",  # mediacast
+            r"pg/(?P<id>(\d|_)+)",  # page?
+            r"svy/(?P<id>\d+)",  # survey
+            r"webr/(?P<id>\d+)",  # web referene (link)
             r"thr_pk=(?P<id>\d+)",  # forums
             r"ref_id=(?P<id>\d+)",
             r"target=[a-z]+_(?P<id>\d+)",
@@ -139,18 +356,28 @@ class IliasLearningModulePage:
     previous_url: Optional[str]
 
 
+class IliasSoup:
+    soup: BeautifulSoup
+    page_url: str
+
+    def __init__(self, soup: BeautifulSoup, page_url: str):
+        self.soup = soup
+        self.page_url = page_url
+
+
 class IliasPage:
 
-    def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]):
-        self._soup = soup
-        self._page_url = _page_url
+    def __init__(self, ilias_soup: IliasSoup, source_element: Optional[IliasPageElement]):
+        self._ilias_soup = ilias_soup
+        self._soup = ilias_soup.soup
+        self._page_url = ilias_soup.page_url
         self._page_type = source_element.type if source_element else None
         self._source_name = source_element.name if source_element else ""
 
     @staticmethod
-    def is_root_page(soup: BeautifulSoup) -> bool:
+    def is_root_page(soup: IliasSoup) -> bool:
         if permalink := IliasPage.get_soup_permalink(soup):
-            return "goto.php?target=root_" in permalink
+            return "goto.php/root/" in permalink
         return False
 
     def get_child_elements(self) -> list[IliasPageElement]:
@@ -193,7 +420,10 @@ class IliasPage:
 
     def get_description(self) -> Optional[BeautifulSoup]:
         def is_interesting_class(name: str) -> bool:
-            return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
+            return name in [
+                "ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap",
+                "ilc_va_ihcap_AccordIHeadCap", "ilc_media_cont_MediaContainer"
+            ]
 
         paragraphs: list[Tag] = cast(list[Tag], self._soup.find_all(class_=is_interesting_class))
         if not paragraphs:
@@ -206,6 +436,21 @@ class IliasPage:
         for p in paragraphs:
             if p.find_parent(class_=is_interesting_class):
                 continue
+            if "ilc_media_cont_MediaContainer" in p["class"]:
+                # We have an embedded video which should be downloaded by _find_mob_videos
+                if video := p.select_one("video"):
+                    url, title = self._find_mob_video_url_title(video, p)
+                    raw_html += '<div style="min-width: 100px; min-height: 100px; border: 1px solid black;'
+                    raw_html += 'display: flex; justify-content: center; align-items: center;'
+                    raw_html += ' margin: 0.5rem;">'
+                    if url is not None and urlparse(url).hostname != urlparse(self._page_url).hostname:
+                        if url.startswith("//"):
+                            url = "https:" + url
+                        raw_html += f'<a href="{url}" target="_blank">External Video: {title}</a>'
+                    else:
+                        raw_html += f"Video elided. Filename: '{title}'."
+                    raw_html += "</div>\n"
+                    continue
 
             # Ignore special listings (like folder groupings)
             if "ilc_section_Special" in p["class"]:
@@ -336,7 +581,7 @@ class IliasPage:
 
     def _is_forum_page(self) -> bool:
         if perma_link := self.get_permalink():
-            return "target=frm_" in perma_link
+            return "/frm/" in perma_link
         return False
 
     def _is_video_player(self) -> bool:
@@ -378,7 +623,7 @@ class IliasPage:
 
     def _is_content_page(self) -> bool:
         if link := self.get_permalink():
-            return "target=copa_" in link
+            return "/copa/" in link
         return False
 
     def _is_learning_module_page(self) -> bool:
@@ -513,19 +758,17 @@ class IliasPage:
                 # Configure button/link does not have anything interesting
                 continue
 
-            type = self._find_type_from_link(name, link, url)
-            if not type:
+            typ = IliasPage._find_type_for_element(
+                name, url, lambda: IliasPage._find_icon_for_folder_entry(link)
+            )
+            if not typ:
                 _unexpected_html_warning()
                 log.warn_contd(f"Could not extract type for {link}")
                 continue
 
-            log.explain(f"Found {name!r}")
+            log.explain(f"Found {name!r} of type {typ}")
 
-            if type == IliasElementType.FILE and "_download" not in url:
-                url = re.sub(r"(target=file_\d+)", r"\1_download", url)
-                log.explain("Rewired file URL to include download part")
-
-            items.append(IliasPageElement.create_new(type, url, name))
+            items.append(IliasPageElement.create_new(typ, url, name))
 
         return items
 
@@ -786,15 +1029,17 @@ class IliasPage:
         for link in links:
             abs_url = self._abs_url_from_link(link)
             # Make sure parents are sanitized. We do not want accidental parents
-            parents = [_sanitize_path_name(x) for x in self._find_upwards_folder_hierarchy(link)]
+            parents = [_sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
 
             if parents:
                 element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text())
             else:
                 element_name = _sanitize_path_name(link.get_text())
 
-            element_type = self._find_type_from_link(element_name, link, abs_url)
-            description = self._find_link_description(link)
+            element_type = IliasPage._find_type_for_element(
+                element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(link)
+            )
+            description = IliasPage._find_link_description(link)
 
             # The last meeting on every page is expanded by default.
             # Its content is then shown inline *and* in the meeting page itself.
@@ -805,10 +1050,10 @@ class IliasPage:
             if not element_type:
                 continue
             elif element_type == IliasElementType.FILE:
-                result.append(self._file_to_element(element_name, abs_url, link))
+                result.append(IliasPage._file_to_element(element_name, abs_url, link))
                 continue
 
-            log.explain(f"Found {element_name!r}")
+            log.explain(f"Found {element_name!r} of type {element_type}")
             result.append(IliasPageElement.create_new(
                 element_type,
                 abs_url,
@@ -826,50 +1071,60 @@ class IliasPage:
     def _find_mediacast_videos(self) -> list[IliasPageElement]:
         videos: list[IliasPageElement] = []
 
-        for elem in cast(list[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")):
-            element_name = _sanitize_path_name(
-                cast(Tag, elem.select_one(".ilPlayerPreviewDescription")).get_text().strip()
-            )
-            if not element_name.endswith(".mp4"):
-                # just to make sure it has some kinda-alrightish ending
-                element_name = element_name + ".mp4"
-            video_element = cast(Optional[Tag], elem.find(name="video"))
-            if not video_element:
-                _unexpected_html_warning()
-                log.warn_contd(f"No <video> element found for mediacast video '{element_name}'")
-                continue
+        regex = re.compile(r"il\.VideoPlaylist\.init.+?\[(.+?)], ")
+        for script in cast(list[Tag], self._soup.find_all("script")):
+            for match in regex.finditer(script.text):
+                try:
+                    playlist = json.loads("[" + match.group(1) + "]")
+                except json.JSONDecodeError:
+                    log.warn("Could not decode playlist json")
+                    log.warn_contd(f"Playlist json: [{match.group(1)}]")
+                    continue
+                for elem in playlist:
+                    title = elem.get("title", None)
+                    description = elem.get("description", None)
+                    url = elem.get("resource", None)
+                    if title is None or description is None or url is None:
+                        log.explain(f"Mediacast json: {match.group(1)}")
+                        log.warn("Mediacast video json was not complete")
+                    if title is None:
+                        log.warn_contd("Missing title")
+                    if description is None:
+                        log.warn_contd("Missing description")
+                    if url is None:
+                        log.warn_contd("Missing URL")
 
-            videos.append(IliasPageElement.create_new(
-                typ=IliasElementType.MEDIACAST_VIDEO,
-                url=self._abs_url_from_relative(cast(str, video_element.get("src"))),
-                name=element_name,
-                mtime=self._find_mediacast_video_mtime(cast(Tag, elem.find_parent(name="td")))
-            ))
+                    if not title.endswith(".mp4") and not title.endswith(".webm"):
+                        # just to make sure it has some kinda-alrightish ending
+                        title = title + ".mp4"
+                    videos.append(IliasPageElement.create_new(
+                        typ=IliasElementType.MEDIACAST_VIDEO,
+                        url=self._abs_url_from_relative(cast(str, url)),
+                        name=_sanitize_path_name(title)
+                    ))
 
         return videos
 
     def _find_mob_videos(self) -> list[IliasPageElement]:
         videos: list[IliasPageElement] = []
 
-        for figure in self._soup.select("figure.ilc_media_cont_MediaContainerHighlighted"):
-            title = cast(Tag, figure.select_one("figcaption")).get_text().strip() + ".mp4"
+        selector = "figure.ilc_media_cont_MediaContainerHighlighted,figure.ilc_media_cont_MediaContainer"
+        for figure in self._soup.select(selector):
             video_element = figure.select_one("video")
             if not video_element:
-                _unexpected_html_warning()
-                log.warn_contd(f"No <video> element found for mob video '{title}'")
                 continue
 
-            url = None
-            for source in video_element.select("source"):
-                if source.get("type", "") == "video/mp4":
-                    url = cast(Optional[str], source.get("src"))
-                    break
+            url, title = self._find_mob_video_url_title(video_element, figure)
 
             if url is None:
                 _unexpected_html_warning()
                 log.warn_contd(f"No <source> element found for mob video '{title}'")
                 continue
 
+            if urlparse(url).hostname != urlparse(self._page_url).hostname:
+                log.explain(f"Found external video at {url}, ignoring")
+                continue
+
             videos.append(IliasPageElement.create_new(
                 typ=IliasElementType.MOB_VIDEO,
                 url=self._abs_url_from_relative(url),
@@ -879,18 +1134,26 @@ class IliasPage:
 
         return videos
 
-    def _find_mediacast_video_mtime(self, enclosing_td: Tag) -> Optional[datetime]:
-        description_td = cast(Tag, enclosing_td.find_previous_sibling("td"))
-        if not description_td:
-            return None
+    def _find_mob_video_url_title(self, video_element: Tag, figure: Tag) -> tuple[Optional[str], str]:
+        url = None
+        for source in video_element.select("source"):
+            if source.get("type", "") == "video/mp4":
+                url = cast(Optional[str], source.get("src"))
+                break
 
-        meta_tag = cast(Optional[Tag], description_td.find_all("p")[-1])
-        if not meta_tag:
-            return None
+        if url is None and video_element.get("src"):
+            url = cast(Optional[str], video_element.get("src"))
 
-        updated_str = meta_tag.get_text().strip().replace("\n", " ")
-        updated_str = re.sub(".+?: ", "", updated_str)
-        return demangle_date(updated_str)
+        fig_caption = cast(Optional[Tag], figure.select_one("figcaption"))
+        if fig_caption:
+            title = cast(Tag, figure.select_one("figcaption")).get_text().strip() + ".mp4"
+        elif url is not None:
+            path = urlparse(self._abs_url_from_relative(url)).path
+            title = path.rsplit("/", 1)[-1]
+        else:
+            title = f"unknown video {figure}"
+
+        return url, title
 
     def _is_in_expanded_meeting(self, tag: Tag) -> bool:
         """
@@ -907,12 +1170,17 @@ class IliasPage:
             # We should not crawl files under meetings
             if "ilContainerListItemContentCB" in cast(str, parent.get("class")):
                 link: Tag = parent.parent.find("a")  # type: ignore
-                type = IliasPage._find_type_from_folder_like(link, self._page_url)
-                return type == IliasElementType.MEETING
+                typ = IliasPage._find_type_for_element(
+                    "meeting",
+                    self._abs_url_from_link(link),
+                    lambda: IliasPage._find_icon_for_folder_entry(link)
+                )
+                return typ == IliasElementType.MEETING
 
         return False
 
-    def _find_upwards_folder_hierarchy(self, tag: Tag) -> list[str]:
+    @staticmethod
+    def _find_upwards_folder_hierarchy(tag: Tag) -> list[str]:
         """
         Interprets accordions and expandable blocks as virtual folders and returns them
         in order. This allows us to find a file named "Test" in an accordion "Acc" as "Acc/Test"
@@ -953,13 +1221,16 @@ class IliasPage:
         if outer_accordion_content:
             accordion_tag = cast(Tag, outer_accordion_content.parent)
             head_tag = cast(Tag, accordion_tag.find(attrs={
-                "class": lambda x: x is not None and "ilc_va_ihead_VAccordIHead" in x
+                "class": lambda x: x is not None and (
+                    "ilc_va_ihead_VAccordIHead" in x or "ilc_va_ihead_AccordIHead" in x
+                )
             }))
             found_titles.append(head_tag.get_text().strip())
 
         return [_sanitize_path_name(x) for x in reversed(found_titles)]
 
-    def _find_link_description(self, link: Tag) -> Optional[str]:
+    @staticmethod
+    def _find_link_description(link: Tag) -> Optional[str]:
         tile = cast(
             Tag,
             link.find_parent("div", {"class": lambda x: x is not None and "il_ContainerListItem" in x})
@@ -974,7 +1245,8 @@ class IliasPage:
             return None
         return description_element.get_text().strip()
 
-    def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement:
+    @staticmethod
+    def _file_to_element(name: str, url: str, link_element: Tag) -> IliasPageElement:
         # Files have a list of properties (type, modification date, size, etc.)
         # In a series of divs.
         # Find the parent containing all those divs, so we can filter our what we need
@@ -1007,27 +1279,38 @@ class IliasPage:
         for title in card_titles:
             url = self._abs_url_from_link(title)
             name = _sanitize_path_name(title.get_text().strip())
-            type = self._find_type_from_card(title)
+            typ = IliasPage._find_type_for_element(
+                name, url, lambda: IliasPage._find_icon_from_card(title)
+            )
 
-            if not type:
+            if not typ:
                 _unexpected_html_warning()
                 log.warn_contd(f"Could not extract type for {title}")
                 continue
 
-            result.append(IliasPageElement.create_new(type, url, name))
+            result.append(IliasPageElement.create_new(typ, url, name))
 
         card_button_tiles: list[Tag] = self._soup.select(".card-title button")
 
         for button in card_button_tiles:
-            regex = re.compile(button["id"] + r".*window.open\(['\"](.+?)['\"]")  # type: ignore
-            res = regex.search(str(self._soup))
-            if not res:
+            signal_regex = re.compile("#" + str(button["id"]) + r"[\s\S]*?\.trigger\('(.+?)'")
+            signal_match = signal_regex.search(str(self._soup))
+            if not signal_match:
                 _unexpected_html_warning()
-                log.warn_contd(f"Could not find click handler target for {button}")
+                log.warn_contd(f"Could not find click handler signal for {button}")
                 continue
-            url = self._abs_url_from_relative(res.group(1))
+            signal = signal_match.group(1)
+            open_regex = re.compile(r"\.on\('" + signal + r"[\s\S]*?window.open\(['\"](.+?)['\"]")
+            open_match = open_regex.search(str(self._soup))
+            if not open_match:
+                _unexpected_html_warning()
+                log.warn_contd(f"Could not find click handler target for signal {signal} for {button}")
+                continue
+            url = self._abs_url_from_relative(open_match.group(1))
             name = _sanitize_path_name(button.get_text().strip())
-            type = self._find_type_from_card(button)
+            typ = IliasPage._find_type_for_element(
+                name, url, lambda: IliasPage._find_icon_from_card(button)
+            )
             caption_parent = cast(Tag, button.find_parent(
                 "div",
                 attrs={"class": lambda x: x is not None and "caption" in x},
@@ -1038,143 +1321,59 @@ class IliasPage:
             else:
                 description = None
 
-            if not type:
+            if not typ:
                 _unexpected_html_warning()
                 log.warn_contd(f"Could not extract type for {button}")
                 continue
 
-            result.append(IliasPageElement.create_new(type, url, name, description=description))
+            result.append(IliasPageElement.create_new(typ, url, name, description=description))
 
         return result
 
-    def _find_type_from_card(self, card_title: Tag) -> Optional[IliasElementType]:
-        def is_card_root(element: Tag) -> bool:
-            return "il-card" in element["class"] and "thumbnail" in element["class"]
-
-        card_root: Optional[Tag] = None
-
-        # We look for the card root
-        for parent in card_title.parents:
-            if is_card_root(parent):
-                card_root = parent
-                break
-
-        if card_root is None:
-            _unexpected_html_warning()
-            log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}")
-            return None
-
-        icon = cast(Tag, card_root.select_one(".il-card-repository-head .icon"))
-
-        if "opencast" in icon["class"] or "xoct" in icon["class"]:
-            return IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED
-        if "exc" in icon["class"]:
-            return IliasElementType.EXERCISE
-        if "grp" in icon["class"]:
-            return IliasElementType.FOLDER
-        if "webr" in icon["class"]:
-            return IliasElementType.LINK
-        if "book" in icon["class"]:
-            return IliasElementType.BOOKING
-        if "crsr" in icon["class"]:
-            return IliasElementType.COURSE
-        if "frm" in icon["class"]:
-            return IliasElementType.FORUM
-        if "sess" in icon["class"]:
-            return IliasElementType.MEETING
-        if "tst" in icon["class"]:
-            return IliasElementType.TEST
-        if "fold" in icon["class"]:
-            return IliasElementType.FOLDER
-        if "copa" in icon["class"]:
-            return IliasElementType.FOLDER
-        if "svy" in icon["class"]:
-            return IliasElementType.SURVEY
-        if "file" in icon["class"]:
-            return IliasElementType.FILE
-        if "mcst" in icon["class"]:
-            return IliasElementType.MEDIACAST_VIDEO_FOLDER
-
-        _unexpected_html_warning()
-        log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")
-        return None
-
     @staticmethod
-    def _find_type_from_link(
+    def _find_type_for_element(
         element_name: str,
-        link_element: Tag,
-        url: str
+        url: str,
+        icon_for_element: Callable[[], Optional[Tag]],
     ) -> Optional[IliasElementType]:
         """
         Decides which sub crawler to use for a given top level element.
         """
         parsed_url = urlparse(url)
+        icon = icon_for_element()
 
-        # file URLs contain "target=file"
-        if "target=file_" in parsed_url.query:
-            return IliasElementType.FILE
+        def try_matcher(matcher: IliasElementMatcher) -> bool:
+            match matcher:
+                case TypeMatcher.All(matchers=ms):
+                    return all(try_matcher(m) for m in ms)
+                case TypeMatcher.Any(matchers=ms):
+                    return any(try_matcher(m) for m in ms)
+                case TypeMatcher.ImgAlt(alt=alt):
+                    return icon is not None and alt in str(icon["alt"]).lower()
+                case TypeMatcher.ImgSrc(src=src):
+                    return icon is not None and src in str(icon["src"]).lower()
+                case TypeMatcher.UrlPath(path=path):
+                    return path in parsed_url.path.lower()
+                case TypeMatcher.UrlParameter(query=query):
+                    return query in parsed_url.query.lower()
 
-        if "target=grp_" in parsed_url.query:
-            return IliasElementType.FOLDER
+            raise CrawlError(f"Unknown matcher {matcher}")
 
-        if "target=crs_" in parsed_url.query:
-            return IliasElementType.FOLDER
-
-        if "baseClass=ilExerciseHandlerGUI" in parsed_url.query:
-            return IliasElementType.EXERCISE
-
-        if "baseClass=ilLinkResourceHandlerGUI" in parsed_url.query and "calldirectlink" in parsed_url.query:
-            return IliasElementType.LINK
-
-        if "cmd=showThreads" in parsed_url.query or "target=frm_" in parsed_url.query:
-            return IliasElementType.FORUM
-
-        if "cmdClass=ilobjtestgui" in parsed_url.query:
-            return IliasElementType.TEST
-
-        if "baseClass=ilLMPresentationGUI" in parsed_url.query:
-            return IliasElementType.LEARNING_MODULE
-
-        if "baseClass=ilMediaCastHandlerGUI" in parsed_url.query:
-            return IliasElementType.MEDIACAST_VIDEO_FOLDER
-
-        if "baseClass=ilSAHSPresentationGUI" in parsed_url.query:
-            return IliasElementType.SCORM_LEARNING_MODULE
-
-        # other universities might have content type specified in URL path
-        if "_file_" in parsed_url.path:
-            return IliasElementType.FILE
-
-        if "_fold_" in parsed_url.path or "_copa_" in parsed_url.path:
-            return IliasElementType.FOLDER
-
-        if "_frm_" in parsed_url.path:
-            return IliasElementType.FORUM
-
-        if "_exc_" in parsed_url.path:
-            return IliasElementType.EXERCISE
-
-        # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
-        # try to guess it from the image.
-
-        # Everything with a ref_id can *probably* be opened to reveal nested things
-        # video groups, directories, exercises, etc
-        if "ref_id=" in parsed_url.query or "goto.php" in parsed_url.path:
-            return IliasPage._find_type_from_folder_like(link_element, url)
+        for typ in IliasElementType:
+            if try_matcher(typ.matcher()):
+                return typ
 
         _unexpected_html_warning()
-        log.warn_contd(
-            f"Tried to figure out element type, but failed for {element_name!r} / {link_element!r})"
-        )
+        log.warn_contd(f"Tried to figure out element type, but failed for {element_name!r} / {url!r})")
+
+        if "ref_id=" in parsed_url.query.lower() or "goto.php" in parsed_url.path.lower():
+            log.warn_contd("Defaulting to FOLDER as it contains a ref_id/goto")
+            return IliasElementType.FOLDER
+
         return None
 
     @staticmethod
-    def _find_type_from_folder_like(link_element: Tag, url: str) -> Optional[IliasElementType]:
-        """
-        Try crawling something that looks like a folder.
-        """
-        # pylint: disable=too-many-return-statements
-
+    def _find_icon_for_folder_entry(link_element: Tag) -> Optional[Tag]:
         found_parent: Optional[Tag] = None
 
         # We look for the outer div of our inner link, to find information around it
@@ -1186,7 +1385,9 @@ class IliasPage:
 
         if found_parent is None:
             _unexpected_html_warning()
-            log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url}")
+            log.warn_contd(
+                f"Tried to figure out element type, but did not find an icon for {link_element!r}"
+            )
             return None
 
         # Find the small descriptive icon to figure out the type
@@ -1203,42 +1404,35 @@ class IliasPage:
             log.explain("Found session expansion button, skipping it as it has no content")
             return None
 
-        if img_tag is None:
-            _unexpected_html_warning()
-            log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
-            return None
+        if img_tag is not None:
+            return img_tag
 
-        if "opencast" in str(img_tag["alt"]).lower():
-            return IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED
-
-        if str(img_tag["src"]).endswith("icon_exc.svg"):
-            return IliasElementType.EXERCISE
-
-        if str(img_tag["src"]).endswith("icon_webr.svg"):
-            return IliasElementType.LINK
-
-        if str(img_tag["src"]).endswith("icon_book.svg"):
-            return IliasElementType.BOOKING
-
-        if str(img_tag["src"]).endswith("frm.svg"):
-            return IliasElementType.FORUM
-
-        if str(img_tag["src"]).endswith("sess.svg"):
-            return IliasElementType.MEETING
-
-        if str(img_tag["src"]).endswith("icon_tst.svg"):
-            return IliasElementType.TEST
-
-        if str(img_tag["src"]).endswith("icon_mcst.svg"):
-            return IliasElementType.MEDIACAST_VIDEO_FOLDER
-
-        if str(img_tag["src"]).endswith("icon_sahs.svg"):
-            return IliasElementType.SCORM_LEARNING_MODULE
-
-        return IliasElementType.FOLDER
+        log.explain(f"Tried to figure out element type, but did not find an image for {link_element!r}")
+        return None
 
     @staticmethod
-    def is_logged_in(soup: BeautifulSoup) -> bool:
+    def _find_icon_from_card(card_title: Tag) -> Optional[Tag]:
+        def is_card_root(element: Tag) -> bool:
+            return "il-card" in element["class"] and "thumbnail" in element["class"]
+
+        card_root: Optional[Tag] = None
+
+        # We look for the card root
+        for parent in card_title.parents:
+            if is_card_root(parent):
+                card_root = parent
+                break
+
+        if card_root is None:
+            _unexpected_html_warning()
+            log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}")
+            return None
+
+        return cast(Tag, card_root.select_one(".il-card-repository-head .icon"))
+
+    @staticmethod
+    def is_logged_in(ilias_soup: IliasSoup) -> bool:
+        soup = ilias_soup.soup
         # Normal ILIAS pages
         mainbar = cast(Optional[Tag], soup.find(class_="il-maincontrols-metabar"))
         if mainbar is not None:
@@ -1285,7 +1479,7 @@ class IliasPage:
         return None
 
     def get_permalink(self) -> Optional[str]:
-        return IliasPage.get_soup_permalink(self._soup)
+        return IliasPage.get_soup_permalink(self._ilias_soup)
 
     def _abs_url_from_link(self, link_tag: Tag) -> str:
         """
@@ -1300,11 +1494,15 @@ class IliasPage:
         return urljoin(self._page_url, relative_url)
 
     @staticmethod
-    def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
-        perma_link_element = cast(Tag, soup.select_one(".il-footer-permanent-url > a"))
-        if not perma_link_element or not perma_link_element.get("href"):
-            return None
-        return cast(Optional[str], perma_link_element.get("href"))
+    def get_soup_permalink(ilias_soup: IliasSoup) -> Optional[str]:
+        scripts = cast(list[Tag], ilias_soup.soup.find_all("script"))
+        pattern = re.compile(r"il\.Footer\.permalink\.copyText\(\"(.+?)\"\)")
+        for script in scripts:
+            if match := pattern.search(script.text):
+                url = match.group(1)
+                url = url.replace(r"\/", "/")
+                return url
+        return None
 
 
 def _unexpected_html_warning() -> None: