mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-11-03 22:23:41 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			1577 lines
		
	
	
		
			62 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			1577 lines
		
	
	
		
			62 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import json
 | 
						|
import re
 | 
						|
from collections.abc import Callable
 | 
						|
from dataclasses import dataclass
 | 
						|
from datetime import date, datetime, timedelta
 | 
						|
from enum import Enum
 | 
						|
from typing import Optional, cast
 | 
						|
from urllib.parse import urljoin, urlparse
 | 
						|
 | 
						|
from bs4 import BeautifulSoup, Tag
 | 
						|
 | 
						|
from PFERD.crawl import CrawlError
 | 
						|
from PFERD.crawl.crawler import CrawlWarning
 | 
						|
from PFERD.logging import log
 | 
						|
from PFERD.utils import sanitize_path_name, url_set_query_params
 | 
						|
 | 
						|
# Alias for "either a `str` or an `int`". The code that consumes it is outside this excerpt.
TargetType = str | int
 | 
						|
 | 
						|
 | 
						|
class TypeMatcher:
    """
    Namespace bundling the matcher node classes and the factory helpers that build them.

    The nested classes only store the value(s) handed to their constructor; nothing
    in this class evaluates a matcher against a page element.
    """

    class UrlPath:
        """Matcher node storing a URL path fragment in `path`."""

        path: str

        def __init__(self, path: str):
            self.path = path

    class UrlParameter:
        """Matcher node storing a URL query fragment in `query`."""

        query: str

        def __init__(self, query: str):
            self.query = query

    class ImgSrc:
        """Matcher node storing an image source fragment in `src`."""

        src: str

        def __init__(self, src: str):
            self.src = src

    class ImgAlt:
        """Matcher node storing an image alt-text fragment in `alt`."""

        alt: str

        def __init__(self, alt: str):
            self.alt = alt

    class All:
        """Composite node holding a list of child matchers in `matchers`."""

        matchers: list["IliasElementMatcher"]

        def __init__(self, matchers: list["IliasElementMatcher"]):
            self.matchers = matchers

    class Any:
        """Composite node holding a list of child matchers in `matchers`."""

        matchers: list["IliasElementMatcher"]

        def __init__(self, matchers: list["IliasElementMatcher"]):
            self.matchers = matchers

    @staticmethod
    def path(path: str) -> UrlPath:
        """Build a `UrlPath` node for `path`."""
        node = TypeMatcher.UrlPath(path)
        return node

    @staticmethod
    def query(query: str) -> UrlParameter:
        """Build a `UrlParameter` node for `query`."""
        node = TypeMatcher.UrlParameter(query)
        return node

    @staticmethod
    def img_src(src: str) -> ImgSrc:
        """Build an `ImgSrc` node for `src`."""
        node = TypeMatcher.ImgSrc(src)
        return node

    @staticmethod
    def img_alt(alt: str) -> ImgAlt:
        """Build an `ImgAlt` node for `alt`."""
        node = TypeMatcher.ImgAlt(alt)
        return node

    @staticmethod
    def all(*matchers: "IliasElementMatcher") -> All:
        """Build an `All` node from the given child matchers (copied into a new list)."""
        return TypeMatcher.All([*matchers])

    @staticmethod
    def any(*matchers: "IliasElementMatcher") -> Any:
        """Build an `Any` node from the given child matchers (copied into a new list)."""
        return TypeMatcher.Any([*matchers])

    @staticmethod
    def never() -> Any:
        """Build an `Any` node without children."""
        no_children: list["IliasElementMatcher"] = []
        return TypeMatcher.Any(no_children)
 | 
						|
 | 
						|
 | 
						|
# Union of every matcher node class declared inside TypeMatcher.
IliasElementMatcher = (
    TypeMatcher.UrlPath | TypeMatcher.UrlParameter | TypeMatcher.ImgSrc
    | TypeMatcher.ImgAlt | TypeMatcher.All | TypeMatcher.Any
)
 | 
						|
 | 
						|
 | 
						|
class IliasElementType(Enum):
 | 
						|
    BLOG = "blog"
 | 
						|
    BOOKING = "booking"
 | 
						|
    COURSE = "course"
 | 
						|
    DCL_RECORD_LIST = "dcl_record_list"
 | 
						|
    EXERCISE_OVERVIEW = "exercise_overview"
 | 
						|
    EXERCISE = "exercise"  # own submitted files
 | 
						|
    EXERCISE_FILES = "exercise_files"  # own submitted files
 | 
						|
    FILE = "file"
 | 
						|
    FOLDER = "folder"
 | 
						|
    FORUM = "forum"
 | 
						|
    FORUM_THREAD = "forum_thread"
 | 
						|
    INFO_TAB = "info_tab"
 | 
						|
    LEARNING_MODULE = "learning_module"
 | 
						|
    LEARNING_MODULE_HTML = "learning_module_html"
 | 
						|
    LITERATURE_LIST = "literature_list"
 | 
						|
    LINK = "link"
 | 
						|
    LINK_COLLECTION = "link_collection"
 | 
						|
    MEDIA_POOL = "media_pool"
 | 
						|
    MEDIACAST_VIDEO = "mediacast_video"
 | 
						|
    MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
 | 
						|
    MEETING = "meeting"
 | 
						|
    MOB_VIDEO = "mob_video"
 | 
						|
    OPENCAST_VIDEO = "opencast_video"
 | 
						|
    OPENCAST_VIDEO_FOLDER = "opencast_video_folder"
 | 
						|
    OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED = "opencast_video_folder_maybe_paginated"
 | 
						|
    OPENCAST_VIDEO_PLAYER = "opencast_video_player"
 | 
						|
    SCORM_LEARNING_MODULE = "scorm_learning_module"
 | 
						|
    SURVEY = "survey"
 | 
						|
    TEST = "test"  # an online test. Will be ignored currently.
 | 
						|
    WIKI = "wiki"
 | 
						|
 | 
						|
    def matcher(self) -> IliasElementMatcher:
 | 
						|
        match self:
 | 
						|
            case IliasElementType.BLOG:
 | 
						|
                return TypeMatcher.any(TypeMatcher.img_src("_blog.svg"))
 | 
						|
            case IliasElementType.BOOKING:
 | 
						|
                return TypeMatcher.any(TypeMatcher.path("/book/"), TypeMatcher.img_src("_book.svg"))
 | 
						|
            case IliasElementType.COURSE:
 | 
						|
                return TypeMatcher.any(TypeMatcher.path("/crs/"), TypeMatcher.img_src("_crsr.svg"))
 | 
						|
            case IliasElementType.DCL_RECORD_LIST:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.img_src("_dcl.svg"), TypeMatcher.query("cmdclass=ildclrecordlistgui")
 | 
						|
                )
 | 
						|
            case IliasElementType.EXERCISE:
 | 
						|
                return TypeMatcher.never()
 | 
						|
            case IliasElementType.EXERCISE_FILES:
 | 
						|
                return TypeMatcher.never()
 | 
						|
            case IliasElementType.EXERCISE_OVERVIEW:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.path("/exc/"),
 | 
						|
                    TypeMatcher.path("_exc_"),
 | 
						|
                    TypeMatcher.img_src("_exc.svg"),
 | 
						|
                )
 | 
						|
            case IliasElementType.FILE:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.query("cmd=sendfile"),
 | 
						|
                    TypeMatcher.path("_file_"),
 | 
						|
                    TypeMatcher.img_src("/filedelivery/"),
 | 
						|
                )
 | 
						|
            case IliasElementType.FOLDER:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.path("/fold/"),
 | 
						|
                    TypeMatcher.img_src("_fold.svg"),
 | 
						|
                    TypeMatcher.path("/grp/"),
 | 
						|
                    TypeMatcher.img_src("_grp.svg"),
 | 
						|
                    TypeMatcher.path("/copa/"),
 | 
						|
                    TypeMatcher.path("_copa_"),
 | 
						|
                    TypeMatcher.img_src("_copa.svg"),
 | 
						|
                    # Not supported right now but warn users
 | 
						|
                    # TypeMatcher.query("baseclass=ilmediapoolpresentationgui"),
 | 
						|
                    # TypeMatcher.img_alt("medienpool"),
 | 
						|
                    # TypeMatcher.img_src("_mep.svg"),
 | 
						|
                )
 | 
						|
            case IliasElementType.FORUM:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.path("/frm/"),
 | 
						|
                    TypeMatcher.path("_frm_"),
 | 
						|
                    TypeMatcher.img_src("_frm.svg"),
 | 
						|
                )
 | 
						|
            case IliasElementType.FORUM_THREAD:
 | 
						|
                return TypeMatcher.never()
 | 
						|
            case IliasElementType.INFO_TAB:
 | 
						|
                return TypeMatcher.never()
 | 
						|
            case IliasElementType.LITERATURE_LIST:
 | 
						|
                return TypeMatcher.img_src("_bibl.svg")
 | 
						|
            case IliasElementType.LEARNING_MODULE:
 | 
						|
                return TypeMatcher.any(TypeMatcher.path("/lm/"), TypeMatcher.img_src("_lm.svg"))
 | 
						|
            case IliasElementType.LEARNING_MODULE_HTML:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.query("baseclass=ilhtlmpresentationgui"), TypeMatcher.img_src("_htlm.svg")
 | 
						|
                )
 | 
						|
            case IliasElementType.LINK:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.all(
 | 
						|
                        TypeMatcher.query("baseclass=illinkresourcehandlergui"),
 | 
						|
                        TypeMatcher.query("calldirectlink"),
 | 
						|
                    ),
 | 
						|
                    TypeMatcher.img_src("_webr.svg"),  # duplicated :(
 | 
						|
                )
 | 
						|
            case IliasElementType.LINK_COLLECTION:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.query("baseclass=illinkresourcehandlergui"),
 | 
						|
                    TypeMatcher.img_src("_webr.svg"),  # duplicated :(
 | 
						|
                )
 | 
						|
            case IliasElementType.MEDIA_POOL:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.query("baseclass=ilmediapoolpresentationgui"), TypeMatcher.img_src("_mep.svg")
 | 
						|
                )
 | 
						|
            case IliasElementType.MEDIACAST_VIDEO:
 | 
						|
                return TypeMatcher.never()
 | 
						|
            case IliasElementType.MEDIACAST_VIDEO_FOLDER:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.path("/mcst/"),
 | 
						|
                    TypeMatcher.query("baseclass=ilmediacasthandlergui"),
 | 
						|
                    TypeMatcher.img_src("_mcst.svg"),
 | 
						|
                )
 | 
						|
            case IliasElementType.MEETING:
 | 
						|
                return TypeMatcher.any(TypeMatcher.img_src("_sess.svg"))
 | 
						|
            case IliasElementType.MOB_VIDEO:
 | 
						|
                return TypeMatcher.never()
 | 
						|
            case IliasElementType.OPENCAST_VIDEO:
 | 
						|
                return TypeMatcher.never()
 | 
						|
            case IliasElementType.OPENCAST_VIDEO_FOLDER:
 | 
						|
                return TypeMatcher.never()
 | 
						|
            case IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED:
 | 
						|
                return TypeMatcher.img_alt("opencast")
 | 
						|
            case IliasElementType.OPENCAST_VIDEO_PLAYER:
 | 
						|
                return TypeMatcher.never()
 | 
						|
            case IliasElementType.SCORM_LEARNING_MODULE:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.query("baseclass=ilsahspresentationgui"), TypeMatcher.img_src("_sahs.svg")
 | 
						|
                )
 | 
						|
            case IliasElementType.SURVEY:
 | 
						|
                return TypeMatcher.any(TypeMatcher.path("/svy/"), TypeMatcher.img_src("svy.svg"))
 | 
						|
            case IliasElementType.TEST:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.query("cmdclass=ilobjtestgui"),
 | 
						|
                    TypeMatcher.query("cmdclass=iltestscreengui"),
 | 
						|
                    TypeMatcher.img_src("_tst.svg"),
 | 
						|
                )
 | 
						|
            case IliasElementType.WIKI:
 | 
						|
                return TypeMatcher.any(
 | 
						|
                    TypeMatcher.query("baseClass=ilwikihandlergui"), TypeMatcher.img_src("wiki.svg")
 | 
						|
                )
 | 
						|
 | 
						|
        raise CrawlWarning(f"Unknown matcher {self}")
 | 
						|
 | 
						|
 | 
						|
@dataclass
class IliasPageElement:
    """
    A single element discovered on an ILIAS page: its type, URL and name, plus an
    optional modification time and description.
    """

    type: IliasElementType
    url: str
    name: str
    mtime: Optional[datetime] = None
    description: Optional[str] = None

    def id(self) -> str:
        """
        Derive an identifier for this element from its URL.

        The patterns are tried in order and the first hit wins. If none matches,
        a warning is logged and the full URL is returned instead.
        """
        id_patterns = (
            r"eid=(?P<id>[0-9a-z\-]+)",
            r"book/(?P<id>\d+)",  # booking
            r"cat/(?P<id>\d+)",
            r"copa/(?P<id>\d+)",  # content page
            r"crs/(?P<id>\d+)",  # course
            r"exc/(?P<id>\d+)",  # exercise
            r"file/(?P<id>\d+)",  # file
            r"fold/(?P<id>\d+)",  # folder
            r"frm/(?P<id>\d+)",  # forum
            r"grp/(?P<id>\d+)",  # group
            r"lm/(?P<id>\d+)",  # learning module
            r"mcst/(?P<id>\d+)",  # mediacast
            r"pg/(?P<id>(\d|_)+)",  # page?
            r"svy/(?P<id>\d+)",  # survey
            r"sess/(?P<id>\d+)",  # session
            r"webr/(?P<id>\d+)",  # web reference (link)
            r"thr_pk=(?P<id>\d+)",  # forums
            r"ref_id=(?P<id>\d+)",
            r"target=[a-z]+_(?P<id>\d+)",
            r"mm_(?P<id>\d+)",
        )

        # Lazily search pattern by pattern and stop at the first one that hits.
        attempts = (re.search(pattern, self.url) for pattern in id_patterns)
        first_hit = next((hit for hit in attempts if hit), None)
        if first_hit is not None:
            return first_hit.group("id")

        # Nothing matched: the URL itself serves as the identity.
        log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.")
        return self.url

    @staticmethod
    def create_new(
        typ: IliasElementType,
        url: str,
        name: str,
        mtime: Optional[datetime] = None,
        description: Optional[str] = None,
        skip_sanitize: bool = False,
    ) -> "IliasPageElement":
        """
        Build an element, normalizing meeting names and (unless `skip_sanitize`
        is set) passing the name through `sanitize_path_name`.
        """
        if typ == IliasElementType.MEETING:
            raw_name = name
            name = IliasPageElement._normalize_meeting_name(raw_name)
            log.explain(f"Normalized meeting name from {raw_name!r} to {name!r}")

        final_name = name if skip_sanitize else sanitize_path_name(name)
        return IliasPageElement(typ, url, final_name, mtime, description)

    @staticmethod
    def _normalize_meeting_name(meeting_name: str) -> str:
        """
        Replace the leading date part of a meeting name with its ISO form (YYYY-MM-DD).

        The name is returned unchanged when the leading part cannot be parsed by
        `demangle_date`.
        """
        # A ": " reachable from the start without crossing a "-" means the name only
        # carries a date ("05. Jan 2000:"). Otherwise it carries a date plus start/end
        # times ("05. Jan 2000, 16:00 - 17:30:") and the date ends at ", ".
        date_only = re.search(r"^[^-]+: ", meeting_name) is not None
        split_delimiter = ":" if date_only else ", "

        date_portion_str, _, rest_of_name = meeting_name.partition(split_delimiter)
        date_portion = demangle_date(date_portion_str)

        # Parsing failed, keep the name as it is
        if not date_portion:
            return meeting_name

        iso_date = datetime.strftime(date_portion, "%Y-%m-%d")
        return iso_date + split_delimiter + rest_of_name
 | 
						|
 | 
						|
 | 
						|
@dataclass
class IliasDownloadForumData:
    """Plain record bundling a URL, a form-data mapping and an `empty` flag."""

    # Stored as given; this class attaches no behavior to the fields.
    url: str
    # Maps a field name to either a single string or a list of strings.
    form_data: dict[str, str | list[str]]
    empty: bool
 | 
						|
 | 
						|
 | 
						|
@dataclass
class IliasForumThread:
    """Plain record for one forum thread: its name, two HTML tags and an optional mtime."""

    name: str
    # HTML tags for the thread's name and its content, respectively.
    name_tag: Tag
    content_tag: Tag
    mtime: datetime | None
 | 
						|
 | 
						|
 | 
						|
@dataclass
class IliasLearningModulePage:
    """Plain record for one learning module page and the URLs of its neighbours."""

    title: str
    content: Tag
    # Either neighbour URL may be absent.
    next_url: str | None
    previous_url: str | None
 | 
						|
 | 
						|
 | 
						|
class IliasSoup:
    """Pairs a parsed document with the URL of the page it belongs to."""

    soup: BeautifulSoup
    page_url: str

    def __init__(self, soup: BeautifulSoup, page_url: str):
        # Both values are stored as given.
        self.page_url = page_url
        self.soup = soup
 | 
						|
 | 
						|
 | 
						|
class IliasPage:
 | 
						|
    def __init__(self, ilias_soup: IliasSoup, source_element: Optional[IliasPageElement]):
        """
        Keep the soup and its page URL, and remember the type and name of the
        element this page was reached through (None / "" when there is none).
        """
        page_type: Optional[IliasElementType] = None
        source_name = ""
        if source_element:
            page_type = source_element.type
            source_name = source_element.name

        self._ilias_soup = ilias_soup
        self._soup = ilias_soup.soup
        self._page_url = ilias_soup.page_url
        self._page_type = page_type
        self._source_name = source_name
 | 
						|
 | 
						|
    @staticmethod
    def is_root_page(soup: IliasSoup) -> bool:
        """Return True when the soup's permalink contains "goto.php/root/"."""
        permalink = IliasPage.get_soup_permalink(soup)
        if not permalink:
            # No permalink, so nothing identifies this page as the root
            return False
        return "goto.php/root/" in permalink
 | 
						|
 | 
						|
    def get_child_elements(self) -> list[IliasPageElement]:
        """
        Collect the child elements of this page.

        The page kind is probed in a fixed order; the first probe that succeeds
        decides which finder runs. If none succeeds, the page is handled as a
        normal folder.
        """
        # (probe, explanation to log, finder) in the order the probes must run.
        strategies: list[tuple[Callable[[], bool], str, Callable[[], list[IliasPageElement]]]] = [
            (
                self._is_video_player,
                "Page is a video player, extracting URL",
                self._player_to_video,
            ),
            (
                self._is_opencast_video_listing,
                "Page is an opencast video listing, searching for elements",
                self._find_opencast_video_entries,
            ),
            (
                self._is_exercise_file,
                "Page is an exercise, searching for elements",
                self._find_exercise_entries,
            ),
            (
                self._is_personal_desktop,
                "Page is the personal desktop, searching for elements",
                self._find_personal_desktop_entries,
            ),
            (
                self._is_content_page,
                "Page is a content page, searching for elements",
                self._find_copa_entries,
            ),
            (
                self._is_info_tab,
                "Page is info tab, searching for elements",
                self._find_info_tab_entries,
            ),
        ]

        for applies, explanation, find_entries in strategies:
            if applies():
                log.explain(explanation)
                return find_entries()

        log.explain("Page is a normal folder, searching for elements")
        return self._find_normal_entries()
 | 
						|
 | 
						|
    def get_info_tab(self) -> Optional[IliasPageElement]:
        """
        Return an INFO_TAB element named "infos" for the first link whose href
        contains "cmdClass=ilinfoscreengui", or None if there is no such link.
        """

        def points_to_info_screen(href: Optional[str]) -> bool:
            return href is not None and "cmdClass=ilinfoscreengui" in href

        tab: Optional[Tag] = self._soup.find(name="a", attrs={"href": points_to_info_screen})
        if tab is None:
            return None

        return IliasPageElement.create_new(
            IliasElementType.INFO_TAB, self._abs_url_from_link(tab), "infos"
        )
 | 
						|
 | 
						|
    def get_description(self) -> Optional[BeautifulSoup]:
        """
        Assemble the page's description blocks into a fresh document.

        Every element with one of the interesting classes is collected, except those
        nested inside another interesting element. Media containers with a `<video>`
        are replaced by a placeholder box (a link for videos on another host, a short
        note otherwise). Elements with the class "ilc_section_Special" are dropped.

        Returns:
            The re-parsed document, or None if the page has no interesting elements.
        """
        # Local import: only this method builds HTML from page-derived text.
        from html import escape

        def is_interesting_class(name: str | None) -> bool:
            return name in [
                "ilCOPageSection",
                "ilc_Paragraph",
                "ilc_va_ihcap_VAccordIHeadCap",
                "ilc_va_ihcap_AccordIHeadCap",
                "ilc_media_cont_MediaContainer",
            ]

        paragraphs: list[Tag] = cast(list[Tag], self._soup.find_all(class_=is_interesting_class))
        if not paragraphs:
            return None

        # Extract bits and pieces into a string and parse it again.
        # This ensures we don't miss anything and weird structures are resolved
        # somewhat gracefully.
        fragments: list[str] = []
        for p in paragraphs:
            # Nested hits are already covered by rendering their outermost ancestor
            if p.find_parent(class_=is_interesting_class):
                continue
            if "ilc_media_cont_MediaContainer" in p["class"] and (video := p.select_one("video")):
                # We have an embedded video which should be downloaded by _find_mob_videos
                url, title = self._find_mob_video_url_title(video, p)
                # Title and URL come from the crawled page and are re-parsed as HTML below,
                # so escape them; otherwise "<", "&" or quotes would corrupt the markup.
                safe_title = escape(str(title), quote=False)

                box = '<div style="min-width: 100px; min-height: 100px; border: 1px solid black;'
                box += "display: flex; justify-content: center; align-items: center;"
                box += ' margin: 0.5rem;">'
                if url is not None and urlparse(url).hostname != urlparse(self._page_url).hostname:
                    if url.startswith("//"):
                        # Protocol-relative URL, give it an explicit scheme
                        url = "https:" + url
                    safe_url = escape(url, quote=True)
                    box += f'<a href="{safe_url}" target="_blank">External Video: {safe_title}</a>'
                else:
                    box += f"Video elided. Filename: '{safe_title}'."
                box += "</div>\n"
                fragments.append(box)
                continue

            # Ignore special listings (like folder groupings)
            if "ilc_section_Special" in p["class"]:
                continue

            fragments.append(str(p) + "\n")

        raw_html = "".join(fragments)
        raw_html = f"<body>\n{raw_html}\n</body>"

        return BeautifulSoup(raw_html, "html.parser")
 | 
						|
 | 
						|
    def get_learning_module_data(self) -> Optional[IliasLearningModulePage]:
        """
        Extract title, content and neighbour URLs of a learning module page.

        Returns None when this page is not a learning module page.
        """
        if self._is_learning_module_page():
            page_content = cast(Tag, self._soup.select_one("#ilLMPageContent"))
            title_tag = cast(Tag, self._soup.select_one(".ilc_page_title_PageTitle"))
            page_title = title_tag.get_text().strip()
            return IliasLearningModulePage(
                title=page_title,
                content=page_content,
                next_url=self._find_learning_module_next(),
                previous_url=self._find_learning_module_prev(),
            )
        return None
 | 
						|
 | 
						|
    def _find_learning_module_next(self) -> Optional[str]:
        """
        Return the absolute URL of the first right-navigation link that contains
        "baseClass=ilLMPresentationGUI", or None if there is none.
        """
        right_links = self._soup.select("a.ilc_page_rnavlink_RightNavigationLink")
        # Generators keep this lazy: URLs are resolved only until the first hit.
        candidate_urls = (self._abs_url_from_link(anchor) for anchor in right_links)
        return next(
            (candidate for candidate in candidate_urls if "baseClass=ilLMPresentationGUI" in candidate),
            None,
        )
 | 
						|
 | 
						|
    def _find_learning_module_prev(self) -> Optional[str]:
        """
        Return the absolute URL of the first left-navigation link that contains
        "baseClass=ilLMPresentationGUI", or None if there is none.
        """
        left_links = self._soup.select("a.ilc_page_lnavlink_LeftNavigationLink")
        # Generators keep this lazy: URLs are resolved only until the first hit.
        candidate_urls = (self._abs_url_from_link(anchor) for anchor in left_links)
        return next(
            (candidate for candidate in candidate_urls if "baseClass=ilLMPresentationGUI" in candidate),
            None,
        )
 | 
						|
 | 
						|
    def get_forum_export_url(self) -> Optional[str]:
        """
        Build the forum export URL for this page.

        The threads-tab link is rewritten to `cmd=post` / `cmdClass=ilExportGUI`, and
        the `rtoken` taken from a form action on the page is appended to it.

        Returns:
            The export URL, or None if the forum link or a usable rtoken is missing.
        """
        forum_link = self._soup.select_one("#tab_forums_threads > a")
        if not forum_link:
            log.explain("Found no forum link")
            return None

        base_url = self._abs_url_from_link(forum_link)
        base_url = re.sub(r"cmd=\w+", "cmd=post", base_url)
        base_url = re.sub(r"cmdClass=\w+", "cmdClass=ilExportGUI", base_url)

        rtoken_form = self._soup.find("form", attrs={"action": lambda x: x is not None and "rtoken=" in x})
        if not rtoken_form:
            log.explain("Found no rtoken anywhere")
            return None

        # The form filter only guarantees the text "rtoken=" is present. The value may
        # still be empty (e.g. "rtoken=&..."), in which case the pattern does not match.
        token_match = re.search(r"rtoken=(\w+)", str(rtoken_form.attrs["action"]))
        if token_match is None:
            log.explain("Found a form mentioning rtoken but could not extract its value")
            return None

        return base_url + "&rtoken=" + token_match.group(1)
 | 
						|
 | 
						|
    def get_next_stage_element(self) -> Optional[IliasPageElement]:
        """
        Return the element to load next when this page is only an intermediate
        stage, or None when the page can be processed as it is.
        """
        if self._is_ilias_opencast_embedding():
            log.explain("Unwrapping opencast embedding")
            children = self.get_child_elements()
            return children[0]

        if self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED:
            log.explain("Unwrapping video pagination")
            paginated_entries = self._find_opencast_video_entries_paginated()
            return paginated_entries[0]

        if self._contains_collapsed_future_meetings():
            log.explain("Requesting *all* future meetings")
            return self._uncollapse_future_meetings_url()

        if self._is_exercise_not_all_shown():
            return self._show_all_exercises()

        if self._is_content_tab_selected():
            return None

        if self._page_type == IliasElementType.INFO_TAB:
            # The info tab is crawled on purpose, so the content tab is not selected
            log.explain("Crawling info tab, skipping content select")
            return None

        log.explain("Selecting content tab")
        return self._select_content_page_url()
 | 
						|
 | 
						|
    def _is_video_player(self) -> bool:
 | 
						|
        return "paella_config_file" in str(self._soup)
 | 
						|
 | 
						|
    def _is_opencast_video_listing(self) -> bool:
        """
        Return True for an opencast embedding, or for a page that contains a
        table whose id matches `tbl_xoct_.+`.
        """
        if not self._is_ilias_opencast_embedding():
            # Raw listing without ILIAS fluff
            listing_table = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+"))
            return listing_table is not None
        return True
 | 
						|
 | 
						|
    def _is_ilias_opencast_embedding(self) -> bool:
        """
        Return True when the element with id "headerimage" has a `src` containing
        "opencast" (compared case-insensitively).

        A missing header image, or one without a `src` attribute, yields False.
        """
        # ILIAS fluff around the real opencast html
        header_image = self._soup.find(id="headerimage")
        if not isinstance(header_image, Tag):
            return False
        # `.get` tolerates an image without `src` instead of raising KeyError
        source = str(header_image.attrs.get("src", ""))
        return "opencast" in source.lower()
 | 
						|
 | 
						|
    def _is_exercise_file(self) -> bool:
        """
        Return True when this page is an exercise page.

        That is the case if the page was reached through an EXERCISE_OVERVIEW
        element, or if the element with id "headerimage" has a `src` containing
        "exc" (compared case-insensitively). A missing header image, or one
        without a `src` attribute, does not count as a hint.
        """
        # we know it from before
        if self._page_type == IliasElementType.EXERCISE_OVERVIEW:
            return True

        # We have no suitable parent - let's guess
        header_image = self._soup.find(id="headerimage")
        if not isinstance(header_image, Tag):
            return False
        # `.get` tolerates an image without `src` instead of raising KeyError
        source = str(header_image.attrs.get("src", ""))
        return "exc" in source.lower()
 | 
						|
 | 
						|
    def _is_personal_desktop(self) -> bool:
 | 
						|
        return "baseclass=ildashboardgui" in self._page_url.lower() and "&cmd=show" in self._page_url.lower()
 | 
						|
 | 
						|
    def _is_content_page(self) -> bool:
        """Return True when this page's permalink contains "/copa/"."""
        permalink = self.get_permalink()
        if not permalink:
            return False
        return "/copa/" in permalink
 | 
						|
 | 
						|
    def _is_learning_module_page(self) -> bool:
        """Return True when this page's permalink contains "target=pg_"."""
        permalink = self.get_permalink()
        if not permalink:
            return False
        return "target=pg_" in permalink
 | 
						|
 | 
						|
    def _contains_collapsed_future_meetings(self) -> bool:
        """Return True when the page offers a link that expands the meeting list."""
        expand_element = self._uncollapse_future_meetings_url()
        return expand_element is not None
 | 
						|
 | 
						|
    def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]:
        """
        Return a FOLDER element named "show all meetings" for the first link whose
        href contains "crs_next_sess=1" or "crs_prev_sess=1", or None if the page
        has no such link.
        """

        def toggles_sessions(href: Optional[str]) -> bool:
            if href is None:
                return False
            return "crs_next_sess=1" in href or "crs_prev_sess=1" in href

        toggle_link = self._soup.find("a", attrs={"href": toggles_sessions})
        if toggle_link:
            target = self._abs_url_from_link(toggle_link)
            return IliasPageElement.create_new(IliasElementType.FOLDER, target, "show all meetings")
        return None
 | 
						|
 | 
						|
    def _is_exercise_not_all_shown(self) -> bool:
        """
        Return True for an EXERCISE_OVERVIEW page whose lowercased URL does not
        contain "mode=all".
        """
        if self._page_type != IliasElementType.EXERCISE_OVERVIEW:
            return False
        lowered_url = self._page_url.lower()
        return "mode=all" not in lowered_url
 | 
						|
 | 
						|
    def _show_all_exercises(self) -> Optional[IliasPageElement]:
        """
        Return an EXERCISE_OVERVIEW element named "show all exercises" that points
        at this page's URL with `mode=all` appended.
        """
        # A URL without any "?" has no query string yet, so the parameter must start
        # one. Appending "&mode=all" there would glue it onto the path instead.
        # URLs that already contain "?" are extended with "&" exactly as before.
        separator = "&" if "?" in self._page_url else "?"
        return IliasPageElement.create_new(
            IliasElementType.EXERCISE_OVERVIEW, f"{self._page_url}{separator}mode=all", "show all exercises"
        )
 | 
						|
 | 
						|
    def _is_content_tab_selected(self) -> bool:
        """Return True when `_select_content_page_url` has nothing to select."""
        pending_selection = self._select_content_page_url()
        return pending_selection is None
 | 
						|
 | 
						|
    def _is_info_tab(self) -> bool:
        """
        Return True when the page was reached through an INFO_TAB element and
        contains a form named "formInfoScreen".
        """
        info_form = self._soup.find("form", attrs={"name": lambda x: x == "formInfoScreen"})
        has_info_form = info_form is not None
        return self._page_type == IliasElementType.INFO_TAB and has_info_form
 | 
						|
 | 
						|
    def _is_course_overview_page(self) -> bool:
 | 
						|
        return "baseClass=ilmembershipoverviewgui" in self._page_url
 | 
						|
 | 
						|
    def _select_content_page_url(self) -> Optional[IliasPageElement]:
        """
        Return a FOLDER element named "select content page" that points at the
        content tab, if that tab exists and its class does not contain "active".

        Returns None when no such tab is found. When the tab is found but has no
        link, warnings are logged and None is returned as well.
        """

        def is_inactive(css_class: Optional[str]) -> bool:
            return css_class is not None and "active" not in css_class

        tab = self._soup.find(id="tab_view_content", attrs={"class": is_inactive})
        # Already selected (or not found)
        if not tab:
            return None

        anchor = tab.find("a")
        if not anchor:
            _unexpected_html_warning()
            log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.")
            log.warn_contd("PFERD might not find content on the course's main page.")
            return None

        target = self._abs_url_from_link(anchor)
        return IliasPageElement.create_new(IliasElementType.FOLDER, target, "select content page")
 | 
						|
 | 
						|
    def _player_to_video(self) -> list[IliasPageElement]:
        """
        Extract the video element(s) from a video player page.

        The stream description is a JSON object embedded in the page source. A
        single stream becomes one element named after the source element. Several
        streams become one element each, sorted by their "content" value, which is
        also added to the file name.

        Returns an empty list (after logging a warning) when the JSON is not found.
        """
        # The page is a small wrapper that initializes a JavaScript player. That JS
        # cannot be executed here, and the stream URL only exists inside a JS object in
        # a script tag. So the JSON object is cut out of the HTML source with a regex.
        stream_pattern = re.compile(r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE)
        found = stream_pattern.search(str(self._soup))
        if found is None:
            log.warn("Could not find JSON stream info in video player. Ignoring video.")
            return []

        player_data = json.loads(found.group(1))
        streams = list(player_data["streams"])

        def mp4_source(stream: dict) -> str:
            # First mp4 source listed for the stream
            return stream["sources"]["mp4"][0]["src"]

        # A lone stream keeps the plain source name
        if len(streams) == 1:
            only_url = mp4_source(streams[0])
            return [IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO, only_url, self._source_name)]

        log.explain(f"Found multiple videos for stream at {self._source_name}")
        base_name = self._source_name.replace(".mp4", "")
        return [
            IliasPageElement.create_new(
                IliasElementType.OPENCAST_VIDEO,
                mp4_source(entry),
                f"{base_name} ({entry['content']}).mp4",
            )
            for entry in sorted(streams, key=lambda candidate: candidate["content"])
        ]
 | 
						|
 | 
						|
    def _get_show_max_forum_entries_per_page_url(
        self, wanted_max: Optional[int] = None
    ) -> Optional[IliasPageElement]:
        """
        Finds the link that shows 800 forum threads per page and returns it as a FORUM
        element, or None if the page has no such link.

        If wanted_max is given, the "trows=800" part of the URL is replaced with that value.
        """

        def is_show_all_href(href: Optional[str]) -> bool:
            return href is not None and "trows=800" in href and "cmd=showThreads" in href

        show_all_anchor = self._soup.find("a", attrs={"href": is_show_all_href})
        if not show_all_anchor:
            return None

        url = self._abs_url_from_link(show_all_anchor)
        if wanted_max is not None:
            url = url.replace("trows=800", f"trows={wanted_max}")

        return IliasPageElement.create_new(IliasElementType.FORUM, url, "show all forum threads")
 | 
						|
 | 
						|
    def _get_forum_thread_count(self) -> Optional[int]:
        """
        Tries to read the thread count from the ".ilTableFootLight" elements of the page.

        The count is the number directly in front of a closing parenthesis in the text of
        the first element that has one. Returns None if no element has one.
        """
        log.explain_topic("Trying to find forum thread count")

        count_pattern = re.compile(r"\s(?P<max>\d+)\s*\)")
        footers = cast(list[Tag], self._soup.select(".ilTableFootLight"))

        for footer in footers:
            log.explain(f"Found thread count candidate: {footer}")
            found = count_pattern.search(footer.get_text())
            if found:
                return int(found.group("max"))

        # Only reached if none of the candidates contained a count
        log.explain("Found no candidates to extract thread count from")
        return None
 | 
						|
 | 
						|
    def _find_personal_desktop_entries(self) -> list[IliasPageElement]:
        """
        Collects the entries listed under "#block_pditems_0 .il-item-title".

        Titles without a link are logged as offline items and skipped, the manage link
        of the block is ignored and entries whose type can not be determined are skipped
        with a warning.
        """
        entries: list[IliasPageElement] = []

        title_tags: list[Tag] = self._soup.select("#block_pditems_0 .il-item-title")
        for title_tag in title_tags:
            anchor = title_tag.find("a")
            if not anchor:
                log.explain(f"Skipping offline item: {title_tag.get_text().strip()!r}")
                continue

            name = sanitize_path_name(anchor.text.strip())
            url = self._abs_url_from_link(anchor)

            # Configure button/link does not have anything interesting
            is_manage_link = "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url
            if is_manage_link:
                continue

            typ = IliasPage._find_type_for_element(
                name, url, lambda: IliasPage._find_icon_for_folder_entry(cast(Tag, anchor))
            )
            if typ:
                log.explain(f"Found {name!r} of type {typ}")
                entries.append(IliasPageElement.create_new(typ, url, name))
            else:
                _unexpected_html_warning()
                log.warn_contd(f"Could not extract type for {anchor}")

        return entries
 | 
						|
 | 
						|
    def _find_copa_entries(self) -> list[IliasPageElement]:
        """
        Collects the file links (class "ilc_flist_a_FileListItemLink") of a content page.

        The name is the link text without the "(<size> MB/KB)" part and without tabs.
        Links whose URL does not contain "file_id" are skipped with a warning.
        """
        size_pattern = re.compile(r"\([\d,.]+ [MK]B\)")
        entries: list[IliasPageElement] = []

        for anchor in cast(list[Tag], self._soup.find_all(class_="ilc_flist_a_FileListItemLink")):
            url = self._abs_url_from_link(anchor)
            raw_name = size_pattern.sub("", anchor.get_text()).strip().replace("\t", "")
            name = sanitize_path_name(raw_name)

            if "file_id" in url:
                entries.append(IliasPageElement.create_new(IliasElementType.FILE, url, name))
                continue

            _unexpected_html_warning()
            log.warn_contd(f"Found unknown content page item {name!r} with url {url!r}")

        return entries
 | 
						|
 | 
						|
    def _find_info_tab_entries(self) -> list[IliasPageElement]:
        """
        Collects the "a.il_ContainerItemCommand" links of an info tab whose href contains
        both "cmdClass=ilobjcoursegui" and "cmd=sendfile" and returns them as FILE elements.
        """
        entries = []

        anchors: list[Tag] = self._soup.select("a.il_ContainerItemCommand")
        for anchor in anchors:
            is_course_command = "cmdClass=ilobjcoursegui" in anchor["href"]
            if not is_course_command or "cmd=sendfile" not in anchor["href"]:
                continue

            file_url = self._abs_url_from_link(anchor)
            file_name = sanitize_path_name(anchor.get_text())
            entries.append(IliasPageElement.create_new(IliasElementType.FILE, file_url, file_name))

        return entries
 | 
						|
 | 
						|
    def _find_opencast_video_entries(self) -> list[IliasPageElement]:
        """
        Figures out which stage of a video page this is and delegates accordingly.

        ILIAS has three stages for video pages:
        1. The initial dummy page without any videos. This page contains the link to the listing
        2. The video listing which might be paginated
        3. An unpaginated video listing (or at least one that includes 800 videos)
        """
        has_video_table = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) is not None

        if not has_video_table:
            # Stage 1: the page is actually empty but contains the link to stage 2
            series_anchor: Tag = cast(Tag, self._soup.select_one("#tab_series a"))
            listing_url: str = url_set_query_params(
                self._abs_url_from_link(series_anchor),
                {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"},
            )
            log.explain("Found ILIAS video frame page, fetching actual content next")
            return [
                IliasPageElement.create_new(
                    IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, listing_url, ""
                )
            ]

        has_page_selector = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None
        needs_unpaging = has_page_selector and self._page_type != IliasElementType.OPENCAST_VIDEO_FOLDER

        if needs_unpaging:
            # Stage 2: try to break pagination
            return self._find_opencast_video_entries_paginated()

        return self._find_opencast_video_entries_no_paging()
 | 
						|
 | 
						|
    def _find_opencast_video_entries_paginated(self) -> list[IliasPageElement]:
        """
        Builds a URL for the current page that requests 800 rows of the video table and
        returns it as a single OPENCAST_VIDEO_FOLDER element.

        Falls back to crawling the page as-is (with a warning) if the table or its id
        can not be found.
        """
        table = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+"))
        if table is None:
            log.warn("Couldn't increase elements per page (table not found). I might miss elements.")
            return self._find_opencast_video_entries_no_paging()

        table_id_match = re.match(r"tbl_xoct_(.+)", cast(str, table.attrs["id"]))
        if table_id_match is None:
            log.warn("Couldn't increase elements per page (table id not found). I might miss elements.")
            return self._find_opencast_video_entries_no_paging()

        rows_parameter = f"tbl_xoct_{table_id_match.group(1)}_trows"
        unpaged_url = url_set_query_params(
            self._page_url,
            {rows_parameter: "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"},
        )

        log.explain("Disabled pagination, retrying folder as a new entry")
        return [IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO_FOLDER, unpaged_url, "")]
 | 
						|
 | 
						|
    def _find_opencast_video_entries_no_paging(self) -> list[IliasPageElement]:
        """
        Crawls the video listing page and converts every video start link on it
        into an element.
        """
        # Video start links are the anchors with an "Abspielen" or "Play" text
        play_pattern = re.compile(r"\s*(Abspielen|Play)\s*")
        play_anchors = cast(list[Tag], self._soup.find_all(name="a", text=play_pattern))

        return [self._listed_opencast_video_to_element(anchor) for anchor in play_anchors]
 | 
						|
 | 
						|
    def _listed_opencast_video_to_element(self, link: Tag) -> IliasPageElement:
        """
        Converts a video start link of the video listing into an OPENCAST_VIDEO_PLAYER element.

        The link is part of a table row with multiple columns describing metadata. The
        name is taken from the third column (with ".mp4" appended). The modification time
        is taken from the last column that contains something looking like a
        "day.month.year hour:minute" date. If there is none, a warning is logged and the
        current time is used instead.
        """
        date_pattern = re.compile(r"\d+\.\d+\.\d+ \d+:\d+")

        # 6th or 7th child (1 indexed) is the modification time string. Try to find it
        # by parsing backwards from the end and finding something that looks like a date
        modification_time = None
        row: Tag = link.parent.parent.parent  # type: ignore
        column_count = len(row.select("td.std"))
        for index in range(column_count, 0, -1):
            cell = row.select_one(f"td.std:nth-child({index})")
            if cell is None:
                # nth-child counts all siblings, so not every index has to hit a td.std
                continue
            match = date_pattern.search(cell.get_text().strip())
            if match is None:
                continue
            try:
                modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
            except ValueError:
                # Digits in the right shape but not an actual date, keep looking
                continue
            break

        if modification_time is None:
            log.warn(f"Could not determine upload time for {link}")
            modification_time = datetime.now()

        title = cast(Tag, row.select_one("td.std:nth-child(3)")).get_text().strip()
        title += ".mp4"

        video_name: str = sanitize_path_name(title)

        video_url = self._abs_url_from_link(link)

        log.explain(f"Found video {video_name!r} at {video_url}")
        return IliasPageElement.create_new(
            IliasElementType.OPENCAST_VIDEO_PLAYER, video_url, video_name, modification_time
        )
 | 
						|
 | 
						|
    def _find_exercise_entries(self) -> list[IliasPageElement]:
        """
        Decides which kind of exercise page this is and delegates to the matching crawler:
        no submission tab means root page, an active submission tab means files page and
        an inactive submission tab means details page.
        """
        if not self._soup.find(id="tab_submission"):
            log.explain("Found no submission tab. This is an exercise root page")
            return self._find_exercise_entries_root_page()

        log.explain("Found submission tab. This is an exercise detail or files page")
        if self._soup.select_one("#tab_submission.active") is not None:
            log.explain("  This is a files page")
            return self._find_exercise_entries_files_page()

        log.explain("  This is a details page")
        return self._find_exercise_entries_detail_page()
 | 
						|
 | 
						|
    def _find_exercise_entries_detail_page(self) -> list[IliasPageElement]:
        """
        Crawls an exercise details page.

        Returns the link of the submission tab (if present) as an EXERCISE_FILES element
        named "Submission", followed by one FILE element per "Download" link on the page.
        Download links without a name tag are skipped with a warning.
        """

        def is_download_href(href: Optional[str]) -> bool:
            return href is not None and "cmd=download" in href

        def is_row_class(classes: Optional[str]) -> bool:
            return classes is not None and "row" in classes

        entries: list[IliasPageElement] = []

        submission_anchor = self._soup.select_one("#tab_submission > a")
        if submission_anchor:
            submission_url = self._abs_url_from_link(submission_anchor)
            entries.append(
                IliasPageElement.create_new(IliasElementType.EXERCISE_FILES, submission_url, "Submission")
            )
        else:
            log.explain("Found no submission link for exercise, maybe it has not started yet?")

        # Find all download links in the container (this will contain all the *feedback* files)
        feedback_anchors = cast(
            list[Tag],
            self._soup.find_all(name="a", attrs={"href": is_download_href}, text="Download"),
        )

        for anchor in feedback_anchors:
            row: Tag = cast(Tag, anchor.find_parent(attrs={"class": is_row_class}))
            name_tag = row.find(name="div")

            if not name_tag:
                log.warn("Could not find name tag for exercise entry")
                _unexpected_html_warning()
                continue

            name = sanitize_path_name(name_tag.get_text().strip())
            log.explain(f"Found exercise detail entry {name!r}")

            file_url = self._abs_url_from_link(anchor)
            entries.append(IliasPageElement.create_new(IliasElementType.FILE, file_url, name))

        return entries
 | 
						|
 | 
						|
    def _find_exercise_entries_files_page(self) -> list[IliasPageElement]:
        """
        Crawls an exercise files page and returns one FILE element per "Download" link.

        The name is read from the second cell of the link's table row. The date is the
        first one that can be parsed when walking the cells from last to first. A warning
        is logged if none can be parsed.
        """

        def is_download_href(href: Optional[str]) -> bool:
            return href is not None and "cmd=download" in href

        entries: list[IliasPageElement] = []

        # Find all download links in the container
        download_anchors = cast(
            list[Tag],
            self._soup.find_all(name="a", attrs={"href": is_download_href}, text="Download"),
        )

        for anchor in download_anchors:
            row: Tag = cast(Tag, anchor.find_parent("tr"))
            cells = cast(list[Tag], row.find_all("td"))

            name = sanitize_path_name(cells[1].get_text().strip())
            log.explain(f"Found exercise file entry {name!r}")

            # Lazily parse the cells back to front and stop at the first usable date
            parsed_dates = (
                demangle_date(cell.get_text().strip(), fail_silently=True) for cell in reversed(cells)
            )
            mtime = next((parsed for parsed in parsed_dates if parsed is not None), None)
            if mtime is None:
                log.warn(f"Date parsing failed for exercise file entry {name!r}")

            file_url = self._abs_url_from_link(anchor)
            entries.append(IliasPageElement.create_new(IliasElementType.FILE, file_url, name, mtime))

        return entries
 | 
						|
 | 
						|
    def _find_exercise_entries_root_page(self) -> list[IliasPageElement]:
        """
        Crawls an exercise root page and returns one EXERCISE element per assignment link.

        Only ".il-item-title a" links inside "#ilContentContainer" are considered, and
        only if their href contains "ass_id=" and (case-insensitively)
        "cmdclass=ilassignmentpresentationgui". Returns an empty list (after a warning)
        if the content container is missing.
        """
        results: list[IliasPageElement] = []

        content_tab = self._soup.find(id="ilContentContainer")
        if not content_tab:
            log.warn("Could not find content tab in exercise overview page")
            _unexpected_html_warning()
            return []

        exercise_links = content_tab.select(".il-item-title a")

        for exercise in cast(list[Tag], exercise_links):
            href = exercise.attrs.get("href")
            # Skips links without a href and href values that are not strings
            if not isinstance(href, str):
                continue
            if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower():
                continue

            name = sanitize_path_name(exercise.get_text().strip())
            results.append(
                IliasPageElement.create_new(
                    IliasElementType.EXERCISE, self._abs_url_from_link(exercise), name
                )
            )

        for result in results:
            log.explain(f"Found exercise {result.name!r}")

        return results
 | 
						|
 | 
						|
    def _find_normal_entries(self) -> list[IliasPageElement]:
        """
        Crawls the item links of a regular page and appends the cards, mediacast videos
        and mob videos found on it.

        The link selector depends on whether this is a course overview page. Element
        names are prefixed with the virtual folders (accordions, blocks) the link is in.
        Links of unknown type and non-meeting links inside an expanded meeting are skipped.
        """
        found: list[IliasPageElement] = []

        # Fetch all links and throw them to the general interpreter
        if self._is_course_overview_page():
            log.explain("Page is a course overview page, adjusting link selector")
            selector = ".il-item-title > a"
        else:
            selector = "a.il_ContainerItemTitle"
        anchors: list[Tag] = list(self._soup.select(selector))

        for anchor in anchors:
            abs_url = self._abs_url_from_link(anchor)
            # Make sure parents are sanitized. We do not want accidental parents
            path_parts = [sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(anchor)]
            path_parts.append(sanitize_path_name(anchor.get_text()))
            element_name = "/".join(path_parts)

            element_type = IliasPage._find_type_for_element(
                element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(anchor)
            )
            description = IliasPage._find_link_description(anchor)

            # The last meeting on every page is expanded by default.
            # Its content is then shown inline *and* in the meeting page itself.
            # We should skip the inline content.
            if element_type != IliasElementType.MEETING and self._is_in_expanded_meeting(anchor):
                continue

            if not element_type:
                continue

            if element_type == IliasElementType.FILE:
                found.append(IliasPage._file_to_element(element_name, abs_url, anchor))
            else:
                log.explain(f"Found {element_name!r} of type {element_type}")
                found.append(
                    IliasPageElement.create_new(
                        element_type, abs_url, element_name, description=description, skip_sanitize=True
                    )
                )

        found.extend(self._find_cards())
        found.extend(self._find_mediacast_videos())
        found.extend(self._find_mob_videos())

        return found
 | 
						|
 | 
						|
    def _find_mediacast_videos(self) -> list[IliasPageElement]:
        """
        Extracts mediacast videos from the "il.VideoPlaylist.init" calls in the script
        tags of the page.

        The playlist passed to each call is parsed as JSON. Playlists that can not be
        decoded are skipped with a warning. Every playlist entry becomes a
        MEDIACAST_VIDEO element, with ".mp4" appended to titles that end in neither
        ".mp4" nor ".webm". Incomplete entries are reported, and entries without a
        title or URL are skipped as they can not be turned into an element.
        """
        videos: list[IliasPageElement] = []

        regex = re.compile(r"il\.VideoPlaylist\.init.+?\[(.+?)], ")
        for script in cast(list[Tag], self._soup.find_all("script")):
            for match in regex.finditer(script.text):
                try:
                    playlist = json.loads("[" + match.group(1) + "]")
                except json.JSONDecodeError:
                    log.warn("Could not decode playlist json")
                    log.warn_contd(f"Playlist json: [{match.group(1)}]")
                    continue
                for elem in playlist:
                    title = elem.get("title", None)
                    description = elem.get("description", None)
                    url = elem.get("resource", None)
                    if title is None or description is None or url is None:
                        log.explain(f"Mediacast json: {match.group(1)}")
                        log.warn("Mediacast video json was not complete")
                    if title is None:
                        log.warn_contd("Missing title")
                    if description is None:
                        log.warn_contd("Missing description")
                    if url is None:
                        log.warn_contd("Missing URL")

                    if title is None or url is None:
                        # Name and URL are required below, a missing description is not
                        continue

                    if not title.endswith((".mp4", ".webm")):
                        # just to make sure it has some kinda-alrightish ending
                        title = title + ".mp4"
                    videos.append(
                        IliasPageElement.create_new(
                            typ=IliasElementType.MEDIACAST_VIDEO,
                            url=self._abs_url_from_relative(cast(str, url)),
                            name=sanitize_path_name(title),
                        )
                    )

        return videos
 | 
						|
 | 
						|
    def _find_mob_videos(self) -> list[IliasPageElement]:
        """
        Collects the videos embedded in media container figures as MOB_VIDEO elements.

        Figures without a <video> element are ignored, videos without a URL are skipped
        with a warning and videos hosted on a different hostname than the page are
        skipped as external.
        """
        found: list[IliasPageElement] = []

        figure_selectors = [
            "figure.ilc_media_cont_MediaContainerHighlighted",
            "figure.ilc_media_cont_MediaContainer",
        ]
        for figure in self._soup.select(",".join(figure_selectors)):
            video_element = figure.select_one("video")
            if not video_element:
                continue

            url, title = self._find_mob_video_url_title(video_element, figure)

            if url is None:
                _unexpected_html_warning()
                log.warn_contd(f"No <source> element found for mob video '{title}'")
                continue

            video_host = urlparse(url).hostname
            page_host = urlparse(self._page_url).hostname
            if video_host != page_host:
                log.explain(f"Found external video at {url}, ignoring")
                continue

            found.append(
                IliasPageElement.create_new(
                    typ=IliasElementType.MOB_VIDEO, url=url, name=sanitize_path_name(title), mtime=None
                )
            )

        return found
 | 
						|
 | 
						|
    def _find_mob_video_url_title(self, video_element: Tag, figure: Tag) -> tuple[Optional[str], str]:
        """
        Determines the URL and title of a video embedded in a figure.

        The URL is the "src" of the first "video/mp4" <source>, falling back to the "src"
        of the <video> element itself. It is returned as an absolute URL, or None if
        there is none. The title is the figure caption plus ".mp4", otherwise the last
        path segment of the URL, otherwise a placeholder containing the figure.
        """
        mp4_sources = (
            source for source in video_element.select("source") if source.get("type", "") == "video/mp4"
        )
        first_mp4_source = next(mp4_sources, None)

        raw_url: Optional[str] = None
        if first_mp4_source is not None:
            raw_url = cast(Optional[str], first_mp4_source.get("src"))
        if raw_url is None and video_element.get("src"):
            raw_url = cast(Optional[str], video_element.get("src"))

        caption = figure.select_one("figcaption")
        if caption:
            title = caption.get_text().strip() + ".mp4"
        elif raw_url is not None:
            url_path = urlparse(self._abs_url_from_relative(raw_url)).path
            title = url_path.rsplit("/", 1)[-1]
        else:
            title = f"unknown video {figure}"

        # Falsy URLs (None or empty) are passed through untouched
        final_url = self._abs_url_from_relative(raw_url) if raw_url else raw_url

        return final_url, title
 | 
						|
 | 
						|
    def _is_in_expanded_meeting(self, tag: Tag) -> bool:
        """
        Returns whether a file is part of an expanded meeting.

        The closest ancestor with the "ilContainerListItemContentCB" class decides: the
        first link inside that ancestor's parent is classified and the result is whether
        it is a meeting.

        Has false positives for meetings themselves as their title is also "in the expanded
        meeting content". It is in the same general div and this whole thing is guesswork.
        Therefore, you should check for meetings before passing them in this function.
        """
        for ancestor in tag.parents:
            css_classes = ancestor.get("class")
            if not css_classes:
                continue
            if "ilContainerListItemContentCB" not in cast(str, css_classes):
                continue

            # We should not crawl files under meetings
            container_link: Tag = cast(Tag, cast(Tag, ancestor.parent).find("a"))
            container_type = IliasPage._find_type_for_element(
                "meeting",
                self._abs_url_from_link(container_link),
                lambda: IliasPage._find_icon_for_folder_entry(container_link),
            )
            return container_type == IliasElementType.MEETING

        return False
 | 
						|
 | 
						|
    @staticmethod
    def _find_upwards_folder_hierarchy(tag: Tag) -> list[str]:
        """
        Interprets accordions and expandable blocks as virtual folders and returns them
        in order. This allows us to find a file named "Test" in an accordion "Acc" as "Acc/Test"

        The ancestors of the tag are walked from the inside out. Block titles are collected
        until the first real accordion is found, whose title is added last. The returned
        names are sanitized and ordered from outermost to innermost.
        """
        found_titles = []

        outer_accordion_content: Optional[Tag] = None

        for parent in tag.parents:
            classes = parent.get("class")
            if not classes:
                continue

            # ILIAS has proper accordions and weird blocks that look like normal headings,
            # but some JS later transforms them into an accordion.

            # This is for these weird JS-y blocks and custom item groups
            if "ilContainerItemsContainer" in cast(str, classes):
                data_store_url = cast(str, cast(Tag, parent.parent).get("data-store-url", "")).lower()
                is_custom_item_group = (
                    "baseclass=ilcontainerblockpropertiesstoragegui" in data_store_url
                    and "cont_block_id=" in data_store_url
                )
                # I am currently under the impression that *only* those JS blocks have an
                # ilNoDisplay class.
                if not is_custom_item_group and "ilNoDisplay" not in cast(str, classes):
                    continue
                prev = parent.find_previous_sibling("div")
                # The header (or its heading) may be missing; such blocks add no folder
                if isinstance(prev, Tag) and "ilContainerBlockHeader" in (prev.get("class") or []):
                    heading = prev.find("h3")
                    if heading is None:
                        heading = prev.find("h2")
                    if heading is not None:
                        found_titles.append(heading.get_text().strip())

            # And this for real accordions
            if "il_VAccordionContentDef" in cast(str, classes):
                outer_accordion_content = parent
                break

        if outer_accordion_content:
            accordion_tag = cast(Tag, outer_accordion_content.parent)
            head_tag = accordion_tag.find(
                attrs={
                    "class": lambda x: x is not None
                    and ("ilc_va_ihead_VAccordIHead" in x or "ilc_va_ihead_AccordIHead" in x)
                }
            )
            # An accordion without a recognizable head adds no folder
            if head_tag is not None:
                found_titles.append(head_tag.get_text().strip())

        return [sanitize_path_name(x) for x in reversed(found_titles)]
 | 
						|
 | 
						|
    @staticmethod
    def _find_link_description(link: Tag) -> Optional[str]:
        """
        Returns the stripped text of the "il_Description" div inside the
        "il_ContainerListItem" div the link belongs to, or None if either is missing.
        """
        list_item = cast(
            Tag, link.find_parent("div", {"class": lambda x: x is not None and "il_ContainerListItem" in x})
        )
        if not list_item:
            return None

        description_div = cast(
            Tag, list_item.find("div", {"class": lambda x: x is not None and "il_Description" in x})
        )
        return description_div.get_text().strip() if description_div else None
 | 
						|
 | 
						|
    @staticmethod
    def _file_to_element(name: str, url: str, link_element: Tag) -> IliasPageElement:
        """
        Builds a FILE element for a file link.

        Files have a list of properties (type, modification date, size, etc.) in a series
        of elements below ".il_ItemProperties". The first property is used as the file
        extension, and the modification date is searched for in the text of all properties.
        """
        list_item = cast(
            Tag,
            link_element.find_parent(
                "div", {"class": lambda x: x is not None and "il_ContainerListItem" in x}
            ),
        )
        # The parent containing all property elements, so we can filter out what we need
        properties_parent = cast(Tag, list_item.select_one(".il_ItemProperties"))

        # The first one is always the filetype
        first_property = cast(Tag, properties_parent.select_one("span.il_ItemProperty"))
        file_type = first_property.get_text().strip()

        # The rest does not have a stable order. Grab the whole text and reg-ex the date
        # out of it
        modification_date = IliasPage._find_date_in_text(properties_parent.get_text().strip())
        if modification_date is None:
            log.explain(f"Element {name} at {url} has no date.")

        # The name comes from the link text, the extension from the properties
        full_path = f"{name}.{file_type}"

        log.explain(f"Found file {full_path!r}")
        return IliasPageElement.create_new(
            IliasElementType.FILE, url, full_path, modification_date, skip_sanitize=True
        )
 | 
						|
 | 
						|
    def _find_cards(self) -> list[IliasPageElement]:
        """
        Collect all page elements that are displayed as cards.

        Two kinds of card titles are handled: links (".card-title a") and buttons
        (".card-title button"). A button does not carry its target itself. Instead the
        serialized page is searched for the signal that is triggered for the button's id and
        then for the "window.open" call registered for that signal, which holds the URL.

        Cards whose id, signal, target or type can not be determined are skipped with a
        warning.
        """
        result: list[IliasPageElement] = []

        card_titles: list[Tag] = self._soup.select(".card-title a")

        for title in card_titles:
            url = self._abs_url_from_link(title)
            name = sanitize_path_name(title.get_text().strip())
            typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(title))

            if not typ:
                _unexpected_html_warning()
                log.warn_contd(f"Could not extract type for {title}")
                continue

            result.append(IliasPageElement.create_new(typ, url, name))

        card_button_tiles: list[Tag] = self._soup.select(".card-title button")
        if not card_button_tiles:
            return result

        # Serializing the whole document is not free, so do it once instead of twice per button
        page_source = str(self._soup)

        for button in card_button_tiles:
            button_id = button.get("id")
            if button_id is None:
                _unexpected_html_warning()
                log.warn_contd(f"Could not find an id for {button}")
                continue

            # The id and the signal name are meant literally, so they must be escaped before
            # they become part of a regular expression
            signal_regex = re.compile("#" + re.escape(str(button_id)) + r"[\s\S]*?\.trigger\('(.+?)'")
            signal_match = signal_regex.search(page_source)
            if not signal_match:
                _unexpected_html_warning()
                log.warn_contd(f"Could not find click handler signal for {button}")
                continue
            signal = signal_match.group(1)

            open_regex = re.compile(r"\.on\('" + re.escape(signal) + r"[\s\S]*?window.open\(['\"](.+?)['\"]")
            open_match = open_regex.search(page_source)
            if not open_match:
                _unexpected_html_warning()
                log.warn_contd(f"Could not find click handler target for signal {signal} for {button}")
                continue

            url = self._abs_url_from_relative(open_match.group(1))
            name = sanitize_path_name(button.get_text().strip())
            typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(button))

            if not typ:
                _unexpected_html_warning()
                log.warn_contd(f"Could not extract type for {button}")
                continue

            # The description is the div following the caption. Both are optional, a card
            # without them simply has no description.
            caption_parent = button.find_parent(
                "div",
                attrs={"class": lambda x: x is not None and "caption" in x},
            )
            caption_container = caption_parent.find_next_sibling("div") if caption_parent else None
            description = caption_container.get_text().strip() if caption_container else None

            result.append(IliasPageElement.create_new(typ, url, name, description=description))

        return result
 | 
						|
 | 
						|
    @staticmethod
 | 
						|
    def _find_type_for_element(
 | 
						|
        element_name: str,
 | 
						|
        url: str,
 | 
						|
        icon_for_element: Callable[[], Optional[Tag]],
 | 
						|
    ) -> Optional[IliasElementType]:
 | 
						|
        """
 | 
						|
        Decides which sub crawler to use for a given top level element.
 | 
						|
        """
 | 
						|
        parsed_url = urlparse(url)
 | 
						|
        icon = icon_for_element()
 | 
						|
 | 
						|
        def try_matcher(matcher: IliasElementMatcher) -> bool:
 | 
						|
            match matcher:
 | 
						|
                case TypeMatcher.All(matchers=ms):
 | 
						|
                    return all(try_matcher(m) for m in ms)
 | 
						|
                case TypeMatcher.Any(matchers=ms):
 | 
						|
                    return any(try_matcher(m) for m in ms)
 | 
						|
                case TypeMatcher.ImgAlt(alt=alt):
 | 
						|
                    return icon is not None and alt in str(icon["alt"]).lower()
 | 
						|
                case TypeMatcher.ImgSrc(src=src):
 | 
						|
                    return icon is not None and src in str(icon["src"]).lower()
 | 
						|
                case TypeMatcher.UrlPath(path=path):
 | 
						|
                    return path in parsed_url.path.lower()
 | 
						|
                case TypeMatcher.UrlParameter(query=query):
 | 
						|
                    return query in parsed_url.query.lower()
 | 
						|
 | 
						|
            raise CrawlError(f"Unknown matcher {matcher}")
 | 
						|
 | 
						|
        for typ in IliasElementType:
 | 
						|
            if try_matcher(typ.matcher()):
 | 
						|
                return typ
 | 
						|
 | 
						|
        _unexpected_html_warning()
 | 
						|
        log.warn_contd(f"Tried to figure out element type, but failed for {element_name!r} / {url!r})")
 | 
						|
 | 
						|
        if "ref_id=" in parsed_url.query.lower() or "goto.php" in parsed_url.path.lower():
 | 
						|
            log.warn_contd("Defaulting to FOLDER as it contains a ref_id/goto")
 | 
						|
            return IliasElementType.FOLDER
 | 
						|
 | 
						|
        return None
 | 
						|
 | 
						|
    @staticmethod
    def _find_icon_for_folder_entry(link_element: Tag) -> Optional[Tag]:
        """
        Find the icon image that belongs to a link in a folder listing.

        The nearest ancestor of the link with the class "ilContainerListItemOuter" or
        "il-std-item" is searched for an "img.ilListItemIcon" and, failing that, an
        "img.icon".

        Returns the image tag or None if there is no such ancestor or it has no icon. A
        missing ancestor is reported as a warning, a missing image only as an explanation.
        """
        container_classes = ("ilContainerListItemOuter", "il-std-item")

        def is_entry_container(element: Tag) -> bool:
            # Ancestors without a class attribute must be skipped rather than indexed:
            # element["class"] would raise a KeyError for them.
            classes = element.get("class") or []
            return any(css_class in classes for css_class in container_classes)

        # We look for the outer div of our inner link, to find information around it
        # (mostly the icon)
        found_parent: Optional[Tag] = next(
            (parent for parent in link_element.parents if is_entry_container(parent)), None
        )

        if found_parent is None:
            _unexpected_html_warning()
            log.warn_contd(f"Tried to figure out element type, but did not find an icon for {link_element!r}")
            return None

        # Find the small descriptive icon to figure out the type
        img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon")
        if img_tag is None:
            img_tag = found_parent.select_one("img.icon")

        if img_tag is not None:
            return img_tag

        is_session_expansion_button = found_parent.find(
            "a", attrs={"href": lambda x: x is not None and ("crs_next_sess=" in x or "crs_prev_sess=" in x)}
        )
        if is_session_expansion_button:
            log.explain("Found session expansion button, skipping it as it has no content")
            return None

        log.explain(f"Tried to figure out element type, but did not find an image for {link_element!r}")
        return None
 | 
						|
 | 
						|
    @staticmethod
    def _find_icon_from_card(card_title: Tag) -> Optional[Tag]:
        """
        Find the icon of the card a card title belongs to.

        The card root is the nearest ancestor of the title that has both the "il-card" and
        the "thumbnail" class. The icon is the first element matching
        ".il-card-repository-head .icon" inside of it.

        Returns the icon or None if the card root or the icon can not be found. A missing
        card root is additionally reported as a warning.
        """

        def is_card_root(element: Tag) -> bool:
            # Ancestors without a class attribute must be skipped rather than indexed:
            # element["class"] would raise a KeyError for them.
            classes = element.get("class") or []
            return "il-card" in classes and "thumbnail" in classes

        # We look for the card root
        card_root: Optional[Tag] = next(
            (parent for parent in card_title.parents if is_card_root(parent)), None
        )

        if card_root is None:
            _unexpected_html_warning()
            log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}")
            return None

        # select_one returns None if the card has no icon, which callers have to expect
        return card_root.select_one(".il-card-repository-head .icon")
 | 
						|
 | 
						|
    @staticmethod
    def is_logged_in(ilias_soup: IliasSoup) -> bool:
        """
        Guess from the content of a page whether it was delivered to a logged in user.

        The following checks are made in order:
        - pages with an "il-maincontrols-metabar" count as logged in unless the bar contains
          a link to "login.php" or the page has an element with the id "button_shib_login"
        - pages linking to "block_type=pditems" count as logged in
        - pages with an ".alert-info" stating that no favourites were selected yet (English
          or German) count as logged in
        - pages with a table whose id starts with "tbl_xoct" count as logged in
        - any other page counts as logged in only if it has a "#playerContainer"
        """
        soup = ilias_soup.soup

        def href_contains(fragment: str) -> Callable[[Optional[str]], bool]:
            return lambda href: href is not None and fragment in href

        # Normal ILIAS pages
        metabar = soup.find(class_="il-maincontrols-metabar")
        if metabar is not None:
            login_link = metabar.find(attrs={"href": href_contains("login.php")})
            shib_button = soup.find(id="button_shib_login")
            return not (login_link or shib_button)

        # Personal Desktop
        if soup.find("a", attrs={"href": href_contains("block_type=pditems")}):
            return True

        # Empty personal desktop has zero (0) markers. Match on the text...
        info_box = soup.select_one(".alert-info")
        if info_box:
            info_text = info_box.get_text().lower()
            no_favourites_hints = (
                "you have not yet selected any favourites",
                "sie haben aktuell noch keine favoriten ausgewählt",
            )
            if any(hint in info_text for hint in no_favourites_hints):
                return True

        # Video listing embeds do not have complete ILIAS html. Try to match them by
        # their video listing table
        video_table = soup.find(
            recursive=True,
            name="table",
            attrs={"id": lambda table_id: table_id is not None and table_id.startswith("tbl_xoct")},
        )

        # The individual video player wrapper page has nothing of the above.
        # Match it by its playerContainer.
        return video_table is not None or soup.select_one("#playerContainer") is not None
 | 
						|
 | 
						|
    @staticmethod
    def _find_date_in_text(text: str) -> Optional[datetime]:
        """
        Extract the first date with a time of day from a piece of text.

        Recognized are dates of the form "<day>. <month> <year>, <hour>:<minute>" as well as
        the relative forms "Gestern"/"Yesterday", "Heute"/"Today" and "Morgen"/"Tomorrow"
        followed by ", <hour>:<minute>". The matched part is handed to demangle_date.

        Returns None if the text contains no such date.
        """
        date_pattern = r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)"
        if (found := re.search(date_pattern, text)) is None:
            return None
        return demangle_date(found.group(1))
 | 
						|
 | 
						|
    def get_permalink(self) -> Optional[str]:
        """
        Return the permalink found in this page's soup, as determined by get_soup_permalink.
        """
        permalink = IliasPage.get_soup_permalink(self._ilias_soup)
        return permalink
 | 
						|
 | 
						|
    def _abs_url_from_link(self, link_tag: Tag) -> str:
        """
        Resolve the href of an <a> tag to an absolute URL, relative to the URL of this page.
        """
        href = cast(str, link_tag.get("href"))
        return self._abs_url_from_relative(href)
 | 
						|
 | 
						|
    def _abs_url_from_relative(self, relative_url: str) -> str:
 | 
						|
        """
 | 
						|
        Create an absolute url from a relative URL.
 | 
						|
        """
 | 
						|
        return urljoin(self._page_url, relative_url)
 | 
						|
 | 
						|
    @staticmethod
    def get_soup_permalink(ilias_soup: IliasSoup) -> Optional[str]:
        """
        Extract the permalink of a page from its scripts.

        The script tags of the page are searched in document order for a call of the form
        il.Footer.permalink.copyText("<url>"). The URL of the first such call is returned
        with its escaped slashes ("\\/") turned back into plain ones.

        Returns None if no script contains such a call.
        """
        copy_call = re.compile(r"il\.Footer\.permalink\.copyText\(\"(.+?)\"\)")

        for script in cast(list[Tag], ilias_soup.soup.find_all("script")):
            found = copy_call.search(script.text)
            if not found:
                continue
            return found.group(1).replace(r"\/", "/")

        return None
 | 
						|
 | 
						|
 | 
						|
def _unexpected_html_warning() -> None:
    """Log a warning that an element is ignored because the HTML did not look as expected."""
    message = "Encountered unexpected HTML structure, ignoring element."
    log.warn(message)
 | 
						|
 | 
						|
 | 
						|
# Month abbreviations in calendar order. Both lists must stay aligned index by index, as
# demangle_date pairs them up to turn German abbreviations into English ones.
german_months = [
    "Jan",
    "Feb",
    "Mär",
    "Apr",
    "Mai",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Okt",
    "Nov",
    "Dez",
]
english_months = [
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "May",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Oct",
    "Nov",
    "Dec",
]
 | 
						|
 | 
						|
 | 
						|
def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[datetime]:
    """
    Demangle a given date in one of the following formats (hour/minute part is optional):
    "Gestern, HH:MM"
    "Heute, HH:MM"
    "Morgen, HH:MM"
    "dd. mon yyyy, HH:MM"

    The English names "Yesterday", "Today" and "Tomorrow" are understood as well. The month
    may be a German or an English abbreviation, optionally followed by a dot.

    Returns None if the date can not be parsed. Unless fail_silently is set, a warning is
    logged in that case.
    """
    try:
        # Normalize whitespace because users
        date_str = re.sub(r"\s+", " ", date_str)

        # Swap relative day names for the date they currently stand for
        relative_days = (
            ("Gestern|Yesterday", _yesterday()),
            ("Heute|Today", date.today()),
            ("Morgen|Tomorrow", _tomorrow()),
        )
        for day_names, day in relative_days:
            date_str = re.sub(day_names, _format_date_english(day), date_str, flags=re.I)
        date_str = date_str.strip()

        for german, english in zip(german_months, english_months, strict=True):
            # Translate the month and remove trailing dots for abbreviations,
            # e.g. "20. Apr. 2020" -> "20. Apr 2020"
            date_str = date_str.replace(german, english).replace(f"{english}.", english)

        # We now have a nice english String in the format: "dd. mmm yyyy, hh:mm" or "dd. mmm yyyy"

        # Check if we have a time as well
        if ", " in date_str:
            day_part, time_part = date_str.split(",")
        else:
            day_part, time_part = date_str.split(",")[0], None

        day_str, month_str, year_str = day_part.split(" ")

        day = int(day_str.strip().replace(".", ""))
        month = english_months.index(month_str.strip()) + 1
        year = int(year_str.strip())

        if not time_part:
            return datetime(year, month, day)

        hour_str, minute_str = time_part.split(":")
        return datetime(year, month, day, int(hour_str), int(minute_str))
    except Exception:
        # Parsing is best effort: whatever went wrong, the caller just gets no date
        if not fail_silently:
            log.warn(f"Date parsing failed for {date_str!r}")
        return None
 | 
						|
 | 
						|
 | 
						|
def _format_date_english(date_to_format: date) -> str:
    """Format a date as "dd. mon yyyy", using the abbreviations from english_months."""
    day, year = date_to_format.day, date_to_format.year
    month_name = english_months[date_to_format.month - 1]
    return f"{day:02d}. {month_name} {year:04d}"
 | 
						|
 | 
						|
 | 
						|
def _yesterday() -> date:
 | 
						|
    return date.today() - timedelta(days=1)
 | 
						|
 | 
						|
 | 
						|
def _tomorrow() -> date:
 | 
						|
    return date.today() + timedelta(days=1)
 | 
						|
 | 
						|
 | 
						|
def parse_ilias_forum_export(forum_export: BeautifulSoup) -> list[IliasForumThread]:
    """
    Split an exported forum page into its threads.

    Every "body > p" element is treated as the title of a thread and the "ul" following it
    as the content of that thread. The title text is taken from the "b" tag of the title
    element; everything up to and including its first colon is dropped.

    Threads without content are returned with a newly created empty "ul" and no
    modification time. For all other threads the modification time is guessed from the
    posts of the thread.
    """
    elements = []
    for title_tag in forum_export.select("body > p"):
        content_tag = title_tag.find_next_sibling("ul")

        title = cast(Tag, title_tag.find("b")).text
        _, colon, after_colon = title.partition(":")
        if colon:
            title = after_colon
        title = title.strip()

        # The next "ul" only belongs to this thread if no other title sits in between.
        # This must be an identity check: tags compare equal if their markup is equal, so
        # "!=" would hand an empty thread the posts of a following thread whose title
        # element looks the same.
        if not content_tag or content_tag.find_previous_sibling("p") is not title_tag:
            # ILIAS allows users to delete the initial post while keeping the thread open
            # This produces empty threads without *any* content.
            # I am not sure why you would want this, but ILIAS makes it easy to do.
            elements.append(IliasForumThread(title, title_tag, forum_export.new_tag("ul"), None))
            continue

        mtime = _guess_timestamp_from_forum_post_content(content_tag)
        elements.append(IliasForumThread(title, title_tag, content_tag, mtime))

    return elements
 | 
						|
 | 
						|
 | 
						|
def _guess_timestamp_from_forum_post_content(content: Tag) -> Optional[datetime]:
    """
    Guess the modification time of a forum thread from the headers of its posts.

    For every ".ilFrmPostHeader > span.small" element in the content, the text after the
    last "|" is parsed as a date. Texts that can not be parsed are silently ignored.

    Returns the newest date found, or None if there is no post with a parseable date.
    """
    # select() always returns a list of tags, so there is nothing to cast or to check for None
    post_headers = content.select(".ilFrmPostHeader > span.small")

    parsed_dates = (
        # rpartition yields the whole text if it does not contain a "|"
        demangle_date(header.text.strip().rpartition("|")[2], fail_silently=True)
        for header in post_headers
    )
    post_dates = [post_date for post_date in parsed_dates if post_date is not None]

    return max(post_dates, default=None)
 |