Fix exercise crawling

2026-01-31 06:32:24 +01:00 · 2025-04-25 13:45:57 +02:00
parent b305e1ce23
commit 4c230ef6dd
3 changed files with 95 additions and 65 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,7 @@ ambiguous situations.
 ## Fixed
 - Ilias-native login
 - Exercise crawling
 ## 3.8.1 - 2025-04-17
--- a/PFERD/crawl/ilias/ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/ilias_web_crawler.py
@@ -107,6 +107,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
 _DIRECTORY_PAGES: Set[IliasElementType] = {
    IliasElementType.EXERCISE,
    IliasElementType.EXERCISE_FILES,
    IliasElementType.EXERCISE_OVERVIEW,
    IliasElementType.FOLDER,
    IliasElementType.INFO_TAB,
    IliasElementType.MEDIACAST_VIDEO_FOLDER,
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -97,7 +97,8 @@ class IliasElementType(Enum):
    BOOKING = "booking"
    COURSE = "course"
    DCL_RECORD_LIST = "dcl_record_list"
-    EXERCISE = "exercise"
+    EXERCISE_OVERVIEW = "exercise_overview"
    EXERCISE = "exercise"  # own submitted files
    EXERCISE_FILES = "exercise_files"  # own submitted files
    FILE = "file"
    FOLDER = "folder"
@@ -141,13 +142,15 @@ class IliasElementType(Enum):
                    TypeMatcher.query("cmdclass=ildclrecordlistgui")
                )
            case IliasElementType.EXERCISE:
                return TypeMatcher.never()
            case IliasElementType.EXERCISE_FILES:
                return TypeMatcher.never()
            case IliasElementType.EXERCISE_OVERVIEW:
                return TypeMatcher.any(
                    TypeMatcher.path("/exc/"),
                    TypeMatcher.path("_exc_"),
                    TypeMatcher.img_src("_exc.svg"),
                )
            case IliasElementType.EXERCISE_FILES:
                return TypeMatcher.never()
            case IliasElementType.FILE:
                return TypeMatcher.any(
                    TypeMatcher.query("cmd=sendfile"),
@@ -530,6 +533,8 @@ class IliasPage:
        if self._contains_collapsed_future_meetings():
            log.explain("Requesting *all* future meetings")
            return self._uncollapse_future_meetings_url()
        if self._is_exercise_not_all_shown():
            return self._show_all_exercises()
        if not self._is_content_tab_selected():
            if self._page_type != IliasElementType.INFO_TAB:
                log.explain("Selecting content tab")
@@ -561,7 +566,7 @@ class IliasPage:
    def _is_exercise_file(self) -> bool:
        # we know it from before
-        if self._page_type == IliasElementType.EXERCISE:
+        if self._page_type == IliasElementType.EXERCISE_OVERVIEW:
            return True
        # We have no suitable parent - let's guess
@@ -598,6 +603,17 @@ class IliasPage:
        link = self._abs_url_from_link(element)
        return IliasPageElement.create_new(IliasElementType.FOLDER, link, "show all meetings")
    def _is_exercise_not_all_shown(self) -> bool:
        """
        Tell whether this exercise overview page was requested without "mode=all".

        Only pages typed as EXERCISE_OVERVIEW can qualify; for those, the page URL is
        searched case-insensitively for the substring "mode=all".
        """
        if self._page_type != IliasElementType.EXERCISE_OVERVIEW:
            return False
        lowered_url = self._page_url.lower()
        return "mode=all" not in lowered_url
    def _show_all_exercises(self) -> Optional[IliasPageElement]:
        """
        Build the follow-up element that re-requests this exercise overview with "mode=all".

        The parameter is merged into the URL's query component, so the result stays
        well-formed when the current page URL has no query string yet or ends in a
        "#fragment". For URLs that already carry a query and no fragment this yields
        the same URL as plain "&mode=all" concatenation.
        """
        # Local import keeps this method self-contained regardless of the module's imports.
        from urllib.parse import urlsplit, urlunsplit

        parts = urlsplit(self._page_url)
        # Only use "&" when there is an existing query to append to.
        query = f"{parts.query}&mode=all" if parts.query else "mode=all"
        return IliasPageElement.create_new(
            IliasElementType.EXERCISE_OVERVIEW,
            urlunsplit(parts._replace(query=query)),
            "show all exercises"
        )
    def _is_content_tab_selected(self) -> bool:
        return self._select_content_page_url() is None
@@ -863,15 +879,62 @@ class IliasPage:
    def _find_exercise_entries(self) -> list[IliasPageElement]:
        """
        Collect the entries of an exercise page, dispatching on the kind of page.

        A page with a "tab_submission" element is either a details page (the
        submission tab is not marked active) or a files page (it is active). A page
        without that element is treated as the exercise root page.
        """
        if self._soup.find(id="tab_submission"):
            log.explain("Found submission tab. This is an exercise detail or files page")
            # The active tab tells us which of the two tabbed views we are looking at.
            if self._soup.select_one("#tab_submission.active") is None:
                log.explain("  This is a details page")
                return self._find_exercise_entries_detail_page()
            log.explain("  This is a files page")
            return self._find_exercise_entries_files_page()

        log.explain("Found no submission tab. This is an exercise root page")
        return self._find_exercise_entries_root_page()
    def _find_exercise_entries_detail_page(self) -> list[IliasPageElement]:
        """
        Collect the entries of a single exercise's details page.

        The result holds an EXERCISE_FILES element named "Submission" for the link
        inside the submission tab (when such a link exists), followed by one FILE
        element per "Download" link whose href contains "cmd=download". Download links
        whose surrounding row or name cannot be located are skipped with a warning.
        """
        results: list[IliasPageElement] = []

        if link := cast(Optional[Tag], self._soup.select_one("#tab_submission > a")):
            results.append(IliasPageElement.create_new(
                IliasElementType.EXERCISE_FILES,
                self._abs_url_from_link(link),
                "Submission"
            ))
        else:
            log.explain("Found no submission link for exercise, maybe it has not started yet?")

        # Find all download links in the container (this will contain all the *feedback* files)
        download_links = cast(list[Tag], self._soup.find_all(
            name="a",
            # download links carry the download command in their href
            attrs={"href": lambda x: x is not None and "cmd=download" in x},
            text="Download"
        ))

        for download_link in download_links:
            parent_row = cast(Optional[Tag], download_link.find_parent(
                attrs={"class": lambda x: x is not None and "row" in x}))
            # find_parent yields None when no ancestor matches; skip instead of crashing.
            if parent_row is None:
                log.warn("Could not find parent row for exercise entry")
                _unexpected_html_warning()
                continue

            # The first div inside the row is taken as the entry's display name.
            name_tag = cast(Optional[Tag], parent_row.find(name="div"))
            if not name_tag:
                log.warn("Could not find name tag for exercise entry")
                _unexpected_html_warning()
                continue

            name = _sanitize_path_name(name_tag.get_text().strip())
            log.explain(f"Found exercise detail entry {name!r}")
            results.append(IliasPageElement.create_new(
                IliasElementType.FILE,
                self._abs_url_from_link(download_link),
                name
            ))

        return results
    def _find_exercise_entries_files_page(self) -> list[IliasPageElement]:
        results: list[IliasPageElement] = []
        # Find all download links in the container
        download_links = cast(list[Tag], self._soup.find_all(
            name="a",
            # download links contain the given command class
@@ -884,7 +947,7 @@ class IliasPage:
            children = cast(list[Tag], parent_row.find_all("td"))
            name = _sanitize_path_name(children[1].get_text().strip())
-            log.explain(f"Found exercise detail entry {name!r}")
+            log.explain(f"Found exercise file entry {name!r}")
            date = None
            for child in reversed(children):
@@ -892,7 +955,7 @@ class IliasPage:
                if date is not None:
                    break
            if date is None:
-                log.warn(f"Date parsing failed for exercise entry {name!r}")
+                log.warn(f"Date parsing failed for exercise file entry {name!r}")
            results.append(IliasPageElement.create_new(
                IliasElementType.FILE,
@@ -906,66 +969,31 @@ class IliasPage:
    def _find_exercise_entries_root_page(self) -> list[IliasPageElement]:
        """
        Collect one EXERCISE element per assignment linked from the exercise root page.

        Assignment links are the anchors inside the "ilContentContainer" element whose
        href contains both "ass_id=" and "cmdClass=ilAssignmentPresentationGUI". Each
        element is named after the sanitized link text. Returns an empty list (after
        warning) when the content container is missing.
        """
        results: list[IliasPageElement] = []

        content_tab = cast(Optional[Tag], self._soup.find(id="ilContentContainer"))
        if not content_tab:
            log.warn("Could not find content tab in exercise overview page")
            _unexpected_html_warning()
            return []

        individual_exercises = content_tab.find_all(
            name="a",
            attrs={
                "href": lambda x: x is not None
                and "ass_id=" in x
                and "cmdClass=ilAssignmentPresentationGUI" in x
            }
        )

        for exercise in cast(list[Tag], individual_exercises):
            name = _sanitize_path_name(exercise.get_text().strip())
            results.append(IliasPageElement.create_new(
                IliasElementType.EXERCISE,
                self._abs_url_from_link(exercise),
                name
            ))

        # Log the names as stored on the created elements.
        for result in results:
            log.explain(f"Found exercise {result.name!r}")

        return results