Add support for exercises in ILIAS crawler

2026-01-09 05:52:30 +01:00 · 2021-05-15 21:40:17 +02:00
parent 7d323ec62b
commit c454fabc9d
1 changed files with 49 additions and 0 deletions
--- a/PFERD/crawlers/ilias.py
+++ b/PFERD/crawlers/ilias.py
@@ -93,6 +93,8 @@ class IliasPage:
            return self._player_to_video()
        if self._is_video_listing():
            return self._find_video_entries()
        if self._is_exercise_file():
            return self._find_exercise_entries()
        return self._find_normal_entries()
    def _is_video_player(self) -> bool:
@@ -111,6 +113,19 @@ class IliasPage:
        )
        return video_element_table is not None
    def _is_exercise_file(self) -> bool:
        """
        Return whether this page should be treated as an exercise page.

        The page counts as an exercise if its already known page type is
        EXERCISE or, failing that, if the "src" attribute of the element with
        id "headerimage" contains "exc" (compared case-insensitively).
        """
        # we know it from before
        if self._page_type == IliasElementType.EXERCISE:
            return True

        # We have no suitable parent - let's guess
        element = self._soup.find(id="headerimage")
        if element is None:
            return False

        # Use .get() with a default: a header element without a "src" attribute
        # must not blow up the detection with a KeyError
        return "exc" in element.get("src", "").lower()
    def _player_to_video(self) -> List[IliasPageElement]:
        # Fetch the actual video page. This is a small wrapper page initializing a JavaScript
        # player. Sadly we can not execute that JS. The actual video stream url is nowhere
@@ -223,6 +238,40 @@ class IliasPage:
        return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time)
    def _find_exercise_entries(self) -> List[IliasPageElement]:
        """
        Collect the downloadable files of every assignment on an exercise page.

        Every result is a FILE element named "<assignment name>/<file name>"
        and carries no modification timestamp.

        Assignment containers without a header and download links without a
        preceding name div are skipped, as no path can be built for them.
        """
        results: List[IliasPageElement] = []

        # Each assignment is in an accordion container
        assignment_containers: List[Tag] = self._soup.select(".il_VAccordionInnerContainer")

        for container in assignment_containers:
            # Fetch the container name out of the header to use it in the path
            header = container.select_one(".ilAssignmentHeader")
            if header is None:
                # Without a header there is no directory name for these files
                continue
            # Sanitize the same way as the file name below, so the "/" added when
            # joining is presumably the only separator in the resulting path
            container_name = _sanitize_path_name(header.get_text().strip())

            # Find all download links in the container (this will contain all the files)
            files: List[Tag] = container.find_all(
                name="a",
                # download links contain the given command class
                attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x},
                text="Download"
            )

            # Grab each file as you now have the link
            for file_link in files:
                # Two divs, side by side. Left is the name, right is the link ==> get left
                # sibling
                name_div = file_link.parent.find_previous(name="div")
                if name_div is None:
                    # No name div precedes this link, so we can not name the file
                    continue
                file_name = _sanitize_path_name(name_div.get_text().strip())
                url = self._abs_url_from_link(file_link)

                results.append(IliasPageElement(
                    IliasElementType.FILE,
                    url,
                    container_name + "/" + file_name,
                    None  # We do not have any timestamp
                ))

        return results
    def _find_normal_entries(self) -> List[IliasPageElement]:
        result: List[IliasPageElement] = []