Handle exercise detail containers in ILIAS html parser

2026-01-09 05:52:30 +01:00 · 2021-05-24 16:22:51 +02:00
parent d44f6966c2
commit 342076ee0e
2 changed files with 62 additions and 4 deletions
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -16,6 +16,7 @@ TargetType = Union[str, int]
 class IliasElementType(Enum):
    EXERCISE = "exercise"
    EXERCISE_FILES = "exercise_files"  # own submitted files
    FILE = "file"
    FOLDER = "folder"
    FORUM = "forum"
@@ -197,6 +198,43 @@ class IliasPage:
        return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time)
    def _find_exercise_entries(self) -> List[IliasPageElement]:
        if self._soup.find(id="tab_submission"):
            log.explain("Found submission tab. This is an exercise detail page")
            return self._find_exercise_entries_detail_page()
        log.explain("Found no submission tab. This is an exercise root page")
        return self._find_exercise_entries_root_page()
    def _find_exercise_entries_detail_page(self) -> List[IliasPageElement]:
        results: List[IliasPageElement] = []
        # Find all download links in the container (this will contain all the files)
        download_links: List[Tag] = self._soup.findAll(
            name="a",
            # download links contain the given command class
            attrs={"href": lambda x: x and "cmd=download" in x},
            text="Download"
        )
        for link in download_links:
            parent_row: Tag = link.findParent("tr")
            children: List[Tag] = parent_row.findChildren("td")
            # <checkbox> <name> <uploader> <date> <download>
            #     0         1        2       3        4
            name = _sanitize_path_name(children[1].getText().strip())
            date = demangle_date(children[3].getText().strip())
            log.explain(f"Found exercise detail entry {name!r}")
            results.append(IliasPageElement(
                IliasElementType.FILE,
                self._abs_url_from_link(link),
                name,
                date
            ))
        return results
    def _find_exercise_entries_root_page(self) -> List[IliasPageElement]:
        results: List[IliasPageElement] = []
        # Each assignment is in an accordion container
@@ -205,6 +243,8 @@ class IliasPage:
        for container in assignment_containers:
            # Fetch the container name out of the header to use it in the path
            container_name = container.select_one(".ilAssignmentHeader").getText().strip()
            log.explain(f"Found exercise container {container_name!r}")
            # Find all download links in the container (this will contain all the files)
            files: List[Tag] = container.findAll(
                name="a",
@@ -213,8 +253,6 @@ class IliasPage:
                text="Download"
            )
            log.explain(f"Found exercise container {container_name!r}")
            # Grab each file as you now have the link
            for file_link in files:
                # Two divs, side by side. Left is the name, right is the link ==> get left
@@ -231,6 +269,25 @@ class IliasPage:
                    None  # We do not have any timestamp
                ))
            # Find all links to file listings (e.g. "Submitted Files" for groups)
            file_listings: List[Tag] = container.findAll(
                name="a",
                # download links contain the given command class
                attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
            )
            # Add each listing as a new
            for listing in file_listings:
                file_name = _sanitize_path_name(listing.getText().strip())
                url = self._abs_url_from_link(listing)
                log.explain(f"Found exercise detail {file_name!r} at {url}")
                results.append(IliasPageElement(
                    IliasElementType.EXERCISE_FILES,
                    url,
                    container_name + "/" + file_name,
                    None  # we do not have any timestamp
                ))
        return results
    def _find_normal_entries(self) -> List[IliasPageElement]:
@@ -349,7 +406,7 @@ class IliasPage:
        if found_parent is None:
            _unexpected_html_warning()
-            log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url!r}")
+            log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url}")
            return None
        # Find the small descriptive icon to figure out the type
@@ -357,7 +414,7 @@ class IliasPage:
        if img_tag is None:
            _unexpected_html_warning()
-            log.warn_contd(f"Tried to figure out element type, but did not find an image for {url!r}")
+            log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
            return None
        if "opencast" in str(img_tag["alt"]).lower():
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -61,6 +61,7 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
 _DIRECTORY_PAGES: Set[IliasElementType] = set([
    IliasElementType.EXERCISE,
    IliasElementType.EXERCISE_FILES,
    IliasElementType.FOLDER,
    IliasElementType.MEETING,
    IliasElementType.VIDEO_FOLDER,