Fix personal desktop crawling

2025-07-20 01:42:37 +02:00 · 2022-01-08 16:58:15 +01:00
parent 462d993fbc
commit 6f3cfd4396
2 changed files with 66 additions and 4 deletions
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -39,7 +39,12 @@ class IliasPageElement:
    description: Optional[str] = None

    def id(self) -> str:
-        regexes = [r"eid=(?P<id>[0-9a-z\-]+)", r"file_(?P<id>\d+)", r"ref_id=(?P<id>\d+)"]
+        regexes = [
+            r"eid=(?P<id>[0-9a-z\-]+)",
+            r"file_(?P<id>\d+)",
+            r"ref_id=(?P<id>\d+)",
+            r"target=[a-z]+_(?P<id>\d+)"
+        ]

        for regex in regexes:
            if match := re.search(regex, self.url):
@@ -71,6 +76,9 @@ class IliasPage:
        if self._is_exercise_file():
            log.explain("Page is an exercise, searching for elements")
            return self._find_exercise_entries()
+        if self._is_personal_desktop():
+            log.explain("Page is the personal desktop")
+            return self._find_personal_desktop_entries()
        log.explain("Page is a normal folder, searching for elements")
        return self._find_normal_entries()

@@ -115,6 +123,9 @@ class IliasPage:

        return False

+    def _is_personal_desktop(self) -> bool:  # find() yields a Tag or None, so compare to None
+        return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) is not None
+
    def _player_to_video(self) -> List[IliasPageElement]:
        # Fetch the actual video page. This is a small wrapper page initializing a javscript
        # player. Sadly we can not execute that JS. The actual video stream url is nowhere
@@ -149,6 +160,26 @@ class IliasPage:

        return items

+    def _find_personal_desktop_entries(self) -> List[IliasPageElement]:
+        """Collect every linked ".il-item-title" entry; linkless or untyped entries are skipped."""
+        items: List[IliasPageElement] = []
+        for title in self._soup.select(".il-item-title"):
+            link = title.find("a")
+            if link is None:
+                log.explain(f"Skipping item without a link: {title.text.strip()!r}")
+                continue
+            name = _sanitize_path_name(link.text.strip())
+            url = self._abs_url_from_link(link)
+            element_type = self._find_type_from_link(name, link, url)
+            if not element_type:
+                _unexpected_html_warning()
+                log.warn_contd(f"Could not extract type for {link}")
+                continue
+            log.explain(f"Found {name!r}")
+            items.append(IliasPageElement(element_type, url, name))
+
+        return items
+
    def _find_video_entries(self) -> List[IliasPageElement]:
        # ILIAS has three stages for video pages
        # 1. The initial dummy page without any videos. This page contains the link to the listing
@@ -551,9 +582,30 @@ class IliasPage:
        if "target=file_" in parsed_url.query:
            return IliasElementType.FILE

+        if "target=grp_" in parsed_url.query:
+            return IliasElementType.FOLDER
+
+        if "target=crs_" in parsed_url.query:
+            return IliasElementType.FOLDER
+
+        if "baseClass=ilExerciseHandlerGUI" in parsed_url.query:
+            return IliasElementType.EXERCISE
+
+        if "baseClass=ilLinkResourceHandlerGUI" in parsed_url.query and "calldirectlink" in parsed_url.query:
+            return IliasElementType.LINK
+
+        if "cmd=showThreads" in parsed_url.query or "target=frm_" in parsed_url.query:
+            return IliasElementType.FORUM
+
+        if "cmdClass=ilobjtestgui" in parsed_url.query:
+            return IliasElementType.TEST
+
+        # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
+        # try to guess it from the image.
+
        # Everything with a ref_id can *probably* be opened to reveal nested things
        # video groups, directories, exercises, etc
-        if "ref_id=" in parsed_url.query:
+        if "ref_id=" in parsed_url.query or "goto.php" in parsed_url.path:
            return IliasPage._find_type_from_folder_like(link_element, url)

        _unexpected_html_warning()
@@ -574,7 +626,7 @@ class IliasPage:
        # We look for the outer div of our inner link, to find information around it
        # (mostly the icon)
        for parent in link_element.parents:
-            if "ilContainerListItemOuter" in parent["class"]:
+            if "ilContainerListItemOuter" in parent["class"] or "il-std-item" in parent["class"]:
                found_parent = parent
                break

@@ -586,6 +638,9 @@ class IliasPage:
        # Find the small descriptive icon to figure out the type
        img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon")

+        if img_tag is None:
+            img_tag = found_parent.select_one("img.icon")
+
        if img_tag is None:
            _unexpected_html_warning()
            log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -203,7 +203,9 @@ instance's greatest bottleneck.
        await self._crawl_url(root_url, expected_id=course_id)

    async def _crawl_desktop(self) -> None:
-        await self._crawl_url(self._base_url)
+        appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items"
+        appendix = appendix.encode("ASCII").hex()
+        await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix)

    async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
        maybe_cl = await self.crawl(PurePath("."))
@@ -622,6 +624,11 @@ instance's greatest bottleneck.
        if mainbar is not None:
            login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x})
            return not login_button
+
+        # Personal Desktop
+        if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
+            return True
+
        # Video listing embeds do not have complete ILIAS html. Try to match them by
        # their video listing table
        video_table = soup.find(