Crawl files in the info tab

2026-02-22 00:22:24 +01:00 · 2023-08-29 11:52:16 +02:00
parent b54b3b979c
commit 87b67e9271
3 changed files with 54 additions and 7 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,6 +36,7 @@ ambiguous situations.
  message. This combines nicely with the `no-delete-prompt-override` strategy,
  causing PFERD to mostly ignore local-only files.
 - support for mediacast video listings
 - crawling of files in info tab
 ## 3.4.3 - 2022-11-29
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -22,6 +22,7 @@ class IliasElementType(Enum):
    FOLDER = "folder"
    FORUM = "forum"
    LINK = "link"
    INFO_TAB = "info_tab"
    LEARNING_MODULE = "learning_module"
    BOOKING = "booking"
    MEETING = "meeting"
@@ -120,9 +121,25 @@ class IliasPage:
        if self._is_content_page():
            log.explain("Page is a content page, searching for elements")
            return self._find_copa_entries()
        if self._is_info_tab():
            log.explain("Page is info tab, searching for elements")
            return self._find_info_tab_entries()
        log.explain("Page is a normal folder, searching for elements")
        return self._find_normal_entries()
    def get_info_tab(self) -> Optional[IliasPageElement]:
        """
        Locate the link to this page's info tab.

        Looks for the first anchor whose href contains "cmdClass=ilinfoscreengui"
        and wraps it in an element of type INFO_TAB named "infos". Returns None if
        the page contains no such anchor.
        """
        def points_to_info_screen(href: Optional[str]) -> bool:
            # Guard against a missing or empty href before the substring test.
            return bool(href) and "cmdClass=ilinfoscreengui" in href

        info_link: Optional[Tag] = self._soup.find(name="a", attrs={"href": points_to_info_screen})
        if info_link is None:
            return None

        return IliasPageElement(
            IliasElementType.INFO_TAB,
            self._abs_url_from_link(info_link),
            "infos",
        )
    def get_description(self) -> Optional[BeautifulSoup]:
        def is_interesting_class(name: str) -> bool:
            return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
@@ -209,7 +226,11 @@ class IliasPage:
            log.explain("Requesting *all* future meetings")
            return self._uncollapse_future_meetings_url()
        if not self._is_content_tab_selected():
            if self._page_type != IliasElementType.INFO_TAB:
                log.explain("Selecting content tab")
                return self._select_content_page_url()
            else:
                log.explain("Crawling info tab, skipping content select")
        return None
    def _is_forum_page(self) -> bool:
@@ -281,6 +302,10 @@ class IliasPage:
    def _is_content_tab_selected(self) -> bool:
        return self._select_content_page_url() is None
    def _is_info_tab(self) -> bool:
        """
        Decide whether this page should be parsed as an info tab.

        Both conditions must hold: self._page_type equals IliasElementType.INFO_TAB
        and the HTML contains a form whose name attribute is "formInfoScreen".
        """
        info_form = self._soup.find(
            "form",
            attrs={"name": lambda value: value == "formInfoScreen"}
        )
        has_info_form = info_form is not None
        opened_as_info_tab = self._page_type == IliasElementType.INFO_TAB
        return opened_as_info_tab and has_info_form
    def _select_content_page_url(self) -> Optional[IliasPageElement]:
        tab = self._soup.find(
            id="tab_view_content",
@@ -389,6 +414,23 @@ class IliasPage:
        return items
    def _find_info_tab_entries(self) -> List[IliasPageElement]:
        """
        Collect the downloadable files linked from an info tab page.

        Only anchors with the CSS class "il_ContainerItemCommand" whose href contains
        both "cmdClass=ilobjcoursegui" and "cmd=sendfile" are treated as files. Each
        match becomes a FILE element named after the sanitized link text.

        Returns the matching elements in document order; the list is empty if the
        page offers no such links.
        """
        items: List[IliasPageElement] = []
        links: List[Tag] = self._soup.select("a.il_ContainerItemCommand")

        for link in links:
            # A command anchor is not guaranteed to carry an href. Indexing with
            # link["href"] would raise a KeyError for those, so skip them instead.
            href = link.get("href")
            if not href:
                continue
            if "cmdClass=ilobjcoursegui" not in href:
                continue
            if "cmd=sendfile" not in href:
                continue

            items.append(IliasPageElement(
                IliasElementType.FILE,
                self._abs_url_from_link(link),
                _sanitize_path_name(link.getText())
            ))

        return items
    def _find_opencast_video_entries(self) -> List[IliasPageElement]:
        # ILIAS has three stages for video pages
        # 1. The initial dummy page without any videos. This page contains the link to the listing
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -85,6 +85,7 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([
    IliasElementType.EXERCISE,
    IliasElementType.EXERCISE_FILES,
    IliasElementType.FOLDER,
    IliasElementType.INFO_TAB,
    IliasElementType.MEETING,
    IliasElementType.MEDIACAST_VIDEO_FOLDER,
    IliasElementType.OPENCAST_VIDEO_FOLDER,
@@ -262,6 +263,8 @@ instance's greatest bottleneck.
                        next_stage_url = None
                elements.extend(page.get_child_elements())
                if info_tab := page.get_info_tab():
                    elements.append(info_tab)
                if description_string := page.get_description():
                    description.append(description_string)
@@ -705,7 +708,7 @@ instance's greatest bottleneck.
                log.explain(f"URL: {next_stage_url}")
                soup = await self._get_page(next_stage_url)
-                page = IliasPage(soup, next_stage_url, None)
+                page = IliasPage(soup, next_stage_url, element)
                if next := page.get_next_stage_element():
                    next_stage_url = next.url
@@ -768,14 +771,14 @@ instance's greatest bottleneck.
            log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
            log.explain(f"URL: {element.url}")
            soup = await self._get_page(element.url)
-            page = IliasPage(soup, element.url, None)
+            page = IliasPage(soup, element.url, element)
            if next := page.get_learning_module_data():
                elements.extend(await self._crawl_learning_module_direction(
-                    cl.path, next.previous_url, "left"
+                    cl.path, next.previous_url, "left", element
                ))
                elements.append(next)
                elements.extend(await self._crawl_learning_module_direction(
-                    cl.path, next.next_url, "right"
+                    cl.path, next.next_url, "right", element
                ))
        # Reflect their natural ordering in the file names
@@ -797,7 +800,8 @@ instance's greatest bottleneck.
        self,
        path: PurePath,
        start_url: Optional[str],
-        dir: Union[Literal["left"], Literal["right"]]
+        dir: Union[Literal["left"], Literal["right"]],
        parent_element: IliasPageElement
    ) -> List[IliasLearningModulePage]:
        elements: List[IliasLearningModulePage] = []
@@ -810,7 +814,7 @@ instance's greatest bottleneck.
            log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
            log.explain(f"URL: {next_element_url}")
            soup = await self._get_page(next_element_url)
-            page = IliasPage(soup, next_element_url, None)
+            page = IliasPage(soup, next_element_url, parent_element)
            if next := page.get_learning_module_data():
                elements.append(next)
                if dir == "left":