From 87b67e9271bd843397542aef75d75557762f641b Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 29 Aug 2023 11:52:16 +0200 Subject: [PATCH] Crawl files in the info tab --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 44 +++++++++++++++++++++- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 16 +++++--- 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e93f01..3c675f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ ambiguous situations. message. This combines nicely with the `no-delete-prompt-override` strategy, causing PFERD to mostly ignore local-only files. - support for mediacast video listings +- crawling of files in info tab ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index c0807d3..a8fcecb 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -22,6 +22,7 @@ class IliasElementType(Enum): FOLDER = "folder" FORUM = "forum" LINK = "link" + INFO_TAB = "info_tab" LEARNING_MODULE = "learning_module" BOOKING = "booking" MEETING = "meeting" @@ -120,9 +121,25 @@ class IliasPage: if self._is_content_page(): log.explain("Page is a content page, searching for elements") return self._find_copa_entries() + if self._is_info_tab(): + log.explain("Page is info tab, searching for elements") + return self._find_info_tab_entries() log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() + def get_info_tab(self) -> Optional[IliasPageElement]: + tab: Optional[Tag] = self._soup.find( + name="a", + attrs={"href": lambda x: x and "cmdClass=ilinfoscreengui" in x} + ) + if tab is not None: + return IliasPageElement( + IliasElementType.INFO_TAB, + self._abs_url_from_link(tab), + "infos" + ) + return None + def get_description(self) -> Optional[BeautifulSoup]: def is_interesting_class(name: str) -> bool: return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] @@ -209,7 +226,11 @@ class IliasPage: log.explain("Requesting *all* future meetings") return self._uncollapse_future_meetings_url() if not self._is_content_tab_selected(): - return self._select_content_page_url() + if self._page_type != IliasElementType.INFO_TAB: + log.explain("Selecting content tab") + return self._select_content_page_url() + else: + log.explain("Crawling info tab, skipping content select") return None def _is_forum_page(self) -> bool: @@ -281,6 +302,10 @@ class IliasPage: def _is_content_tab_selected(self) -> bool: return self._select_content_page_url() is None + def _is_info_tab(self) -> bool: + might_be_info = self._soup.find("form", attrs={"name": lambda x: x == "formInfoScreen"}) is not None + return self._page_type == IliasElementType.INFO_TAB and might_be_info + def _select_content_page_url(self) -> Optional[IliasPageElement]: tab = self._soup.find( id="tab_view_content", @@ -389,6 +414,23 @@ class IliasPage: return items + def _find_info_tab_entries(self) -> List[IliasPageElement]: + items = [] + links: List[Tag] = self._soup.select("a.il_ContainerItemCommand") + + for link in links: + if "cmdClass=ilobjcoursegui" not in link["href"]: + continue + if "cmd=sendfile" not in link["href"]: + continue + items.append(IliasPageElement( + IliasElementType.FILE, + self._abs_url_from_link(link), + _sanitize_path_name(link.getText()) + )) + + return items + def _find_opencast_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages # 1. The initial dummy page without any videos. This page contains the link to the listing diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index eef3373..4f6cc74 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -85,6 +85,7 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.EXERCISE, IliasElementType.EXERCISE_FILES, IliasElementType.FOLDER, + IliasElementType.INFO_TAB, IliasElementType.MEETING, IliasElementType.MEDIACAST_VIDEO_FOLDER, IliasElementType.OPENCAST_VIDEO_FOLDER, @@ -262,6 +263,8 @@ instance's greatest bottleneck. next_stage_url = None elements.extend(page.get_child_elements()) + if info_tab := page.get_info_tab(): + elements.append(info_tab) if description_string := page.get_description(): description.append(description_string) @@ -705,7 +708,7 @@ instance's greatest bottleneck. log.explain(f"URL: {next_stage_url}") soup = await self._get_page(next_stage_url) - page = IliasPage(soup, next_stage_url, None) + page = IliasPage(soup, next_stage_url, element) if next := page.get_next_stage_element(): next_stage_url = next.url @@ -768,14 +771,14 @@ instance's greatest bottleneck. log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}") log.explain(f"URL: {element.url}") soup = await self._get_page(element.url) - page = IliasPage(soup, element.url, None) + page = IliasPage(soup, element.url, element) if next := page.get_learning_module_data(): elements.extend(await self._crawl_learning_module_direction( - cl.path, next.previous_url, "left" + cl.path, next.previous_url, "left", element )) elements.append(next) elements.extend(await self._crawl_learning_module_direction( - cl.path, next.next_url, "right" + cl.path, next.next_url, "right", element )) # Reflect their natural ordering in the file names @@ -797,7 +800,8 @@ instance's greatest bottleneck. self, path: PurePath, start_url: Optional[str], - dir: Union[Literal["left"], Literal["right"]] + dir: Union[Literal["left"], Literal["right"]], + parent_element: IliasPageElement ) -> List[IliasLearningModulePage]: elements: List[IliasLearningModulePage] = [] @@ -810,7 +814,7 @@ instance's greatest bottleneck. log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})") log.explain(f"URL: {next_element_url}") soup = await self._get_page(next_element_url) - page = IliasPage(soup, next_element_url, None) + page = IliasPage(soup, next_element_url, parent_element) if next := page.get_learning_module_data(): elements.append(next) if dir == "left":