def _is_content_page(self) -> bool:
    """
    Report whether the current page is an ILIAS content page ("copa").

    The decision is based on the element with the id "current_perma_link": the page counts
    as a content page when that element's "value" attribute contains "target=copa_".

    Returns False when the element is missing or carries no usable "value" attribute.
    """
    link = self._soup.find(id="current_perma_link")
    if not link:
        return False

    # get() yields None when the attribute is absent. A substring test against None would
    # raise a TypeError, so anything that is not a string means "not a content page".
    value = link.get("value")
    return isinstance(value, str) and "target=copa_" in value
def _find_copa_entries(self) -> List[IliasPageElement]:
    """
    Collect the files linked from a content page.

    Every tag carrying the "ilc_flist_a_FileListItemLink" class is inspected. A link whose
    absolute url contains "file_id" is returned as a FILE element; any other link triggers
    the unexpected-HTML warning and is left out of the result.
    """
    found: List[IliasPageElement] = []

    for anchor in self._soup.findAll(class_="ilc_flist_a_FileListItemLink"):
        url = self._abs_url_from_link(anchor)
        # Strip surrounding whitespace and embedded tabs from the link text before sanitizing
        raw_title = anchor.getText().strip().replace("\t", "")
        name = _sanitize_path_name(raw_title)

        if "file_id" in url:
            found.append(IliasPageElement(IliasElementType.FILE, url, name))
        else:
            _unexpected_html_warning()
            log.warn_contd(f"Found unknown content page item {name!r} with url {url!r}")

    return found