From c54c3bcfa157631af1d55a210b60ad3bfc64f972 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Sun, 27 Oct 2024 10:50:59 +0100
Subject: [PATCH] Fix crawling of favorites

---
 CHANGELOG.md                           | 3 +++
 PFERD/crawl/ilias/ilias_web_crawler.py | 9 +++------
 PFERD/crawl/ilias/kit_ilias_html.py    | 9 +++++++--
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 573cad9..ce20269 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,9 @@ ambiguous situations.
 
 ## Unreleased
 
+### Fixed
+- Personal desktop/dashboard/favorites crawling
+
 ## 3.6.0 - 2024-10-23
 
 ### Added
diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py
index b77f4fc..a566ce5 100644
--- a/PFERD/crawl/ilias/ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/ilias_web_crawler.py
@@ -185,12 +185,9 @@ instance's greatest bottleneck.
         await self._crawl_url(root_url, expected_id=course_id)
 
     async def _crawl_desktop(self) -> None:
-        appendix = r"ILIAS\Repository\Provider\RepositoryMainBarProvider|mm_pd_sel_items"
-        appendix = appendix.encode("ASCII").hex()
-        await self._crawl_url(url_set_query_param(
-            urljoin(self._base_url, "/gs_content.php"),
-            "item=", appendix,
-        ))
+        await self._crawl_url(
+            urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show")
+        )
 
     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
         maybe_cl = await self.crawl(PurePath("."))
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index 34e02ba..98b32c3 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -322,7 +322,7 @@ class IliasPage:
         return False
 
     def _is_personal_desktop(self) -> bool:
-        return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
+        return "baseclass=ildashboardgui" in self._page_url.lower() and "&cmd=show" in self._page_url.lower()
 
     def _is_content_page(self) -> bool:
         if link := self.get_permalink():
@@ -427,9 +427,14 @@ class IliasPage:
     def _find_personal_desktop_entries(self) -> List[IliasPageElement]:
         items: List[IliasPageElement] = []
 
-        titles: List[Tag] = self._soup.select(".il-item-title")
+        titles: List[Tag] = self._soup.select("#block_pditems_0 .il-item-title")
         for title in titles:
             link = title.find("a")
+
+            if not link:
+                log.explain(f"Skipping offline item: {title.getText().strip()!r}")
+                continue
+
             name = _sanitize_path_name(link.text.strip())
             url = self._abs_url_from_link(link)
 