Fix download of links without a target URL

They are now downloaded as links to the empty URL.
This commit is contained in:
I-Al-Istannen
2024-10-28 19:15:40 +01:00
parent 8fbd1978af
commit c1046498e7
2 changed files with 18 additions and 8 deletions

View File

@ -25,6 +25,7 @@ ambiguous situations.
### Fixed
- Personal desktop/dashboard/favorites crawling
- Crawling of nested courses
- Downloading of links with no target URL
## 3.6.0 - 2024-10-23

View File

@ -491,17 +491,26 @@ instance's greatest bottleneck.
self._write_link_content(link_template, element.url, element.name, element.description, sink)
async def _resolve_link_target(self, export_url: str) -> str:
    """Resolve the real target URL behind an ILIAS link export page.

    First tries to resolve without authenticating; if that fails,
    authenticates once and retries.

    Args:
        export_url: URL of the ILIAS link export page to resolve.

    Returns:
        The target href, or "" when the link has no target URL
        (ILIAS redirects to its info screen for such links).

    Raises:
        CrawlError: if the target could not be resolved even after
            authenticating.
    """

    async def impl() -> Optional[str]:
        async with self.session.get(export_url, allow_redirects=False) as resp:
            # No redirect means we were authenticated
            if hdrs.LOCATION not in resp.headers:
                return soupify(await resp.read()).select_one("a").get("href").strip()
            # We are either unauthenticated or the link is not active
            new_url = resp.headers[hdrs.LOCATION].lower()
            if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
                # Link without a target URL: report an empty target instead of failing
                return ""
            return None

    target = await impl()
    if target is not None:
        return target

    await self._authenticate()

    target = await impl()
    if target is not None:
        return target

    raise CrawlError("resolve_link_target failed even after authenticating")