Fix download of links without a target URL

They are now downloaded as links to the empty URL.
This commit is contained in:
I-Al-Istannen
2024-10-28 19:15:40 +01:00
parent 8fbd1978af
commit c1046498e7
2 changed files with 18 additions and 8 deletions

View File

@ -25,6 +25,7 @@ ambiguous situations.
### Fixed
- Personal desktop/dashboard/favorites crawling
- Crawling of nested courses
- Downloading of links with no target URL
## 3.6.0 - 2024-10-23

View File

@ -491,17 +491,26 @@ instance's greatest bottleneck.
self._write_link_content(link_template, element.url, element.name, element.description, sink)
async def _resolve_link_target(self, export_url: str) -> str:
    """Resolve the real target URL behind an ILIAS link export page.

    First tries to resolve without authenticating; if that fails,
    authenticates once and retries.

    Args:
        export_url: URL of the ILIAS link export page to resolve.

    Returns:
        The target href, or "" when the link has no target URL
        (ILIAS redirects to its info screen for such links).

    Raises:
        CrawlError: if the target could not be resolved even after
            authenticating.
    """

    async def impl() -> Optional[str]:
        async with self.session.get(export_url, allow_redirects=False) as resp:
            # No redirect means we were authenticated
            if hdrs.LOCATION not in resp.headers:
                return soupify(await resp.read()).select_one("a").get("href").strip()
            # We are either unauthenticated or the link is not active
            new_url = resp.headers[hdrs.LOCATION].lower()
            if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
                # Link without a target URL: report an empty target instead of failing
                return ""
            return None

    target = await impl()
    if target is not None:
        return target

    await self._authenticate()

    target = await impl()
    if target is not None:
        return target

    raise CrawlError("resolve_link_target failed even after authenticating")