mirror of
https://github.com/Garmelon/PFERD.git
synced 2025-07-12 22:22:30 +02:00
Fix download of links without a target URL
They are now downloaded as links to the empty URL.
This commit is contained in:
@ -25,6 +25,7 @@ ambiguous situations.
|
|||||||
### Fixed
|
### Fixed
|
||||||
- Personal desktop/dashboard/favorites crawling
|
- Personal desktop/dashboard/favorites crawling
|
||||||
- Crawling of nested courses
|
- Crawling of nested courses
|
||||||
|
- Downloading of links with no target URL
|
||||||
|
|
||||||
## 3.6.0 - 2024-10-23
|
## 3.6.0 - 2024-10-23
|
||||||
|
|
||||||
|
@ -491,17 +491,26 @@ instance's greatest bottleneck.
|
|||||||
self._write_link_content(link_template, element.url, element.name, element.description, sink)
|
self._write_link_content(link_template, element.url, element.name, element.description, sink)
|
||||||
|
|
||||||
async def _resolve_link_target(self, export_url: str) -> str:
|
async def _resolve_link_target(self, export_url: str) -> str:
|
||||||
|
async def impl() -> Optional[str]:
|
||||||
async with self.session.get(export_url, allow_redirects=False) as resp:
|
async with self.session.get(export_url, allow_redirects=False) as resp:
|
||||||
# No redirect means we were authenticated
|
# No redirect means we were authenticated
|
||||||
if hdrs.LOCATION not in resp.headers:
|
if hdrs.LOCATION not in resp.headers:
|
||||||
return soupify(await resp.read()).select_one("a").get("href").strip()
|
return soupify(await resp.read()).select_one("a").get("href").strip()
|
||||||
|
# We are either unauthenticated or the link is not active
|
||||||
|
new_url = resp.headers[hdrs.LOCATION].lower()
|
||||||
|
if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url:
|
||||||
|
return ""
|
||||||
|
return None
|
||||||
|
|
||||||
|
target = await impl()
|
||||||
|
if target is not None:
|
||||||
|
return target
|
||||||
|
|
||||||
await self._authenticate()
|
await self._authenticate()
|
||||||
|
|
||||||
async with self.session.get(export_url, allow_redirects=False) as resp:
|
target = await impl()
|
||||||
# No redirect means we were authenticated
|
if target is not None:
|
||||||
if hdrs.LOCATION not in resp.headers:
|
return target
|
||||||
return soupify(await resp.read()).select_one("a").get("href").strip()
|
|
||||||
|
|
||||||
raise CrawlError("resolve_link_target failed even after authenticating")
|
raise CrawlError("resolve_link_target failed even after authenticating")
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user