Compare commits

..

4 Commits

Author SHA1 Message Date
86947e4874 Bump version to 3.3.1 2022-01-15 15:11:22 +01:00
4f022e2d19 Reword changelog 2022-01-15 15:06:02 +01:00
f47e7374d2 Use fixed windows path for video cache 2022-01-15 12:00:30 +01:00
57ec51e95a Fix login after shib url parser change 2022-01-14 20:17:27 +01:00
4 changed files with 55 additions and 7 deletions

View File

@ -22,6 +22,12 @@ ambiguous situations.
## Unreleased
## 3.3.1 - 2022-01-15
### Fixed
- ILIAS login
- Local video cache if `windows_paths` is enabled
## 3.3.0 - 2022-01-09
### Added

View File

@ -4,6 +4,7 @@ from pathlib import PurePath
from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union, cast
import aiohttp
import yarl
from aiohttp import hdrs
from bs4 import BeautifulSoup, Tag
@ -498,7 +499,7 @@ instance's greatest bottleneck.
log.explain_topic(f"Checking local cache for video {video_path.name}")
all_found_locally = True
for video in contained_videos:
transformed_path = self._transformer.transform(video)
transformed_path = self._to_local_video_path(video)
if transformed_path:
exists_locally = self._output_dir.resolve(transformed_path).exists()
all_found_locally = all_found_locally and exists_locally
@ -508,6 +509,11 @@ instance's greatest bottleneck.
log.explain("Missing at least one video, continuing with requests!")
return False
def _to_local_video_path(self, path: PurePath) -> Optional[PurePath]:
    """
    Map a video path to the path it is expected to have locally.

    The path is first run through the transformer. If that yields nothing, None is
    returned. Otherwise the transformed path is passed through the deduplicator's
    fixup_path and that result is returned.
    """
    transformed = self._transformer.transform(path)
    if not transformed:
        return None
    return self._deduplicator.fixup_path(transformed)
@anoncritical
@_iorepeat(3, "downloading video")
async def _download_video(
@ -527,7 +533,7 @@ instance's greatest bottleneck.
log.explain(f"Using single video mode for {element.name}")
stream_element = stream_elements[0]
transformed_path = self._transformer.transform(original_path)
transformed_path = self._to_local_video_path(original_path)
if not transformed_path:
raise CrawlError(f"Download returned a path but transform did not for {original_path}")
@ -674,14 +680,14 @@ class KitShibbolethLogin:
# Equivalent: Click on "Mit KIT-Account anmelden" button in
# https://ilias.studium.kit.edu/login.php
url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
url = "https://ilias.studium.kit.edu/shib_login.php"
data = {
"sendLogin": "1",
"idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
"target": "/shib_login.php",
"home_organization_selection": "Mit KIT-Account anmelden",
"il_target": "",
"home_organization_selection": "Weiter",
}
soup: BeautifulSoup = await _post(sess, url, data)
soup: BeautifulSoup = await _shib_post(sess, url, data)
# Attempt to login using credentials, if necessary
while not self._login_successful(soup):
@ -761,3 +767,33 @@ class KitShibbolethLogin:
async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
    """POST data to url using the given session and return the soupified response body."""
    async with session.post(url, data=data) as response:
        body = await response.read()
    return soupify(body)
async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
    """
    POST the login form and follow the two subsequent redirects by hand.

    aiohttp unescapes '/' and ':' in URL query parameters, which is not RFC compliant and is
    rejected by Shibboleth. Automatic redirects are therefore disabled for every request: each
    location header is read manually and turned into an already-encoded URL object.

    Raises a CrawlWarning if the POST response has no location header, or if the first
    redirect response has no location header or its URL has no host. Returns the soupified
    body of the final response.
    """
    async with session.post(url, data=data, allow_redirects=False) as post_response:
        first_location = post_response.headers.get("location")
        if not first_location:
            raise CrawlWarning(f"Login failed, no location header present at {url}")
        # encoded=True tells yarl the value is already encoded, so it is not re-quoted
        correct_url = yarl.URL(first_location, encoded=True)

        async with session.get(correct_url, allow_redirects=False) as redirect_response:
            origin = yarl.URL(redirect_response.url)
            second_location = redirect_response.headers.get("location")
            if not (second_location and origin.host):
                raise CrawlWarning(f"Login failed, no location header present at {correct_url}")

            # The second location is used as the path on the scheme and host that answered
            final_url = yarl.URL.build(
                scheme=origin.scheme,
                host=origin.host,
                path=second_location,
                encoded=True,
            )

            async with session.get(final_url, allow_redirects=False) as final_response:
                return soupify(await final_response.read())

View File

@ -56,6 +56,12 @@ class Deduplicator:
log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility")
return new_path
def fixup_path(self, path: PurePath) -> PurePath:
    """
    Return the path adjusted for windows compatibility when windows paths are enabled.

    If windows paths are not enabled, the path is returned unchanged.
    """
    if not self._windows_paths:
        return path
    return self._fixup_for_windows(path)
def mark(self, path: PurePath) -> PurePath:
if self._windows_paths:
path = self._fixup_for_windows(path)

View File

@ -1,2 +1,2 @@
NAME = "PFERD"
VERSION = "3.3.0"
VERSION = "3.3.1"