|
|
|
@ -4,6 +4,7 @@ from pathlib import PurePath
|
|
|
|
|
from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union, cast
|
|
|
|
|
|
|
|
|
|
import aiohttp
|
|
|
|
|
import yarl
|
|
|
|
|
from aiohttp import hdrs
|
|
|
|
|
from bs4 import BeautifulSoup, Tag
|
|
|
|
|
|
|
|
|
@ -498,7 +499,7 @@ instance's greatest bottleneck.
|
|
|
|
|
log.explain_topic(f"Checking local cache for video {video_path.name}")
|
|
|
|
|
all_found_locally = True
|
|
|
|
|
for video in contained_videos:
|
|
|
|
|
transformed_path = self._transformer.transform(video)
|
|
|
|
|
transformed_path = self._to_local_video_path(video)
|
|
|
|
|
if transformed_path:
|
|
|
|
|
exists_locally = self._output_dir.resolve(transformed_path).exists()
|
|
|
|
|
all_found_locally = all_found_locally and exists_locally
|
|
|
|
@ -508,6 +509,11 @@ instance's greatest bottleneck.
|
|
|
|
|
log.explain("Missing at least one video, continuing with requests!")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def _to_local_video_path(self, path: PurePath) -> Optional[PurePath]:
    """
    Map a video path to the path it is expected to have on disk.

    The path is first passed through ``self._transformer.transform``. If the
    transformer yields nothing, ``None`` is returned. Otherwise the transformed
    path is handed to ``self._deduplicator.fixup_path`` and that result is
    returned.
    """
    transformed = self._transformer.transform(path)
    if not transformed:
        # The transformer produced no path for this video.
        return None
    return self._deduplicator.fixup_path(transformed)
|
|
|
|
|
|
|
|
|
|
@anoncritical
|
|
|
|
|
@_iorepeat(3, "downloading video")
|
|
|
|
|
async def _download_video(
|
|
|
|
@ -527,7 +533,7 @@ instance's greatest bottleneck.
|
|
|
|
|
log.explain(f"Using single video mode for {element.name}")
|
|
|
|
|
stream_element = stream_elements[0]
|
|
|
|
|
|
|
|
|
|
transformed_path = self._transformer.transform(original_path)
|
|
|
|
|
transformed_path = self._to_local_video_path(original_path)
|
|
|
|
|
if not transformed_path:
|
|
|
|
|
raise CrawlError(f"Download returned a path but transform did not for {original_path}")
|
|
|
|
|
|
|
|
|
@ -674,14 +680,14 @@ class KitShibbolethLogin:
|
|
|
|
|
|
|
|
|
|
# Equivalent: Click on "Mit KIT-Account anmelden" button in
|
|
|
|
|
# https://ilias.studium.kit.edu/login.php
|
|
|
|
|
url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
|
|
|
|
|
url = "https://ilias.studium.kit.edu/shib_login.php"
|
|
|
|
|
data = {
|
|
|
|
|
"sendLogin": "1",
|
|
|
|
|
"idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
|
|
|
|
|
"target": "/shib_login.php",
|
|
|
|
|
"home_organization_selection": "Mit KIT-Account anmelden",
|
|
|
|
|
"il_target": "",
|
|
|
|
|
"home_organization_selection": "Weiter",
|
|
|
|
|
}
|
|
|
|
|
soup: BeautifulSoup = await _post(sess, url, data)
|
|
|
|
|
soup: BeautifulSoup = await _shib_post(sess, url, data)
|
|
|
|
|
|
|
|
|
|
# Attempt to login using credentials, if necessary
|
|
|
|
|
while not self._login_successful(soup):
|
|
|
|
@ -761,3 +767,33 @@ class KitShibbolethLogin:
|
|
|
|
|
async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
    """
    POST ``data`` to ``url`` using ``session`` and return the response body
    run through ``soupify``.
    """
    async with session.post(url, data=data) as response:
        body = await response.read()
        return soupify(body)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
    """
    POST ``data`` to ``url`` and follow the next two redirects by hand.

    aiohttp unescapes '/' and ':' in URL query parameters, which is not RFC
    compliant and is rejected by Shibboleth. To avoid that, automatic redirect
    handling is disabled for every request here: the location header is read
    manually and turned into an already-encoded URL object before the next
    request is sent.

    Raises CrawlWarning if either of the first two responses carries no
    location header (or the second response's URL has no host).
    """
    async with session.post(url, data=data, allow_redirects=False) as post_response:
        first_target = post_response.headers.get("location")
        if not first_target:
            raise CrawlWarning(f"Login failed, no location header present at {url}")
        # encoded=True: take the header value verbatim instead of re-quoting it.
        correct_url = yarl.URL(first_target, encoded=True)

        async with session.get(correct_url, allow_redirects=False) as redirect_response:
            second_target = redirect_response.headers.get("location")
            current_url = yarl.URL(redirect_response.url)

            if not second_target or not current_url.host:
                raise CrawlWarning(f"Login failed, no location header present at {correct_url}")

            # The second location header is used as the path component and is
            # combined with the scheme and host of the URL that was just requested.
            correct_url = yarl.URL.build(
                scheme=current_url.scheme,
                host=current_url.host,
                path=second_target,
                encoded=True,
            )

            async with session.get(correct_url, allow_redirects=False) as final_response:
                page = await final_response.read()
                return soupify(page)
|
|
|
|
|