diff --git a/CHANGELOG.md b/CHANGELOG.md
index 132351b..41ee3d5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,10 @@ ambiguous situations.
 
 ## Unreleased
 
+### Fixed
+- Shibboleth login fixed. It was broken due to URL parser changes and really
+  *unfortunate* behaviour by aiohttp.
+
 ## 3.3.0 - 2022-01-09
 
 ### Added
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index c3b2342..c26ce8b 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -4,6 +4,7 @@ from pathlib import PurePath
 from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union, cast
 
 import aiohttp
+import yarl
 from aiohttp import hdrs
 from bs4 import BeautifulSoup, Tag
 
@@ -674,14 +675,14 @@ class KitShibbolethLogin:
 
         # Equivalent: Click on "Mit KIT-Account anmelden" button in
         # https://ilias.studium.kit.edu/login.php
-        url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
+        url = "https://ilias.studium.kit.edu/shib_login.php"
         data = {
             "sendLogin": "1",
             "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
-            "target": "/shib_login.php",
-            "home_organization_selection": "Mit KIT-Account anmelden",
+            "il_target": "",
+            "home_organization_selection": "Weiter",
         }
-        soup: BeautifulSoup = await _post(sess, url, data)
+        soup: BeautifulSoup = await _shib_post(sess, url, data)
 
         # Attempt to login using credentials, if necessary
         while not self._login_successful(soup):
@@ -761,3 +762,46 @@ class KitShibbolethLogin:
 async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
     async with session.post(url, data=data) as response:
         return soupify(await response.read())
+
+
+async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
+    """
+    Post the login form and follow the resulting redirects by hand.
+
+    aiohttp unescapes '/' and ':' in URL query parameters, which is not RFC compliant and is
+    rejected by Shibboleth. The requests are therefore unrolled manually: redirects are never
+    followed automatically, the location header of each hop is read and the next request is
+    sent to a yarl URL created with encoded=True.
+
+    Returns the soupified body of the last response. Raises a CrawlWarning if the initial POST
+    has no location header, or if the second hop has an empty one or no usable host.
+    """
+    async with session.post(url, data=data, allow_redirects=False) as response:
+        location = response.headers.get("location")
+        if not location:
+            raise CrawlWarning(f"Login failed, no location header present at {url}")
+        # encoded=True tells yarl the value is already encoded and must not be requoted
+        correct_url = yarl.URL(location, encoded=True)
+
+    async with session.get(correct_url, allow_redirects=False) as response:
+        location = response.headers.get("location")
+        # No further redirect: this response already is the final page (presumably the IdP
+        # answers directly when it still has a valid session), so return it instead of failing
+        if location is None:
+            return soupify(await response.read())
+
+        as_yarl = yarl.URL(response.url)
+        if not location or not as_yarl.host:
+            raise CrawlWarning(f"Login failed, no location header present at {correct_url}")
+
+        # NOTE(review): the location is used as a path on the responding host; only scheme
+        # and host are carried over (no port) - confirm the IdP never redirects elsewhere
+        correct_url = yarl.URL.build(
+            scheme=as_yarl.scheme,
+            host=as_yarl.host,
+            path=location,
+            encoded=True
+        )
+
+    async with session.get(correct_url, allow_redirects=False) as response:
+        return soupify(await response.read())