diff --git a/.gitignore b/.gitignore
index 04964e5..bb72d15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+**/__pycache__/
 bin/
 include/
 lib/
diff --git a/PFERD/ReadWriteLock.py b/PFERD/ReadWriteLock.py
new file mode 100644
index 0000000..1d3e767
--- /dev/null
+++ b/PFERD/ReadWriteLock.py
@@ -0,0 +1,106 @@
+# From https://charemza.name/blog/posts/python/asyncio/read-write-lock/
+# https://gist.github.com/michalc/ab9bd571cfab09216c0316f2302a76b0#file-asyncio_read_write_lock-py
+
+import asyncio
+import collections
+import contextlib
+
+
+class _ReadWaiter(asyncio.Future):
+    """Future awaited by a task that is queued for read access."""
+
+
+class _WriteWaiter(asyncio.Future):
+    """Future awaited by a task that is queued for write access."""
+
+
+class ReadWriteLock:
+    """Asyncio lock that admits many readers or a single writer.
+
+    Waiters are queued in arrival order and only resolved from the head of
+    the queue, so a queued writer holds back readers that arrive after it.
+    """
+
+    def __init__(self):
+        # Pending _ReadWaiter/_WriteWaiter futures, oldest first.
+        self._waiters = collections.deque()
+        self._reads_held = 0
+        self._write_held = False
+
+    def _pop_queued_waiters(self, waiter_type):
+        """Pop and yield waiters of waiter_type from the head of the queue.
+
+        Cancelled waiters at the head are dropped without being yielded.
+        Iteration stops at the first live waiter of another type, or when
+        the queue is empty.
+        """
+        while True:
+            correct_type = self._waiters and isinstance(self._waiters[0], waiter_type)
+            cancelled = self._waiters and self._waiters[0].cancelled()
+
+            if correct_type or cancelled:
+                waiter = self._waiters.popleft()
+
+            if correct_type and not cancelled:
+                yield waiter
+
+            if not correct_type and not cancelled:
+                break
+
+    def _resolve_queued_waiters(self):
+        """Grant the lock to every waiter at the head that may hold it now."""
+        if not self._write_held:
+            for waiter in self._pop_queued_waiters(_ReadWaiter):
+                self._reads_held += 1
+                waiter.set_result(None)
+
+        if not self._write_held and not self._reads_held:
+            for waiter in self._pop_queued_waiters(_WriteWaiter):
+                self._write_held = True
+                waiter.set_result(None)
+                # Only one writer may hold the lock at a time.
+                break
+
+    def _on_read_release(self):
+        self._reads_held -= 1
+
+    def _on_write_release(self):
+        self._write_held = False
+
+    @contextlib.asynccontextmanager
+    async def _acquire(self, waiter_type, on_release):
+        """Queue a waiter_type waiter, hold the lock for the body, then release.
+
+        on_release undoes the bookkeeping done when the waiter was resolved.
+        """
+        waiter = waiter_type()
+        self._waiters.append(waiter)
+        self._resolve_queued_waiters()
+
+        try:
+            await waiter
+        except asyncio.CancelledError:
+            # The lock may already have been granted to this waiter before the
+            # cancellation was delivered; give it back so it is not leaked.
+            if waiter.done() and not waiter.cancelled():
+                on_release()
+            self._resolve_queued_waiters()
+            raise
+
+        try:
+            yield
+        finally:
+            on_release()
+            self._resolve_queued_waiters()
+
+    @contextlib.asynccontextmanager
+    async def read(self):
+        """Async context manager holding the lock for shared (read) access."""
+        async with self._acquire(_ReadWaiter, self._on_read_release):
+            yield
+
+    @contextlib.asynccontextmanager
+    async def write(self):
+        """Async context manager holding the lock for exclusive (write) access."""
+        async with self._acquire(_WriteWaiter, self._on_write_release):
+            yield
diff --git a/PFERD/__init__.py b/PFERD/__init__.py
new file mode 100644
index 0000000..960cc7a
--- /dev/null
+++ b/PFERD/__init__.py
@@ -0,0 +1,7 @@
+from .authenticator import *
+
+__all__ = (
+    authenticator.__all__
+)
+
+LOG_FORMAT = "[%(levelname)s] %(message)s"
diff --git a/PFERD/authenticator.py b/PFERD/authenticator.py
new file mode 100644
index 0000000..3876976
--- /dev/null
+++ b/PFERD/authenticator.py
@@ -0,0 +1,198 @@
+import aiohttp
+import asyncio
+import bs4
+import getpass
+import logging
+import time
+import urllib.parse
+
+from .ReadWriteLock import ReadWriteLock
+
+__all__ = [
+    "OutOfTriesException",
+    "ShibbolethAuthenticator",
+]
+logger = logging.getLogger(__name__)
+
+
+class OutOfTriesException(Exception):
+    """Raised when a request still fails after RETRY_ATTEMPTS tries."""
+
+
+class ShibbolethAuthenticator:
+    """Fetch ILIAS pages, logging in via the KIT Shibboleth IdP when needed."""
+
+    RETRY_ATTEMPTS = 5
+    RETRY_DELAY = 1 # seconds
+
+    def __init__(self, cookie_path=None):
+        """Create the HTTP session, loading cookies from cookie_path if it exists."""
+        self._cookie_path = cookie_path
+
+        # Authentication and file/page download should not happen at the same time.
+        # Authenticating counts as writing, file/page downloads as reading.
+        self._lock = ReadWriteLock()
+
+        # Only one self._authenticate() should be started, even if multiple
+        # self.get_webpage()s notice they're not logged in.
+        # If self._event is not None, authenticating is currently in progress.
+        self._event = None
+
+        jar = aiohttp.CookieJar()
+        if self._cookie_path is not None:
+            try:
+                jar.load(self._cookie_path)
+            except FileNotFoundError:
+                pass
+        self._session = aiohttp.ClientSession(cookie_jar=jar)
+
+    async def close(self):
+        """Close the underlying HTTP session."""
+        await self._session.close()
+
+    async def _request(self, method, url, params=None, data=None):
+        """Send a request, retrying when the server drops the connection.
+
+        Returns a (final url, response text) tuple. Raises OutOfTriesException
+        once RETRY_ATTEMPTS tries have failed.
+        """
+        for t in range(self.RETRY_ATTEMPTS):
+            try:
+                async with self._session.request(method, url, params=params, data=data) as resp:
+                    text = await resp.text()
+                    return resp.url, text
+            except aiohttp.ServerDisconnectedError:
+                if t + 1 == self.RETRY_ATTEMPTS:
+                    # No further try follows, so don't wait for one.
+                    break
+                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
+                await asyncio.sleep(self.RETRY_DELAY)
+
+        logger.error("Could not retrieve url")
+        raise OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
+
+    async def _post(self, url, params=None, data=None):
+        """POST to url with retries; see _request()."""
+        return await self._request("POST", url, params=params, data=data)
+
+    async def _get(self, url, params=None):
+        """GET url with retries; see _request()."""
+        return await self._request("GET", url, params=params)
+
+    def _login_successful(self, soup):
+        """Return whether soup holds both the SAMLResponse and RelayState inputs."""
+        saml_response = soup.find("input", {"name": "SAMLResponse"})
+        relay_state = soup.find("input", {"name": "RelayState"})
+        return saml_response is not None and relay_state is not None
+
+    def _save_cookies(self):
+        """Write the session's cookies to cookie_path, if one was given."""
+        if self._cookie_path is None:
+            return
+        logger.info(f"Saving cookies to {self._cookie_path!r}")
+        self._session.cookie_jar.save(self._cookie_path)
+
+    # WARNING: Only use self._ensure_authenticated() to authenticate,
+    # don't call self._authenticate() itself.
+    async def _authenticate(self):
+        """Log in to ILIAS via Shibboleth, prompting for credentials if needed."""
+        async with self._lock.write():
+            # Equivalent: Click on "Mit KIT-Account anmelden" button in
+            # https://ilias.studium.kit.edu/login.php
+            url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
+            data = {
+                "sendLogin": "1",
+                "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
+                "target": "/shib_login.php",
+                "home_organization_selection": "Mit KIT-Account anmelden",
+            }
+            logger.debug("Begin authentication process with ILIAS")
+            url, text = await self._post(url, data=data)
+            soup = bs4.BeautifulSoup(text, "html.parser")
+
+            # Attempt to login using credentials, if necessary
+            while not self._login_successful(soup):
+                form = soup.find("form", {"class": "form2", "method": "post"})
+                action = form["action"]
+
+                print("Please enter Shibboleth credentials.")
+                username = getpass.getpass(prompt="Username: ")
+                password = getpass.getpass(prompt="Password: ")
+
+                # Equivalent: Enter credentials in
+                # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
+                url = "https://idp.scc.kit.edu" + action
+                data = {
+                    "_eventId_proceed": "",
+                    "j_username": username,
+                    "j_password": password,
+                }
+                logger.debug("Attempt to log in to Shibboleth using credentials")
+                url, text = await self._post(url, data=data)
+                soup = bs4.BeautifulSoup(text, "html.parser")
+
+                if not self._login_successful(soup):
+                    print("Incorrect credentials.")
+
+            # Saving progress: Successfully authenticated with Shibboleth
+            self._save_cookies()
+
+            relay_state = soup.find("input", {"name": "RelayState"})["value"]
+            saml_response = soup.find("input", {"name": "SAMLResponse"})["value"]
+
+            # Equivalent: Being redirected via JS automatically
+            # (or clicking "Continue" if you have JS disabled)
+            url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
+            data = {
+                "RelayState": relay_state,
+                "SAMLResponse": saml_response,
+            }
+            logger.debug("Redirect back to ILIAS with login information")
+            url, text = await self._post(url, data=data)
+
+            # Saving progress: Successfully authenticated with Ilias
+            self._save_cookies()
+
+    async def _ensure_authenticated(self):
+        """Authenticate once, however many callers ask for it concurrently."""
+        if self._event is not None:
+            # Someone else is already authenticating; wait for them to finish.
+            await self._event.wait()
+            return
+
+        event = self._event = asyncio.Event()
+        logger.info("Not logged in, authentication required.")
+        try:
+            await self._authenticate()
+        finally:
+            # Reset the marker so a later logout triggers a fresh login, and
+            # wake the waiters even on failure so they don't hang forever.
+            self._event = None
+            event.set()
+
+    def _is_logged_in(self, soup):
+        """Return whether soup contains the "userlog" list item."""
+        userlog = soup.find("li", {"id": "userlog"})
+        return userlog is not None
+
+    async def get_webpage(self, ref_id):
+        """Return the parsed page of folder ref_id, logging in first if needed."""
+        url = "https://ilias.studium.kit.edu/goto.php"
+        params = {"target": f"fold_{ref_id}"}
+
+        while True:
+            async with self._lock.read():
+                logger.info(f"Getting {url} {params}")
+                _, text = await self._get(url, params=params)
+                soup = bs4.BeautifulSoup(text, "html.parser")
+
+            # The read lock is released by now: authenticating takes the
+            # write lock, which is never granted while a read lock is held.
+            if self._is_logged_in(soup):
+                return soup
+            else:
+                await self._ensure_authenticated()
+
+    async def download_file(self, file_id):
+        async with self._lock.read():
+            pass # TODO
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..4c40bb6
--- /dev/null
+++ b/test.py
@@ -0,0 +1,33 @@
+import PFERD
+import asyncio
+import logging
+import os
+import sys
+
+logging.basicConfig(level=logging.DEBUG, format=PFERD.LOG_FORMAT)
+#logging.basicConfig(level=logging.INFO, format=PFERD.LOG_FORMAT)
+
+async def test_download():
+    auth = PFERD.ShibbolethAuthenticator(cookie_path="cookie_jar")
+    try:
+        soup = await auth.get_webpage("885157")
+    finally:
+        # Close the HTTP session even if fetching the page fails.
+        await auth.close()
+    if soup:
+        print("Soup acquired!")
+    else:
+        print("No soup acquired :(")
+
+def main():
+    #print(f" os.getcwd(): {os.getcwd()}")
+    #print(f" sys.argv[0]: {sys.argv[0]}")
+    #print(f" both: {os.path.dirname(os.getcwd() + '/' + sys.argv[0])}")
+    #print(f" __file__: {__file__}")
+    #print(f"stackoverflow: {os.path.dirname(os.path.abspath(__file__))}")
+
+    #asyncio.run(test_download(), debug=True)
+    asyncio.run(test_download())
+
+if __name__ == "__main__":
+    main()