From 9bae030186888839431a3c5f3dbb7a223d30460d Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 25 Apr 2019 18:52:48 +0000 Subject: [PATCH] Move ilias stuff from aiohttp to requests --- PFERD/__init__.py | 16 +- PFERD/ilias.py | 33 ++--- PFERD/ilias_authenticators.py | 265 ++++++++++++++-------------------- PFERD/utils.py | 15 +- 4 files changed, 130 insertions(+), 199 deletions(-) diff --git a/PFERD/__init__.py b/PFERD/__init__.py index 13c9f2b..232a361 100644 --- a/PFERD/__init__.py +++ b/PFERD/__init__.py @@ -1,14 +1,12 @@ -from .ffm import * +#from .ffm import * from .ilias import * -from .norbert import * +#from .norbert import * from .utils import * -__all__ = ( - ffm.__all__ + - ilias.__all__ + - norbert.__all__ + - utils.__all__ + - [] -) +__all__ = [] +#__all__ += ffm.__all__ +__all__ += ilias.__all__ +#__all__ += norbert.__all__ +__all__ += utils.__all__ LOG_FORMAT = "[%(levelname)s] %(message)s" diff --git a/PFERD/ilias.py b/PFERD/ilias.py index aace379..96ed7b5 100644 --- a/PFERD/ilias.py +++ b/PFERD/ilias.py @@ -1,19 +1,15 @@ # ILIAS -import aiohttp -import asyncio -import bs4 import logging import pathlib import re -from .organizer import Organizer -from .ilias_authenticators import ShibbolethAuthenticator -from . 
import utils +import bs4 -__all__ = [ - "ILIAS", -] +from .ilias_authenticators import ShibbolethAuthenticator +from .organizer import Organizer + +__all__ = ["ILIAS"] logger = logging.getLogger(__name__) class ILIAS: @@ -25,7 +21,7 @@ class ILIAS: self._auth = ShibbolethAuthenticator(base_path / cookie_file) - async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True): + def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True): logging.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.") sync_path = pathlib.Path(self.base_path, to_dir) @@ -33,17 +29,14 @@ class ILIAS: orga.clean_temp_dir() - files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter) - await self._download(orga, files, transform) + files = self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter) + self._download(orga, files, transform) orga.clean_sync_dir() orga.clean_temp_dir() - async def close(self): - await self._auth.close() - - async def _crawl(self, dir_path, dir_id, filter_): - soup = await self._auth.get_webpage(dir_id) + def _crawl(self, dir_path, dir_id, filter_): + soup = self._auth.get_webpage(dir_id) found_files = [] @@ -59,19 +52,19 @@ class ILIAS: logger.debug(f"Found dir {path}") if filter_(path): logger.info(f"Searching {path}") - files = await self._crawl(path, ref_id, filter_) + files = self._crawl(path, ref_id, filter_) found_files.extend(files) else: logger.info(f"Not searching {path}") return found_files - async def _download(self, orga, files, transform): + def _download(self, orga, files, transform): for (path, file_id) in sorted(files): to_path = transform(path) if to_path is not None: temp_path = orga.temp_file() - await self._auth.download_file(file_id, temp_path) + self._auth.download_file(file_id, temp_path) orga.add_file(temp_path, to_path) def _find_files(self, soup): diff --git a/PFERD/ilias_authenticators.py b/PFERD/ilias_authenticators.py index 9a87e2d..e1451ab 
100644 --- a/PFERD/ilias_authenticators.py +++ b/PFERD/ilias_authenticators.py @@ -7,28 +7,21 @@ # I think the only other method is the password prompt when clicking the log in # button. -import aiohttp -import asyncio -import bs4 import getpass +import http.cookiejar import logging import time -import urllib.parse -from .read_write_lock import ReadWriteLock -from . import utils +import bs4 +import requests -__all__ = [ - "ShibbolethAuthenticator", -] +from .utils import ContentTypeException, stream_to_path + +__all__ = ["ShibbolethAuthenticator"] logger = logging.getLogger(__name__) class ShibbolethAuthenticator: - ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php" - - RETRY_ATTEMPTS = 5 - RETRY_DELAY = 1 # seconds CHUNK_SIZE = 1024**2 ALLOWED_CONTENT_TYPES = [ @@ -41,190 +34,144 @@ class ShibbolethAuthenticator: "image/png", ] - def __init__(self, cookie_path=None): - self._cookie_path = cookie_path + def __init__(self, cookie_file) -> None: + # Because LWPCookieJar insists on the path being str-like instead of + # Path-like. + cookie_file = str(cookie_file) - # Authentication and file/page download should not happen at the same time. - # Authenticating counts as writing, file/page downloads as reading. - self._lock = ReadWriteLock() + cookies = http.cookiejar.LWPCookieJar(cookie_file) + try: + logger.info(f"Loading old cookies from {cookie_file!r}") + cookies.load(ignore_discard=True) + except (FileNotFoundError, http.cookiejar.LoadError): + logger.warn(f"No (valid) cookie file found at {cookie_file!r}, ignoring...") - # Only one self._authenticate() should be started, even if multiple self.get_page()s - # notice they're logged in. - # If self._event is not None, authenticating is currently in progress. 
- self._event = None + self._session = requests.Session() + self._session.cookies = cookies - jar = aiohttp.CookieJar() - if self._cookie_path is not None: - try: - jar.load(self._cookie_path) - except FileNotFoundError: - pass - self._session = aiohttp.ClientSession(cookie_jar=jar) + def _authenticate(self): + """ + Performs the ILIAS Shibboleth authentication dance and saves the login + cookies it receives. - async def close(self): - await self._session.close() + This function should only be called whenever it is detected that you're + not logged in. The cookies obtained should be good for a few minutes, + maybe even an hour or two. + """ - async def _post(self, url, params=None, data=None): - for t in range(self.RETRY_ATTEMPTS): - try: - async with self._session.post(url, params=params, data=data) as resp: - text = await resp.text() - return resp.url, text - except aiohttp.client_exceptions.ServerDisconnectedError: - logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") - await asyncio.sleep(self.RETRY_DELAY) - - logger.error(f"Could not POST {url} params:{params} data:{data}.") - raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") - - async def _get(self, url, params=None): - for t in range(self.RETRY_ATTEMPTS): - try: - async with self._session.get(url, params=params) as resp: - text = await resp.text() - return resp.url, text - except aiohttp.client_exceptions.ServerDisconnectedError: - logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") - await asyncio.sleep(self.RETRY_DELAY) - - logger.error(f"Could not GET {url} params:{params}.") - raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") - - def _login_successful(self, soup): - saml_response = soup.find("input", {"name": "SAMLResponse"}) - relay_state = soup.find("input", {"name": "RelayState"}) - return saml_response is not None 
and relay_state is not None - - def _save_cookies(self): - logger.info(f"Saving cookies to {self._cookie_path}") - if self._cookie_path is not None: - self._session.cookie_jar.save(self._cookie_path) - - # WARNING: Only use self._ensure_authenticated() to authenticate, - # don't call self._authenticate() itself. - async def _authenticate(self): - async with self._lock.write(): - # Equivalent: Click on "Mit KIT-Account anmelden" button in - # https://ilias.studium.kit.edu/login.php - url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login" - data = { + # Equivalent: Click on "Mit KIT-Account anmelden" button in + # https://ilias.studium.kit.edu/login.php + logger.debug("Begin authentication process with ILIAS") + url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login" + data = { "sendLogin": "1", "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", "target": "/shib_login.php", "home_organization_selection": "Mit KIT-Account anmelden", - } - logger.debug("Begin authentication process with ILIAS") - url, text = await self._post(url, data=data) - soup = bs4.BeautifulSoup(text, "html.parser") + } + response = self._session.post(url, data=data) + soup = bs4.BeautifulSoup(response.text, "html.parser") - # Attempt to login using credentials, if necessary - while not self._login_successful(soup): - form = soup.find("form", {"class": "form2", "method": "post"}) - action = form["action"] + # Attempt to login using credentials, if necessary + while not self._login_successful(soup): + # Searching the form here so that this fails before asking for + # credentials rather than after asking. 
+ form = soup.find("form", {"class": "form2", "method": "post"}) + action = form["action"] - print("Please enter Shibboleth credentials.") - username = getpass.getpass(prompt="Username: ") - password = getpass.getpass(prompt="Password: ") + print("Please enter Shibboleth credentials.") + username = getpass.getpass(prompt="Username: ") + password = getpass.getpass(prompt="Password: ") - # Equivalent: Enter credentials in - # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO - url = "https://idp.scc.kit.edu" + action - data = { + # Equivalent: Enter credentials in + # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO + logger.debug("Attempt to log in to Shibboleth using credentials") + url = "https://idp.scc.kit.edu" + action + data = { "_eventId_proceed": "", "j_username": username, "j_password": password, - } - logger.debug("Attempt to log in to Shibboleth using credentials") - url, text = await self._post(url, data=data) - soup = bs4.BeautifulSoup(text, "html.parser") - - if not self._login_successful(soup): - print("Incorrect credentials.") - - # Saving progress: Successfully authenticated with Shibboleth - self._save_cookies() - - relay_state = soup.find("input", {"name": "RelayState"})["value"] - saml_response = soup.find("input", {"name": "SAMLResponse"})["value"] - - # Equivalent: Being redirected via JS automatically - # (or clicking "Continue" if you have JS disabled) - url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST" - data = { - "RelayState": relay_state, - "SAMLResponse": saml_response, } - logger.debug("Redirect back to ILIAS with login information") - url, text = await self._post(url, data=data) + response = self._session.post(url, data=data) + soup = bs4.BeautifulSoup(response.text, "html.parser") - # Saving progress: Successfully authenticated with Ilias - self._save_cookies() + if not self._login_successful(soup): + print("Incorrect credentials.") - async def _ensure_authenticated(self): - if self._event is None: - 
self._event = asyncio.Event() - logger.info("Not logged in, authentication required.") - await self._authenticate() - self._event.set() - self._event = None - else: - await self._event.wait() + # Saving progress + logger.info("Saving cookies (successfully authenticated with Shibboleth)") + self._session.cookies.save(ignore_discard=True) + + # Equivalent: Being redirected via JS automatically + # (or clicking "Continue" if you have JS disabled) + logger.debug("Redirect back to ILIAS with login information") + relay_state = soup.find("input", {"name": "RelayState"}) + saml_response = soup.find("input", {"name": "SAMLResponse"}) + url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST" + data = { # using the info obtained in the while loop above + "RelayState": relay_state["value"], + "SAMLResponse": saml_response["value"], + } + self._session.post(url, data=data) + + # Saving progress + logger.info("Saving cookies (successfully authenticated with ILIAS)") + self._session.cookies.save(ignore_discard=True) + + def _login_successful(self, soup): + relay_state = soup.find("input", {"name": "RelayState"}) + saml_response = soup.find("input", {"name": "SAMLResponse"}) + return relay_state is not None and saml_response is not None def _is_logged_in(self, soup): userlog = soup.find("li", {"id": "userlog"}) return userlog is not None - async def get_webpage_refid(self, ref_id): - return await self.get_webpage(f"fold_{ref_id}") - - async def get_webpage(self, object_id): + def get_webpage(self, object_id): params = {"target": object_id} while True: - async with self._lock.read(): - logger.debug(f"Getting {self.ILIAS_GOTO} {params}") - _, text = await self._get(self.ILIAS_GOTO, params=params) - soup = bs4.BeautifulSoup(text, "html.parser") + logger.debug(f"Getting {self.ILIAS_GOTO} {params}") + response = self._session.get(self.ILIAS_GOTO, params=params) + soup = bs4.BeautifulSoup(response.text, "html.parser") if self._is_logged_in(soup): return soup else: - await 
self._ensure_authenticated() + logger.info("Not logged in, authenticating...") + self._authenticate() - async def _download(self, url, params, to_path): - for t in range(self.RETRY_ATTEMPTS): - try: - async with self._session.get(url, params=params) as resp: - if resp.content_type in self.ALLOWED_CONTENT_TYPES: - # Yay, we got the file (as long as it's a PDF) - await utils.stream_to_path(resp, to_path) - return True - elif resp.content_type == "text/html": - # Dangit, we're probably not logged in. - text = await resp.text() - soup = bs4.BeautifulSoup(text, "html.parser") - if self._is_logged_in(soup): - raise utils.UnknownFileTypeException(f"Attempting to download a web page (use get_webpage() instead).") - return False - else: - # What *did* we get? - raise utils.UnknownFileTypeException(f"Unknown file of type {resp.content_type}.") + def get_webpage_by_refid(self, ref_id): + return self.get_webpage(f"fold_{ref_id}") - except aiohttp.client_exceptions.ServerDisconnectedError: - logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") - await asyncio.sleep(self.RETRY_DELAY) + def _download(self, url, params, to_path): + with self._session.get(url, params=params, stream=True) as response: + content_type = response.headers["content-type"] - logger.error(f"Could not download {url} params:{params}.") - raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") + if content_type in self.ALLOWED_CONTENT_TYPES: + # Yay, we got the file :) + stream_to_path(response, to_path) + return True + elif content_type == "text/html": + # Dangit, we're probably not logged in. + soup = bs4.BeautifulSoup(response.text, "html.parser") + if self._is_logged_in(soup): + raise ContentTypeException( + "Attempting to download a web page, not a file") + return False + else: + # What *did* we get? 
+ raise ContentTypeException( + f"Unknown file of type {content_type}") - async def download_file(self, file_id, to_path): + def download_file(self, file_id, to_path): params = {"target": file_id} while True: - async with self._lock.read(): - success = await self._download(self.ILIAS_GOTO, params, to_path) + success = self._download(self.ILIAS_GOTO, params, to_path) if success: return else: - await self._ensure_authenticated() + logger.info("Not logged in, authenticating...") + self._authenticate() diff --git a/PFERD/utils.py b/PFERD/utils.py index acdfe58..0a9fb7a 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -6,8 +6,7 @@ __all__ = [ "move", "rename", "stream_to_path", - "OutOfTriesException", - "UnknownFileTypeException", + "ContentTypeException", "FileNotFoundException", ] @@ -22,18 +21,12 @@ def move(path, from_folders, to_folders): def rename(path, to_name): return pathlib.PurePath(*path.parts[:-1], to_name) -async def stream_to_path(resp, to_path, chunk_size=1024**2): +def stream_to_path(response, to_path, chunk_size=1024**2): with open(to_path, 'wb') as fd: - while True: - chunk = await resp.content.read(chunk_size) - if not chunk: - break + for chunk in response.iter_content(chunk_size=chunk_size): fd.write(chunk) -class OutOfTriesException(Exception): - pass - -class UnknownFileTypeException(Exception): +class ContentTypeException(Exception): pass class FileNotFoundException(Exception):