Move ilias stuff from aiohttp to requests

commit 9bae030186
parent 0e536f476a
Author: Joscha
Date:   2019-04-25 18:52:48 +00:00

4 changed files with 130 additions and 199 deletions
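
The pattern is the same in every file below: async coroutines that awaited aiohttp calls become plain methods on a shared requests.Session. A minimal sketch of the before/after call shape, distilled from the diffs (URL and target are examples):

import requests

# Before, with aiohttp (only legal inside a coroutine):
#     async with session.get(url, params=params) as resp:
#         text = await resp.text()
#
# After, with requests: a blocking call on a long-lived Session.
session = requests.Session()
response = session.get("https://ilias.studium.kit.edu/goto.php",
                       params={"target": "fold_12345"})
text = response.text  # body is available directly, no await needed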

__init__.py

@@ -1,14 +1,12 @@
-from .ffm import *
+#from .ffm import *
 from .ilias import *
-from .norbert import *
+#from .norbert import *
 from .utils import *
 
-__all__ = (
-    ffm.__all__ +
-    ilias.__all__ +
-    norbert.__all__ +
-    utils.__all__ +
-    []
-)
+__all__ = []
+#__all__ += ffm.__all__
+__all__ += ilias.__all__
+#__all__ += norbert.__all__
+__all__ += utils.__all__
 
 LOG_FORMAT = "[%(levelname)s] %(message)s"
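
The new __init__.py builds __all__ incrementally instead of summing the lists in one expression, so disabling a synchronizer means commenting out exactly two matching lines (the star import and the +=). A tiny standalone sketch of the pattern, with stand-in lists instead of the real submodules:

ilias_all = ["ILIAS"]           # stands in for ilias.__all__
utils_all = ["move", "rename"]  # stands in for utils.__all__

__all__ = []
__all__ += ilias_all   # paired with: from .ilias import *
__all__ += utils_all   # paired with: from .utils import *
# __all__ += ffm_all   # ffm stays commented out, like its import

print(__all__)  # ['ILIAS', 'move', 'rename']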

ilias.py

@ -1,19 +1,15 @@
# ILIAS # ILIAS
import aiohttp
import asyncio
import bs4
import logging import logging
import pathlib import pathlib
import re import re
from .organizer import Organizer import bs4
from .ilias_authenticators import ShibbolethAuthenticator
from . import utils
__all__ = [ from .ilias_authenticators import ShibbolethAuthenticator
"ILIAS", from .organizer import Organizer
]
__all__ = ["ILIAS"]
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class ILIAS: class ILIAS:
@ -25,7 +21,7 @@ class ILIAS:
self._auth = ShibbolethAuthenticator(base_path / cookie_file) self._auth = ShibbolethAuthenticator(base_path / cookie_file)
async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True): def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
logging.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.") logging.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.")
sync_path = pathlib.Path(self.base_path, to_dir) sync_path = pathlib.Path(self.base_path, to_dir)
@ -33,17 +29,14 @@ class ILIAS:
orga.clean_temp_dir() orga.clean_temp_dir()
files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter) files = self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter)
await self._download(orga, files, transform) self._download(orga, files, transform)
orga.clean_sync_dir() orga.clean_sync_dir()
orga.clean_temp_dir() orga.clean_temp_dir()
async def close(self): def _crawl(self, dir_path, dir_id, filter_):
await self._auth.close() soup = self._auth.get_webpage(dir_id)
async def _crawl(self, dir_path, dir_id, filter_):
soup = await self._auth.get_webpage(dir_id)
found_files = [] found_files = []
@ -59,19 +52,19 @@ class ILIAS:
logger.debug(f"Found dir {path}") logger.debug(f"Found dir {path}")
if filter_(path): if filter_(path):
logger.info(f"Searching {path}") logger.info(f"Searching {path}")
files = await self._crawl(path, ref_id, filter_) files = self._crawl(path, ref_id, filter_)
found_files.extend(files) found_files.extend(files)
else: else:
logger.info(f"Not searching {path}") logger.info(f"Not searching {path}")
return found_files return found_files
async def _download(self, orga, files, transform): def _download(self, orga, files, transform):
for (path, file_id) in sorted(files): for (path, file_id) in sorted(files):
to_path = transform(path) to_path = transform(path)
if to_path is not None: if to_path is not None:
temp_path = orga.temp_file() temp_path = orga.temp_file()
await self._auth.download_file(file_id, temp_path) self._auth.download_file(file_id, temp_path)
orga.add_file(temp_path, to_path) orga.add_file(temp_path, to_path)
def _find_files(self, soup): def _find_files(self, soup):
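
With synchronize() no longer a coroutine, callers need neither an event loop nor await. A hypothetical usage sketch; the constructor arguments and the ref_id are invented for illustration, based on base_path, cookie_file and fold_{ref_id} appearing in the diff:

import pathlib

# Hypothetical values; the diff does not show the full constructor.
ilias = ILIAS(base_path=pathlib.Path("sync"), cookie_file="cookie_jar")

# Previously this needed something like
#     asyncio.get_event_loop().run_until_complete(ilias.synchronize(...))
# Now it is an ordinary blocking call:
ilias.synchronize(
    ref_id=12345,
    to_dir="some_course",
    filter=lambda path: "Tutorium" not in path.parts,  # example filter
)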

ilias_authenticators.py

@@ -7,28 +7,21 @@
 # I think the only other method is the password prompt when clicking the log in
 # button.
 
-import aiohttp
-import asyncio
-import bs4
 import getpass
+import http.cookiejar
 import logging
 import time
-import urllib.parse
 
-from .read_write_lock import ReadWriteLock
-from . import utils
+import bs4
+import requests
 
-__all__ = [
-    "ShibbolethAuthenticator",
-]
+from .utils import ContentTypeException, stream_to_path
+
+__all__ = ["ShibbolethAuthenticator"]
 
 logger = logging.getLogger(__name__)
 
 class ShibbolethAuthenticator:
     ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php"
 
-    RETRY_ATTEMPTS = 5
-    RETRY_DELAY = 1 # seconds
     CHUNK_SIZE = 1024**2
 
     ALLOWED_CONTENT_TYPES = [
@@ -41,190 +34,144 @@ class ShibbolethAuthenticator:
         "image/png",
     ]
 
-    def __init__(self, cookie_path=None):
-        self._cookie_path = cookie_path
-
-        # Authentication and file/page download should not happen at the same time.
-        # Authenticating counts as writing, file/page downloads as reading.
-        self._lock = ReadWriteLock()
-
-        # Only one self._authenticate() should be started, even if multiple self.get_page()s
-        # notice they're logged in.
-        # If self._event is not None, authenticating is currently in progress.
-        self._event = None
-
-        jar = aiohttp.CookieJar()
-        if self._cookie_path is not None:
-            try:
-                jar.load(self._cookie_path)
-            except FileNotFoundError:
-                pass
-        self._session = aiohttp.ClientSession(cookie_jar=jar)
-
-    async def close(self):
-        await self._session.close()
-
-    async def _post(self, url, params=None, data=None):
-        for t in range(self.RETRY_ATTEMPTS):
-            try:
-                async with self._session.post(url, params=params, data=data) as resp:
-                    text = await resp.text()
-                    return resp.url, text
-            except aiohttp.client_exceptions.ServerDisconnectedError:
-                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
-                await asyncio.sleep(self.RETRY_DELAY)
-        logger.error(f"Could not POST {url} params:{params} data:{data}.")
-        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
-
-    async def _get(self, url, params=None):
-        for t in range(self.RETRY_ATTEMPTS):
-            try:
-                async with self._session.get(url, params=params) as resp:
-                    text = await resp.text()
-                    return resp.url, text
-            except aiohttp.client_exceptions.ServerDisconnectedError:
-                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
-                await asyncio.sleep(self.RETRY_DELAY)
-        logger.error(f"Could not GET {url} params:{params}.")
-        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
-
-    def _login_successful(self, soup):
-        saml_response = soup.find("input", {"name": "SAMLResponse"})
-        relay_state = soup.find("input", {"name": "RelayState"})
-        return saml_response is not None and relay_state is not None
-
-    def _save_cookies(self):
-        logger.info(f"Saving cookies to {self._cookie_path}")
-        if self._cookie_path is not None:
-            self._session.cookie_jar.save(self._cookie_path)
-
-    # WARNING: Only use self._ensure_authenticated() to authenticate,
-    # don't call self._authenticate() itself.
-    async def _authenticate(self):
-        async with self._lock.write():
-            # Equivalent: Click on "Mit KIT-Account anmelden" button in
-            # https://ilias.studium.kit.edu/login.php
-            url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
-            data = {
-                "sendLogin": "1",
-                "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
-                "target": "/shib_login.php",
-                "home_organization_selection": "Mit KIT-Account anmelden",
-            }
-            logger.debug("Begin authentication process with ILIAS")
-            url, text = await self._post(url, data=data)
-            soup = bs4.BeautifulSoup(text, "html.parser")
-
-            # Attempt to login using credentials, if necessary
-            while not self._login_successful(soup):
-                form = soup.find("form", {"class": "form2", "method": "post"})
-                action = form["action"]
-
-                print("Please enter Shibboleth credentials.")
-                username = getpass.getpass(prompt="Username: ")
-                password = getpass.getpass(prompt="Password: ")
-
-                # Equivalent: Enter credentials in
-                # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
-                url = "https://idp.scc.kit.edu" + action
-                data = {
-                    "_eventId_proceed": "",
-                    "j_username": username,
-                    "j_password": password,
-                }
-                logger.debug("Attempt to log in to Shibboleth using credentials")
-                url, text = await self._post(url, data=data)
-                soup = bs4.BeautifulSoup(text, "html.parser")
-
-                if not self._login_successful(soup):
-                    print("Incorrect credentials.")
-
-            # Saving progress: Successfully authenticated with Shibboleth
-            self._save_cookies()
-
-            relay_state = soup.find("input", {"name": "RelayState"})["value"]
-            saml_response = soup.find("input", {"name": "SAMLResponse"})["value"]
-
-            # Equivalent: Being redirected via JS automatically
-            # (or clicking "Continue" if you have JS disabled)
-            url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
-            data = {
-                "RelayState": relay_state,
-                "SAMLResponse": saml_response,
-            }
-            logger.debug("Redirect back to ILIAS with login information")
-            url, text = await self._post(url, data=data)
-
-            # Saving progress: Successfully authenticated with Ilias
-            self._save_cookies()
-
-    async def _ensure_authenticated(self):
-        if self._event is None:
-            self._event = asyncio.Event()
-            logger.info("Not logged in, authentication required.")
-            await self._authenticate()
-            self._event.set()
-            self._event = None
-        else:
-            await self._event.wait()
-
-    def _is_logged_in(self, soup):
-        userlog = soup.find("li", {"id": "userlog"})
-        return userlog is not None
-
-    async def get_webpage_refid(self, ref_id):
-        return await self.get_webpage(f"fold_{ref_id}")
-
-    async def get_webpage(self, object_id):
-        params = {"target": object_id}
-
-        while True:
-            async with self._lock.read():
-                logger.debug(f"Getting {self.ILIAS_GOTO} {params}")
-                _, text = await self._get(self.ILIAS_GOTO, params=params)
-            soup = bs4.BeautifulSoup(text, "html.parser")
-
-            if self._is_logged_in(soup):
-                return soup
-            else:
-                await self._ensure_authenticated()
-
-    async def _download(self, url, params, to_path):
-        for t in range(self.RETRY_ATTEMPTS):
-            try:
-                async with self._session.get(url, params=params) as resp:
-                    if resp.content_type in self.ALLOWED_CONTENT_TYPES:
-                        # Yay, we got the file (as long as it's a PDF)
-                        await utils.stream_to_path(resp, to_path)
-                        return True
-                    elif resp.content_type == "text/html":
-                        # Dangit, we're probably not logged in.
-                        text = await resp.text()
-                        soup = bs4.BeautifulSoup(text, "html.parser")
-                        if self._is_logged_in(soup):
-                            raise utils.UnknownFileTypeException(f"Attempting to download a web page (use get_webpage() instead).")
-                        return False
-                    else:
-                        # What *did* we get?
-                        raise utils.UnknownFileTypeException(f"Unknown file of type {resp.content_type}.")
-            except aiohttp.client_exceptions.ServerDisconnectedError:
-                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
-                await asyncio.sleep(self.RETRY_DELAY)
-        logger.error(f"Could not download {url} params:{params}.")
-        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
-
-    async def download_file(self, file_id, to_path):
-        params = {"target": file_id}
-
-        while True:
-            async with self._lock.read():
-                success = await self._download(self.ILIAS_GOTO, params, to_path)
-
-            if success:
-                return
-            else:
-                await self._ensure_authenticated()
+    def __init__(self, cookie_file) -> None:
+        # Because LWPCookieJar insists on the path being str-like instead of
+        # Path-like.
+        cookie_file = str(cookie_file)
+
+        cookies = http.cookiejar.LWPCookieJar(cookie_file)
+        try:
+            logger.info(f"Loading old cookies from {cookie_file!r}")
+            cookies.load(ignore_discard=True)
+        except (FileNotFoundError, http.cookiejar.LoadError):
+            logger.warn(f"No (valid) cookie file found at {cookie_file!r}, ignoring...")
+
+        self._session = requests.Session()
+        self._session.cookies = cookies
+
+    def _authenticate(self):
+        """
+        Performs the ILIAS Shibboleth authentication dance and saves the login
+        cookies it receives.
+
+        This function should only be called whenever it is detected that you're
+        not logged in. The cookies obtained should be good for a few minutes,
+        maybe even an hour or two.
+        """
+
+        # Equivalent: Click on "Mit KIT-Account anmelden" button in
+        # https://ilias.studium.kit.edu/login.php
+        logger.debug("Begin authentication process with ILIAS")
+        url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
+        data = {
+            "sendLogin": "1",
+            "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
+            "target": "/shib_login.php",
+            "home_organization_selection": "Mit KIT-Account anmelden",
+        }
+        response = self._session.post(url, data=data)
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
+
+        # Attempt to login using credentials, if necessary
+        while not self._login_successful(soup):
+            # Searching the form here so that this fails before asking for
+            # credentials rather than after asking.
+            form = soup.find("form", {"class": "form2", "method": "post"})
+            action = form["action"]
+
+            print("Please enter Shibboleth credentials.")
+            username = getpass.getpass(prompt="Username: ")
+            password = getpass.getpass(prompt="Password: ")
+
+            # Equivalent: Enter credentials in
+            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
+            logger.debug("Attempt to log in to Shibboleth using credentials")
+            url = "https://idp.scc.kit.edu" + action
+            data = {
+                "_eventId_proceed": "",
+                "j_username": username,
+                "j_password": password,
+            }
+            response = self._session.post(url, data=data)
+            soup = bs4.BeautifulSoup(response.text, "html.parser")
+
+            if not self._login_successful(soup):
+                print("Incorrect credentials.")
+
+        # Saving progress
+        logger.info("Saving cookies (successfully authenticated with Shibboleth)")
+        self._session.cookies.save(ignore_discard=True)
+
+        # Equivalent: Being redirected via JS automatically
+        # (or clicking "Continue" if you have JS disabled)
+        logger.debug("Redirect back to ILIAS with login information")
+        relay_state = soup.find("input", {"name": "RelayState"})
+        saml_response = soup.find("input", {"name": "SAMLResponse"})
+        url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
+        data = {  # using the info obtained in the while loop above
+            "RelayState": relay_state["value"],
+            "SAMLResponse": saml_response["value"],
+        }
+        self._session.post(url, data=data)
+
+        # Saving progress
+        logger.info("Saving cookies (successfully authenticated with ILIAS)")
+        self._session.cookies.save(ignore_discard=True)
+
+    def _login_successful(self, soup):
+        relay_state = soup.find("input", {"name": "RelayState"})
+        saml_response = soup.find("input", {"name": "SAMLResponse"})
+        return relay_state is not None and saml_response is not None
+
+    def _is_logged_in(self, soup):
+        userlog = soup.find("li", {"id": "userlog"})
+        return userlog is not None
+
+    def get_webpage(self, object_id):
+        params = {"target": object_id}
+
+        while True:
+            logger.debug(f"Getting {self.ILIAS_GOTO} {params}")
+            response = self._session.get(self.ILIAS_GOTO, params=params)
+            soup = bs4.BeautifulSoup(response.text, "html.parser")
+
+            if self._is_logged_in(soup):
+                return soup
+            else:
+                logger.info("Not logged in, authenticating...")
+                self._authenticate()
+
+    def get_webpage_by_refid(self, ref_id):
+        return self.get_webpage(f"fold_{ref_id}")
+
+    def _download(self, url, params, to_path):
+        with self._session.get(url, params=params, stream=True) as response:
+            content_type = response.headers["content-type"]
+
+            if content_type in self.ALLOWED_CONTENT_TYPES:
+                # Yay, we got the file :)
+                stream_to_path(response, to_path)
+                return True
+            elif content_type == "text/html":
+                # Dangit, we're probably not logged in.
+                soup = bs4.BeautifulSoup(response.text, "html.parser")
+                if self._is_logged_in(soup):
+                    raise ContentTypeException(
+                        "Attempting to download a web page, not a file")
+                return False
+            else:
+                # What *did* we get?
+                raise ContentTypeException(
+                    f"Unknown file of type {content_type}")
+
+    def download_file(self, file_id, to_path):
+        params = {"target": file_id}
+
+        while True:
+            success = self._download(self.ILIAS_GOTO, params, to_path)
+
+            if success:
+                return
+            else:
+                logger.info("Not logged in, authenticating...")
+                self._authenticate()
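
The cookie handling above is the heart of the port: requests accepts any cookielib-compatible jar, so an LWPCookieJar can back the Session and be written to disk. ignore_discard=True matters because the ILIAS login cookies are session cookies, which save() and load() would otherwise skip. A self-contained sketch of just that pattern (file name and URL are examples):

import http.cookiejar

import requests

jar = http.cookiejar.LWPCookieJar("cookies.txt")  # example path
try:
    jar.load(ignore_discard=True)  # include session cookies
except (FileNotFoundError, http.cookiejar.LoadError):
    pass  # first run or unreadable file: start with an empty jar

session = requests.Session()
session.cookies = jar  # requests reads from and writes into this jar

session.get("https://example.com")  # any request may set cookies
jar.save(ignore_discard=True)       # persist them, session cookies too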

utils.py

@@ -6,8 +6,7 @@ __all__ = [
     "move",
     "rename",
     "stream_to_path",
-    "OutOfTriesException",
-    "UnknownFileTypeException",
+    "ContentTypeException",
     "FileNotFoundException",
 ]
@@ -22,18 +21,12 @@
 def rename(path, to_name):
     return pathlib.PurePath(*path.parts[:-1], to_name)
 
-async def stream_to_path(resp, to_path, chunk_size=1024**2):
+def stream_to_path(response, to_path, chunk_size=1024**2):
     with open(to_path, 'wb') as fd:
-        while True:
-            chunk = await resp.content.read(chunk_size)
-            if not chunk:
-                break
-            fd.write(chunk)
+        for chunk in response.iter_content(chunk_size=chunk_size):
+            fd.write(chunk)
 
-class OutOfTriesException(Exception):
-    pass
-
-class UnknownFileTypeException(Exception):
+class ContentTypeException(Exception):
     pass
 
 class FileNotFoundException(Exception):
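
The new stream_to_path assumes the response was requested with stream=True, as _download above does; iter_content then pulls the body off the socket in chunks instead of buffering it whole. A short usage sketch with an example URL:

import requests

with requests.get("https://example.com/big.pdf", stream=True) as response:
    # stream=True defers the body; iter_content reads it chunk by chunk,
    # so a large file never has to fit into memory at once.
    with open("big.pdf", "wb") as fd:
        for chunk in response.iter_content(chunk_size=1024**2):
            fd.write(chunk)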