commit 9bae030186 (parent 0e536f476a)

    Move ilias stuff from aiohttp to requests
--- a/__init__.py
+++ b/__init__.py
@@ -1,14 +1,12 @@
-from .ffm import *
+#from .ffm import *
 from .ilias import *
-from .norbert import *
+#from .norbert import *
 from .utils import *
 
-__all__ = (
-    ffm.__all__ +
-    ilias.__all__ +
-    norbert.__all__ +
-    utils.__all__ +
-    []
-)
+__all__ = []
+#__all__ += ffm.__all__
+__all__ += ilias.__all__
+#__all__ += norbert.__all__
+__all__ += utils.__all__
 
 LOG_FORMAT = "[%(levelname)s] %(message)s"
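A side note on the re-export pattern above: in an __init__.py, a star-import such as "from .ilias import *" also binds the submodule name itself (ilias) in the package namespace, which is what lets the new code read ilias.__all__ without a separate import statement. A minimal sketch of the incremental pattern, using a hypothetical package pkg with a module mod:

    # pkg/mod.py (hypothetical)
    __all__ = ["greet"]

    def greet():
        return "hello"

    # pkg/__init__.py (hypothetical), mirroring the diff above
    from .mod import *  # as a side effect, also binds the submodule name "mod"

    __all__ = []
    __all__ += mod.__all__  # re-export exactly what mod declares public

Compared to the old parenthesized sum, the += form makes it trivial to comment out a single synchronizer (as done with ffm and norbert above) without breaking the expression.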
--- a/ilias.py
+++ b/ilias.py
@@ -1,19 +1,15 @@
 # ILIAS
 
-import aiohttp
-import asyncio
-import bs4
 import logging
 import pathlib
 import re
 
-from .organizer import Organizer
-from .ilias_authenticators import ShibbolethAuthenticator
-from . import utils
+import bs4
 
-__all__ = [
-    "ILIAS",
-]
+from .ilias_authenticators import ShibbolethAuthenticator
+from .organizer import Organizer
+
+__all__ = ["ILIAS"]
 
 logger = logging.getLogger(__name__)
 
 class ILIAS:
@@ -25,7 +21,7 @@ class ILIAS:
 
         self._auth = ShibbolethAuthenticator(base_path / cookie_file)
 
-    async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
+    def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
         logging.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.")
 
         sync_path = pathlib.Path(self.base_path, to_dir)
@@ -33,17 +29,14 @@ class ILIAS:
 
         orga.clean_temp_dir()
 
-        files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter)
-        await self._download(orga, files, transform)
+        files = self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter)
+        self._download(orga, files, transform)
 
         orga.clean_sync_dir()
         orga.clean_temp_dir()
 
-    async def close(self):
-        await self._auth.close()
-
-    async def _crawl(self, dir_path, dir_id, filter_):
-        soup = await self._auth.get_webpage(dir_id)
+    def _crawl(self, dir_path, dir_id, filter_):
+        soup = self._auth.get_webpage(dir_id)
 
         found_files = []
 
@@ -59,19 +52,19 @@ class ILIAS:
             logger.debug(f"Found dir {path}")
             if filter_(path):
                 logger.info(f"Searching {path}")
-                files = await self._crawl(path, ref_id, filter_)
+                files = self._crawl(path, ref_id, filter_)
                 found_files.extend(files)
             else:
                 logger.info(f"Not searching {path}")
 
         return found_files
 
-    async def _download(self, orga, files, transform):
+    def _download(self, orga, files, transform):
         for (path, file_id) in sorted(files):
             to_path = transform(path)
             if to_path is not None:
                 temp_path = orga.temp_file()
-                await self._auth.download_file(file_id, temp_path)
+                self._auth.download_file(file_id, temp_path)
                 orga.add_file(temp_path, to_path)
 
     def _find_files(self, soup):
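With synchronize(), _crawl() and _download() now plain synchronous methods, a caller no longer needs an event loop or an explicit close(). A hypothetical usage sketch; the constructor signature is inferred from the "self._auth = ShibbolethAuthenticator(base_path / cookie_file)" line above, and the import path, ref_id and directory names are invented:

    import pathlib

    from PFERD.ilias import ILIAS  # import path assumed

    ilias = ILIAS(pathlib.Path("."), "ilias_cookies.txt")
    # Before this commit: wrap the call in a coroutine, run it on an event
    # loop, then await ilias.close(). Now it is a single blocking call:
    ilias.synchronize("1234567", "MyCourse")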
--- a/ilias_authenticators.py
+++ b/ilias_authenticators.py
@@ -7,28 +7,21 @@
 # I think the only other method is the password prompt when clicking the log in
 # button.
 
-import aiohttp
-import asyncio
-import bs4
 import getpass
+import http.cookiejar
 import logging
 import time
-import urllib.parse
 
-from .read_write_lock import ReadWriteLock
-from . import utils
+import bs4
+import requests
 
-__all__ = [
-    "ShibbolethAuthenticator",
-]
+from .utils import ContentTypeException, stream_to_path
+
+__all__ = ["ShibbolethAuthenticator"]
 
 logger = logging.getLogger(__name__)
 
 class ShibbolethAuthenticator:
 
     ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php"
 
-    RETRY_ATTEMPTS = 5
-    RETRY_DELAY = 1 # seconds
     CHUNK_SIZE = 1024**2
 
     ALLOWED_CONTENT_TYPES = [
@@ -41,71 +34,34 @@ class ShibbolethAuthenticator:
         "image/png",
     ]
 
-    def __init__(self, cookie_path=None):
-        self._cookie_path = cookie_path
-
-        # Authentication and file/page download should not happen at the same time.
-        # Authenticating counts as writing, file/page downloads as reading.
-        self._lock = ReadWriteLock()
-
-        # Only one self._authenticate() should be started, even if multiple self.get_page()s
-        # notice they're logged in.
-        # If self._event is not None, authenticating is currently in progress.
-        self._event = None
+    def __init__(self, cookie_file) -> None:
+        # Because LWPCookieJar insists on the path being str-like instead of
+        # Path-like.
+        cookie_file = str(cookie_file)
 
-        jar = aiohttp.CookieJar()
-        if self._cookie_path is not None:
-            try:
-                jar.load(self._cookie_path)
-            except FileNotFoundError:
-                pass
-        self._session = aiohttp.ClientSession(cookie_jar=jar)
+        cookies = http.cookiejar.LWPCookieJar(cookie_file)
+        try:
+            logger.info(f"Loading old cookies from {cookie_file!r}")
+            cookies.load(ignore_discard=True)
+        except (FileNotFoundError, http.cookiejar.LoadError):
+            logger.warn(f"No (valid) cookie file found at {cookie_file!r}, ignoring...")
 
-    async def close(self):
-        await self._session.close()
+        self._session = requests.Session()
+        self._session.cookies = cookies
 
-    async def _post(self, url, params=None, data=None):
-        for t in range(self.RETRY_ATTEMPTS):
-            try:
-                async with self._session.post(url, params=params, data=data) as resp:
-                    text = await resp.text()
-                    return resp.url, text
-            except aiohttp.client_exceptions.ServerDisconnectedError:
-                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
-                await asyncio.sleep(self.RETRY_DELAY)
+    def _authenticate(self):
+        """
+        Performs the ILIAS Shibboleth authentication dance and saves the login
+        cookies it receives.
 
-        logger.error(f"Could not POST {url} params:{params} data:{data}.")
-        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
+        This function should only be called whenever it is detected that you're
+        not logged in. The cookies obtained should be good for a few minutes,
+        maybe even an hour or two.
+        """
 
-    async def _get(self, url, params=None):
-        for t in range(self.RETRY_ATTEMPTS):
-            try:
-                async with self._session.get(url, params=params) as resp:
-                    text = await resp.text()
-                    return resp.url, text
-            except aiohttp.client_exceptions.ServerDisconnectedError:
-                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
-                await asyncio.sleep(self.RETRY_DELAY)
-
-        logger.error(f"Could not GET {url} params:{params}.")
-        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
-
-    def _login_successful(self, soup):
-        saml_response = soup.find("input", {"name": "SAMLResponse"})
-        relay_state = soup.find("input", {"name": "RelayState"})
-        return saml_response is not None and relay_state is not None
-
-    def _save_cookies(self):
-        logger.info(f"Saving cookies to {self._cookie_path}")
-        if self._cookie_path is not None:
-            self._session.cookie_jar.save(self._cookie_path)
-
-    # WARNING: Only use self._ensure_authenticated() to authenticate,
-    # don't call self._authenticate() itself.
-    async def _authenticate(self):
-        async with self._lock.write():
-            # Equivalent: Click on "Mit KIT-Account anmelden" button in
-            # https://ilias.studium.kit.edu/login.php
-            url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
-            data = {
-                "sendLogin": "1",
+        # Equivalent: Click on "Mit KIT-Account anmelden" button in
+        # https://ilias.studium.kit.edu/login.php
+        logger.debug("Begin authentication process with ILIAS")
+        url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
+        data = {
+            "sendLogin": "1",
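The constructor now delegates cookie persistence to the standard library: an http.cookiejar.LWPCookieJar attached to a requests.Session, loaded on startup and saved again after each successful authentication step (see the hunks below). The same load/attach/save cycle in isolation, with a hypothetical file name; requests accepts any cookielib-compatible jar on a Session:

    import http.cookiejar

    import requests

    jar = http.cookiejar.LWPCookieJar("cookies.txt")  # wants a str path, not a Path
    try:
        jar.load(ignore_discard=True)  # keep session-only cookies too
    except (FileNotFoundError, http.cookiejar.LoadError):
        pass  # first run, or an unreadable file: start with an empty jar

    session = requests.Session()
    session.cookies = jar  # all requests through this session now use the jar

    # ... session.get(...) / session.post(...) as in the methods above ...

    jar.save(ignore_discard=True)  # persist cookies for the next run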
@@ -113,12 +69,13 @@ class ShibbolethAuthenticator:
             "target": "/shib_login.php",
             "home_organization_selection": "Mit KIT-Account anmelden",
         }
-        logger.debug("Begin authentication process with ILIAS")
-        url, text = await self._post(url, data=data)
-        soup = bs4.BeautifulSoup(text, "html.parser")
+        response = self._session.post(url, data=data)
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
 
         # Attempt to login using credentials, if necessary
         while not self._login_successful(soup):
+            # Searching the form here so that this fails before asking for
+            # credentials rather than after asking.
             form = soup.find("form", {"class": "form2", "method": "post"})
             action = form["action"]
 
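For orientation: _login_successful(), relocated further down in this file, treats the presence of the hidden RelayState and SAMLResponse inputs as the signal that the identity provider has finished. A self-contained check along those lines; the HTML snippet is invented for illustration:

    import bs4

    html = """
    <form method="post" action="/Shibboleth.sso/SAML2/POST">
      <input type="hidden" name="RelayState" value="...">
      <input type="hidden" name="SAMLResponse" value="...">
    </form>
    """
    soup = bs4.BeautifulSoup(html, "html.parser")
    relay_state = soup.find("input", {"name": "RelayState"})
    saml_response = soup.find("input", {"name": "SAMLResponse"})
    print(relay_state is not None and saml_response is not None)  # True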
@@ -128,103 +85,93 @@ class ShibbolethAuthenticator:
 
             # Equivalent: Enter credentials in
             # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
+            logger.debug("Attempt to log in to Shibboleth using credentials")
             url = "https://idp.scc.kit.edu" + action
             data = {
                 "_eventId_proceed": "",
                 "j_username": username,
                 "j_password": password,
             }
-            logger.debug("Attempt to log in to Shibboleth using credentials")
-            url, text = await self._post(url, data=data)
-            soup = bs4.BeautifulSoup(text, "html.parser")
+            response = self._session.post(url, data=data)
+            soup = bs4.BeautifulSoup(response.text, "html.parser")
 
             if not self._login_successful(soup):
                 print("Incorrect credentials.")
 
-        # Saving progress: Successfully authenticated with Shibboleth
-        self._save_cookies()
+        # Saving progress
+        logger.info("Saving cookies (successfully authenticated with Shibboleth)")
+        self._session.cookies.save(ignore_discard=True)
 
-        relay_state = soup.find("input", {"name": "RelayState"})["value"]
-        saml_response = soup.find("input", {"name": "SAMLResponse"})["value"]
-
         # Equivalent: Being redirected via JS automatically
         # (or clicking "Continue" if you have JS disabled)
-        url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
-        data = {
-            "RelayState": relay_state,
-            "SAMLResponse": saml_response,
-        }
         logger.debug("Redirect back to ILIAS with login information")
-        url, text = await self._post(url, data=data)
+        relay_state = soup.find("input", {"name": "RelayState"})
+        saml_response = soup.find("input", {"name": "SAMLResponse"})
+        url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
+        data = { # using the info obtained in the while loop above
+            "RelayState": relay_state["value"],
+            "SAMLResponse": saml_response["value"],
+        }
+        self._session.post(url, data=data)
 
-        # Saving progress: Successfully authenticated with Ilias
-        self._save_cookies()
+        # Saving progress
+        logger.info("Saving cookies (successfully authenticated with ILIAS)")
+        self._session.cookies.save(ignore_discard=True)
 
-    async def _ensure_authenticated(self):
-        if self._event is None:
-            self._event = asyncio.Event()
-            logger.info("Not logged in, authentication required.")
-            await self._authenticate()
-            self._event.set()
-            self._event = None
-        else:
-            await self._event.wait()
+    def _login_successful(self, soup):
+        relay_state = soup.find("input", {"name": "RelayState"})
+        saml_response = soup.find("input", {"name": "SAMLResponse"})
+        return relay_state is not None and saml_response is not None
 
     def _is_logged_in(self, soup):
         userlog = soup.find("li", {"id": "userlog"})
         return userlog is not None
 
-    async def get_webpage_refid(self, ref_id):
-        return await self.get_webpage(f"fold_{ref_id}")
-
-    async def get_webpage(self, object_id):
+    def get_webpage(self, object_id):
         params = {"target": object_id}
 
         while True:
-            async with self._lock.read():
-                logger.debug(f"Getting {self.ILIAS_GOTO} {params}")
-                _, text = await self._get(self.ILIAS_GOTO, params=params)
-                soup = bs4.BeautifulSoup(text, "html.parser")
+            logger.debug(f"Getting {self.ILIAS_GOTO} {params}")
+            response = self._session.get(self.ILIAS_GOTO, params=params)
+            soup = bs4.BeautifulSoup(response.text, "html.parser")
 
             if self._is_logged_in(soup):
                 return soup
             else:
-                await self._ensure_authenticated()
+                logger.info("Not logged in, authenticating...")
+                self._authenticate()
 
-    async def _download(self, url, params, to_path):
-        for t in range(self.RETRY_ATTEMPTS):
-            try:
-                async with self._session.get(url, params=params) as resp:
-                    if resp.content_type in self.ALLOWED_CONTENT_TYPES:
-                        # Yay, we got the file (as long as it's a PDF)
-                        await utils.stream_to_path(resp, to_path)
-                        return True
-                    elif resp.content_type == "text/html":
-                        # Dangit, we're probably not logged in.
-                        text = await resp.text()
-                        soup = bs4.BeautifulSoup(text, "html.parser")
-                        if self._is_logged_in(soup):
-                            raise utils.UnknownFileTypeException(f"Attempting to download a web page (use get_webpage() instead).")
-                        return False
-                    else:
-                        # What *did* we get?
-                        raise utils.UnknownFileTypeException(f"Unknown file of type {resp.content_type}.")
-            except aiohttp.client_exceptions.ServerDisconnectedError:
-                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
-                await asyncio.sleep(self.RETRY_DELAY)
+    def get_webpage_by_refid(self, ref_id):
+        return self.get_webpage(f"fold_{ref_id}")
 
-        logger.error(f"Could not download {url} params:{params}.")
-        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
+    def _download(self, url, params, to_path):
+        with self._session.get(url, params=params, stream=True) as response:
+            content_type = response.headers["content-type"]
 
-    async def download_file(self, file_id, to_path):
+            if content_type in self.ALLOWED_CONTENT_TYPES:
+                # Yay, we got the file :)
+                stream_to_path(response, to_path)
+                return True
+            elif content_type == "text/html":
+                # Dangit, we're probably not logged in.
+                soup = bs4.BeautifulSoup(response.text, "html.parser")
+                if self._is_logged_in(soup):
+                    raise ContentTypeException(
+                        "Attempting to download a web page, not a file")
+                return False
+            else:
+                # What *did* we get?
+                raise ContentTypeException(
+                    f"Unknown file of type {content_type}")
+
+    def download_file(self, file_id, to_path):
         params = {"target": file_id}
 
         while True:
-            async with self._lock.read():
-                success = await self._download(self.ILIAS_GOTO, params, to_path)
+            success = self._download(self.ILIAS_GOTO, params, to_path)
 
             if success:
                 return
             else:
-                await self._ensure_authenticated()
+                logger.info("Not logged in, authenticating...")
+                self._authenticate()
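The rewritten _download() pairs stream=True with chunked iteration so a large file never sits in memory, and the with-block returns the connection to the pool when done. A minimal standalone sketch of the same pattern; the function name and URL are hypothetical:

    import requests

    def fetch_to_file(session: requests.Session, url: str, to_path: str) -> None:
        # stream=True fetches the headers now and the body lazily
        with session.get(url, stream=True) as response:
            response.raise_for_status()
            with open(to_path, "wb") as fd:
                for chunk in response.iter_content(chunk_size=1024**2):
                    fd.write(chunk)

    fetch_to_file(requests.Session(), "https://example.com/file.pdf", "file.pdf")

This also explains why the retry machinery could be dropped: requests handles connection reuse and reconnection internally, so the ServerDisconnectedError loop from aiohttp has no direct equivalent here.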
--- a/utils.py
+++ b/utils.py
@@ -6,8 +6,7 @@ __all__ = [
     "move",
     "rename",
     "stream_to_path",
-    "OutOfTriesException",
-    "UnknownFileTypeException",
+    "ContentTypeException",
     "FileNotFoundException",
 ]
 
@@ -22,18 +21,12 @@ def move(path, from_folders, to_folders):
 def rename(path, to_name):
     return pathlib.PurePath(*path.parts[:-1], to_name)
 
-async def stream_to_path(resp, to_path, chunk_size=1024**2):
+def stream_to_path(response, to_path, chunk_size=1024**2):
     with open(to_path, 'wb') as fd:
-        while True:
-            chunk = await resp.content.read(chunk_size)
-            if not chunk:
-                break
+        for chunk in response.iter_content(chunk_size=chunk_size):
             fd.write(chunk)
 
-class OutOfTriesException(Exception):
-    pass
-
-class UnknownFileTypeException(Exception):
+class ContentTypeException(Exception):
     pass
 
 class FileNotFoundException(Exception):