From 5a1bf2188b598e95d03029b12397fc92ad31f6a6 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 24 Apr 2019 12:34:20 +0000 Subject: [PATCH] Switch from tabs to spaces --- PFERD/__init__.py | 10 +- PFERD/ffm.py | 96 +++++----- PFERD/ilias.py | 150 +++++++-------- PFERD/ilias_authenticators.py | 338 +++++++++++++++++----------------- PFERD/norbert.py | 164 ++++++++--------- PFERD/organizer.py | 208 ++++++++++----------- PFERD/utils.py | 42 ++--- 7 files changed, 504 insertions(+), 504 deletions(-) diff --git a/PFERD/__init__.py b/PFERD/__init__.py index 3937db4..13c9f2b 100644 --- a/PFERD/__init__.py +++ b/PFERD/__init__.py @@ -4,11 +4,11 @@ from .norbert import * from .utils import * __all__ = ( - ffm.__all__ + - ilias.__all__ + - norbert.__all__ + - utils.__all__ + - [] + ffm.__all__ + + ilias.__all__ + + norbert.__all__ + + utils.__all__ + + [] ) LOG_FORMAT = "[%(levelname)s] %(message)s" diff --git a/PFERD/ffm.py b/PFERD/ffm.py index 0116176..54890e9 100644 --- a/PFERD/ffm.py +++ b/PFERD/ffm.py @@ -11,69 +11,69 @@ from .organizer import Organizer from . import utils __all__ = [ - "FfM", + "FfM", ] logger = logging.getLogger(__name__) class FfM: - BASE_URL = "http://www.math.kit.edu/" - LINK_RE = re.compile(r"^https?://www.math.kit.edu/.*/(.*\.pdf)$") + BASE_URL = "http://www.math.kit.edu/" + LINK_RE = re.compile(r"^https?://www.math.kit.edu/.*/(.*\.pdf)$") - RETRY_ATTEMPTS = 5 - RETRY_DELAY = 1 # seconds + RETRY_ATTEMPTS = 5 + RETRY_DELAY = 1 # seconds - def __init__(self, base_path): - self.base_path = base_path + def __init__(self, base_path): + self.base_path = base_path - self._session = aiohttp.ClientSession() + self._session = aiohttp.ClientSession() - async def synchronize(self, urlpart, to_dir, transform=lambda x: x): - logging.info(f" Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.") + async def synchronize(self, urlpart, to_dir, transform=lambda x: x): + logging.info(f" Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.") - sync_path = pathlib.Path(self.base_path, to_dir) - orga = Organizer(self.base_path, sync_path) + sync_path = pathlib.Path(self.base_path, to_dir) + orga = Organizer(self.base_path, sync_path) - orga.clean_temp_dir() + orga.clean_temp_dir() - await self._crawl(orga, urlpart, transform) + await self._crawl(orga, urlpart, transform) - orga.clean_sync_dir() - orga.clean_temp_dir() + orga.clean_sync_dir() + orga.clean_temp_dir() - async def close(self): - await self._session.close() + async def close(self): + await self._session.close() - async def _crawl(self, orga, urlpart, transform): - url = self.BASE_URL + urlpart - async with self._session.get(url) as resp: - text = await resp.text() - soup = bs4.BeautifulSoup(text, "html.parser") + async def _crawl(self, orga, urlpart, transform): + url = self.BASE_URL + urlpart + async with self._session.get(url) as resp: + text = await resp.text() + soup = bs4.BeautifulSoup(text, "html.parser") - for found in soup.find_all("a", href=self.LINK_RE): - url = found["href"] - filename = re.match(self.LINK_RE, url).group(1).replace("/", ".") - logger.debug(f"Found file {filename} at {url}") + for found in soup.find_all("a", href=self.LINK_RE): + url = found["href"] + filename = re.match(self.LINK_RE, url).group(1).replace("/", ".") + logger.debug(f"Found file {filename} at {url}") - old_path = pathlib.PurePath(filename) - new_path = transform(old_path) - if new_path is None: - continue - logger.debug(f"Transformed from {old_path} to {new_path}") + old_path = pathlib.PurePath(filename) + new_path = 
transform(old_path) + if new_path is None: + continue + logger.debug(f"Transformed from {old_path} to {new_path}") - temp_path = orga.temp_file() - await self._download(url, temp_path) - orga.add_file(temp_path, new_path) + temp_path = orga.temp_file() + await self._download(url, temp_path) + orga.add_file(temp_path, new_path) - async def _download(self, url, to_path): - for t in range(self.RETRY_ATTEMPTS): - try: - async with self._session.get(url) as resp: - await utils.stream_to_path(resp, to_path) - except aiohttp.client_exceptions.ServerDisconnectedError: - logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") - await asyncio.sleep(self.RETRY_DELAY) - else: - return - else: - logger.error(f"Could not download {url}") - raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") + async def _download(self, url, to_path): + for t in range(self.RETRY_ATTEMPTS): + try: + async with self._session.get(url) as resp: + await utils.stream_to_path(resp, to_path) + except aiohttp.client_exceptions.ServerDisconnectedError: + logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") + await asyncio.sleep(self.RETRY_DELAY) + else: + return + else: + logger.error(f"Could not download {url}") + raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") diff --git a/PFERD/ilias.py b/PFERD/ilias.py index 57551e4..aace379 100644 --- a/PFERD/ilias.py +++ b/PFERD/ilias.py @@ -12,105 +12,105 @@ from .ilias_authenticators import ShibbolethAuthenticator from . import utils __all__ = [ - "ILIAS", + "ILIAS", ] logger = logging.getLogger(__name__) class ILIAS: - FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)") - DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)") + FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)") + DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)") - def __init__(self, base_path, cookie_file): - self.base_path = base_path + def __init__(self, base_path, cookie_file): + self.base_path = base_path - self._auth = ShibbolethAuthenticator(base_path / cookie_file) + self._auth = ShibbolethAuthenticator(base_path / cookie_file) - async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True): - logging.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.") + async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True): + logging.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.") - sync_path = pathlib.Path(self.base_path, to_dir) - orga = Organizer(self.base_path, sync_path) + sync_path = pathlib.Path(self.base_path, to_dir) + orga = Organizer(self.base_path, sync_path) - orga.clean_temp_dir() + orga.clean_temp_dir() - files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter) - await self._download(orga, files, transform) + files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter) + await self._download(orga, files, transform) - orga.clean_sync_dir() - orga.clean_temp_dir() + orga.clean_sync_dir() + orga.clean_temp_dir() - async def close(self): - await self._auth.close() + async def close(self): + await self._auth.close() - async def _crawl(self, dir_path, dir_id, filter_): - soup = await self._auth.get_webpage(dir_id) + async def _crawl(self, dir_path, dir_id, filter_): + soup = await self._auth.get_webpage(dir_id) - found_files = [] + found_files = [] - files = 
self._find_files(soup) - for (name, file_id) in files: - path = dir_path / name - found_files.append((path, file_id)) - logger.debug(f"Found file {path}") + files = self._find_files(soup) + for (name, file_id) in files: + path = dir_path / name + found_files.append((path, file_id)) + logger.debug(f"Found file {path}") - dirs = self._find_dirs(soup) - for (name, ref_id) in dirs: - path = dir_path / name - logger.debug(f"Found dir {path}") - if filter_(path): - logger.info(f"Searching {path}") - files = await self._crawl(path, ref_id, filter_) - found_files.extend(files) - else: - logger.info(f"Not searching {path}") + dirs = self._find_dirs(soup) + for (name, ref_id) in dirs: + path = dir_path / name + logger.debug(f"Found dir {path}") + if filter_(path): + logger.info(f"Searching {path}") + files = await self._crawl(path, ref_id, filter_) + found_files.extend(files) + else: + logger.info(f"Not searching {path}") - return found_files + return found_files - async def _download(self, orga, files, transform): - for (path, file_id) in sorted(files): - to_path = transform(path) - if to_path is not None: - temp_path = orga.temp_file() - await self._auth.download_file(file_id, temp_path) - orga.add_file(temp_path, to_path) + async def _download(self, orga, files, transform): + for (path, file_id) in sorted(files): + to_path = transform(path) + if to_path is not None: + temp_path = orga.temp_file() + await self._auth.download_file(file_id, temp_path) + orga.add_file(temp_path, to_path) - def _find_files(self, soup): - files = [] - file_names = set() + def _find_files(self, soup): + files = [] + file_names = set() - found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.FILE_RE}) - for element in found: - file_stem = element.string.strip().replace("/", ".") - file_type = element.parent.parent.parent.find("div", {"class": "il_ItemProperties"}).find("span").string.strip() - file_id = re.search(self.FILE_RE, element.get("href")).group(1) + found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.FILE_RE}) + for element in found: + file_stem = element.string.strip().replace("/", ".") + file_type = element.parent.parent.parent.find("div", {"class": "il_ItemProperties"}).find("span").string.strip() + file_id = re.search(self.FILE_RE, element.get("href")).group(1) - file_name = f"{file_stem}.{file_type}" - if file_name in file_names: - counter = 1 - while True: - file_name = f"{file_stem} (duplicate {counter}).{file_type}" - if file_name in file_names: - counter += 1 - else: - break + file_name = f"{file_stem}.{file_type}" + if file_name in file_names: + counter = 1 + while True: + file_name = f"{file_stem} (duplicate {counter}).{file_type}" + if file_name in file_names: + counter += 1 + else: + break - files.append((file_name, file_id)) - file_names.add(file_name) + files.append((file_name, file_id)) + file_names.add(file_name) - return files + return files - def _find_dirs(self, soup): - dirs = [] + def _find_dirs(self, soup): + dirs = [] - found = soup.find_all("div", {"class": "alert", "role": "alert"}) - if found: - return [] + found = soup.find_all("div", {"class": "alert", "role": "alert"}) + if found: + return [] - found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.DIR_RE}) - for element in found: - dir_name = element.string.strip().replace("/", ".") - ref_id = re.search(self.DIR_RE, element.get("href")).group(1) - dir_id = f"fold_{ref_id}" - dirs.append((dir_name, dir_id)) + found = soup.find_all("a", {"class": "il_ContainerItemTitle", 
"href": self.DIR_RE}) + for element in found: + dir_name = element.string.strip().replace("/", ".") + ref_id = re.search(self.DIR_RE, element.get("href")).group(1) + dir_id = f"fold_{ref_id}" + dirs.append((dir_name, dir_id)) - return dirs + return dirs diff --git a/PFERD/ilias_authenticators.py b/PFERD/ilias_authenticators.py index 4d51d03..d686c7a 100644 --- a/PFERD/ilias_authenticators.py +++ b/PFERD/ilias_authenticators.py @@ -19,211 +19,211 @@ from .read_write_lock import ReadWriteLock from . import utils __all__ = [ - "ShibbolethAuthenticator", + "ShibbolethAuthenticator", ] logger = logging.getLogger(__name__) class ShibbolethAuthenticator: - ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php" + ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php" - RETRY_ATTEMPTS = 5 - RETRY_DELAY = 1 # seconds - CHUNK_SIZE = 1024**2 + RETRY_ATTEMPTS = 5 + RETRY_DELAY = 1 # seconds + CHUNK_SIZE = 1024**2 - ALLOWED_CONTENT_TYPES = [ - "application/pdf", - "application/zip", - "text/xml", - "text/plain", - "image/jpeg", - "image/png", - ] + ALLOWED_CONTENT_TYPES = [ + "application/pdf", + "application/zip", + "text/xml", + "text/plain", + "image/jpeg", + "image/png", + ] - def __init__(self, cookie_path=None): - self._cookie_path = cookie_path + def __init__(self, cookie_path=None): + self._cookie_path = cookie_path - # Authentication and file/page download should not happen at the same time. - # Authenticating counts as writing, file/page downloads as reading. - self._lock = ReadWriteLock() + # Authentication and file/page download should not happen at the same time. + # Authenticating counts as writing, file/page downloads as reading. + self._lock = ReadWriteLock() - # Only one self._authenticate() should be started, even if multiple self.get_page()s - # notice they're logged in. - # If self._event is not None, authenticating is currently in progress. - self._event = None + # Only one self._authenticate() should be started, even if multiple self.get_page()s + # notice they're logged in. + # If self._event is not None, authenticating is currently in progress. 
+ self._event = None - jar = aiohttp.CookieJar() - if self._cookie_path is not None: - try: - jar.load(self._cookie_path) - except FileNotFoundError: - pass - self._session = aiohttp.ClientSession(cookie_jar=jar) + jar = aiohttp.CookieJar() + if self._cookie_path is not None: + try: + jar.load(self._cookie_path) + except FileNotFoundError: + pass + self._session = aiohttp.ClientSession(cookie_jar=jar) - async def close(self): - await self._session.close() + async def close(self): + await self._session.close() - async def _post(self, url, params=None, data=None): - for t in range(self.RETRY_ATTEMPTS): - try: - async with self._session.post(url, params=params, data=data) as resp: - text = await resp.text() - return resp.url, text - except aiohttp.client_exceptions.ServerDisconnectedError: - logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") - await asyncio.sleep(self.RETRY_DELAY) + async def _post(self, url, params=None, data=None): + for t in range(self.RETRY_ATTEMPTS): + try: + async with self._session.post(url, params=params, data=data) as resp: + text = await resp.text() + return resp.url, text + except aiohttp.client_exceptions.ServerDisconnectedError: + logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") + await asyncio.sleep(self.RETRY_DELAY) - logger.error(f"Could not POST {url} params:{params} data:{data}.") - raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") + logger.error(f"Could not POST {url} params:{params} data:{data}.") + raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") - async def _get(self, url, params=None): - for t in range(self.RETRY_ATTEMPTS): - try: - async with self._session.get(url, params=params) as resp: - text = await resp.text() - return resp.url, text - except aiohttp.client_exceptions.ServerDisconnectedError: - logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") - await asyncio.sleep(self.RETRY_DELAY) + async def _get(self, url, params=None): + for t in range(self.RETRY_ATTEMPTS): + try: + async with self._session.get(url, params=params) as resp: + text = await resp.text() + return resp.url, text + except aiohttp.client_exceptions.ServerDisconnectedError: + logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") + await asyncio.sleep(self.RETRY_DELAY) - logger.error(f"Could not GET {url} params:{params}.") - raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") + logger.error(f"Could not GET {url} params:{params}.") + raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") - def _login_successful(self, soup): - saml_response = soup.find("input", {"name": "SAMLResponse"}) - relay_state = soup.find("input", {"name": "RelayState"}) - return saml_response is not None and relay_state is not None + def _login_successful(self, soup): + saml_response = soup.find("input", {"name": "SAMLResponse"}) + relay_state = soup.find("input", {"name": "RelayState"}) + return saml_response is not None and relay_state is not None - def _save_cookies(self): - logger.info(f"Saving cookies to {self._cookie_path}") - if self._cookie_path is not None: - self._session.cookie_jar.save(self._cookie_path) + def _save_cookies(self): + logger.info(f"Saving cookies to {self._cookie_path}") + if self._cookie_path is not None: + 
self._session.cookie_jar.save(self._cookie_path) - # WARNING: Only use self._ensure_authenticated() to authenticate, - # don't call self._authenticate() itself. - async def _authenticate(self): - async with self._lock.write(): - # Equivalent: Click on "Mit KIT-Account anmelden" button in - # https://ilias.studium.kit.edu/login.php - url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login" - data = { - "sendLogin": "1", - "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", - "target": "/shib_login.php", - "home_organization_selection": "Mit KIT-Account anmelden", - } - logger.debug("Begin authentication process with ILIAS") - url, text = await self._post(url, data=data) - soup = bs4.BeautifulSoup(text, "html.parser") + # WARNING: Only use self._ensure_authenticated() to authenticate, + # don't call self._authenticate() itself. + async def _authenticate(self): + async with self._lock.write(): + # Equivalent: Click on "Mit KIT-Account anmelden" button in + # https://ilias.studium.kit.edu/login.php + url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login" + data = { + "sendLogin": "1", + "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", + "target": "/shib_login.php", + "home_organization_selection": "Mit KIT-Account anmelden", + } + logger.debug("Begin authentication process with ILIAS") + url, text = await self._post(url, data=data) + soup = bs4.BeautifulSoup(text, "html.parser") - # Attempt to login using credentials, if necessary - while not self._login_successful(soup): - form = soup.find("form", {"class": "form2", "method": "post"}) - action = form["action"] + # Attempt to login using credentials, if necessary + while not self._login_successful(soup): + form = soup.find("form", {"class": "form2", "method": "post"}) + action = form["action"] - print("Please enter Shibboleth credentials.") - username = getpass.getpass(prompt="Username: ") - password = getpass.getpass(prompt="Password: ") + print("Please enter Shibboleth credentials.") + username = getpass.getpass(prompt="Username: ") + password = getpass.getpass(prompt="Password: ") - # Equivalent: Enter credentials in - # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO - url = "https://idp.scc.kit.edu" + action - data = { - "_eventId_proceed": "", - "j_username": username, - "j_password": password, - } - logger.debug("Attempt to log in to Shibboleth using credentials") - url, text = await self._post(url, data=data) - soup = bs4.BeautifulSoup(text, "html.parser") + # Equivalent: Enter credentials in + # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO + url = "https://idp.scc.kit.edu" + action + data = { + "_eventId_proceed": "", + "j_username": username, + "j_password": password, + } + logger.debug("Attempt to log in to Shibboleth using credentials") + url, text = await self._post(url, data=data) + soup = bs4.BeautifulSoup(text, "html.parser") - if not self._login_successful(soup): - print("Incorrect credentials.") + if not self._login_successful(soup): + print("Incorrect credentials.") - # Saving progress: Successfully authenticated with Shibboleth - self._save_cookies() + # Saving progress: Successfully authenticated with Shibboleth + self._save_cookies() - relay_state = soup.find("input", {"name": "RelayState"})["value"] - saml_response = soup.find("input", {"name": "SAMLResponse"})["value"] + relay_state = soup.find("input", {"name": "RelayState"})["value"] + saml_response = soup.find("input", {"name": "SAMLResponse"})["value"] - # Equivalent: Being redirected via JS automatically - # (or clicking 
"Continue" if you have JS disabled) - url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST" - data = { - "RelayState": relay_state, - "SAMLResponse": saml_response, - } - logger.debug("Redirect back to ILIAS with login information") - url, text = await self._post(url, data=data) + # Equivalent: Being redirected via JS automatically + # (or clicking "Continue" if you have JS disabled) + url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST" + data = { + "RelayState": relay_state, + "SAMLResponse": saml_response, + } + logger.debug("Redirect back to ILIAS with login information") + url, text = await self._post(url, data=data) - # Saving progress: Successfully authenticated with Ilias - self._save_cookies() + # Saving progress: Successfully authenticated with Ilias + self._save_cookies() - async def _ensure_authenticated(self): - if self._event is None: - self._event = asyncio.Event() - logger.info("Not logged in, authentication required.") - await self._authenticate() - self._event.set() - self._event = None - else: - await self._event.wait() + async def _ensure_authenticated(self): + if self._event is None: + self._event = asyncio.Event() + logger.info("Not logged in, authentication required.") + await self._authenticate() + self._event.set() + self._event = None + else: + await self._event.wait() - def _is_logged_in(self, soup): - userlog = soup.find("li", {"id": "userlog"}) - return userlog is not None + def _is_logged_in(self, soup): + userlog = soup.find("li", {"id": "userlog"}) + return userlog is not None - async def get_webpage_refid(self, ref_id): - return await self.get_webpage(f"fold_{ref_id}") + async def get_webpage_refid(self, ref_id): + return await self.get_webpage(f"fold_{ref_id}") - async def get_webpage(self, object_id): - params = {"target": object_id} + async def get_webpage(self, object_id): + params = {"target": object_id} - while True: - async with self._lock.read(): - logger.debug(f"Getting {self.ILIAS_GOTO} {params}") - _, text = await self._get(self.ILIAS_GOTO, params=params) - soup = bs4.BeautifulSoup(text, "html.parser") + while True: + async with self._lock.read(): + logger.debug(f"Getting {self.ILIAS_GOTO} {params}") + _, text = await self._get(self.ILIAS_GOTO, params=params) + soup = bs4.BeautifulSoup(text, "html.parser") - if self._is_logged_in(soup): - return soup - else: - await self._ensure_authenticated() + if self._is_logged_in(soup): + return soup + else: + await self._ensure_authenticated() - async def _download(self, url, params, to_path): - for t in range(self.RETRY_ATTEMPTS): - try: - async with self._session.get(url, params=params) as resp: - if resp.content_type in self.ALLOWED_CONTENT_TYPES: - # Yay, we got the file (as long as it's a PDF) - await utils.stream_to_path(resp, to_path) - return True - elif resp.content_type == "text/html": - # Dangit, we're probably not logged in. - text = await resp.text() - soup = bs4.BeautifulSoup(text, "html.parser") - if self._is_logged_in(soup): - raise utils.UnknownFileTypeException(f"Attempting to download a web page (use get_webpage() instead).") - return False - else: - # What *did* we get? 
- raise utils.UnknownFileTypeException(f"Unknown file of type {resp.content_type}.") + async def _download(self, url, params, to_path): + for t in range(self.RETRY_ATTEMPTS): + try: + async with self._session.get(url, params=params) as resp: + if resp.content_type in self.ALLOWED_CONTENT_TYPES: + # Yay, we got the file (as long as it's a PDF) + await utils.stream_to_path(resp, to_path) + return True + elif resp.content_type == "text/html": + # Dangit, we're probably not logged in. + text = await resp.text() + soup = bs4.BeautifulSoup(text, "html.parser") + if self._is_logged_in(soup): + raise utils.UnknownFileTypeException(f"Attempting to download a web page (use get_webpage() instead).") + return False + else: + # What *did* we get? + raise utils.UnknownFileTypeException(f"Unknown file of type {resp.content_type}.") - except aiohttp.client_exceptions.ServerDisconnectedError: - logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") - await asyncio.sleep(self.RETRY_DELAY) + except aiohttp.client_exceptions.ServerDisconnectedError: + logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") + await asyncio.sleep(self.RETRY_DELAY) - logger.error(f"Could not download {url} params:{params}.") - raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") + logger.error(f"Could not download {url} params:{params}.") + raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") - async def download_file(self, file_id, to_path): - params = {"target": file_id} + async def download_file(self, file_id, to_path): + params = {"target": file_id} - while True: - async with self._lock.read(): - success = await self._download(self.ILIAS_GOTO, params, to_path) + while True: + async with self._lock.read(): + success = await self._download(self.ILIAS_GOTO, params, to_path) - if success: - return - else: - await self._ensure_authenticated() + if success: + return + else: + await self._ensure_authenticated() diff --git a/PFERD/norbert.py b/PFERD/norbert.py index fa1f566..8e29dd2 100644 --- a/PFERD/norbert.py +++ b/PFERD/norbert.py @@ -12,113 +12,113 @@ from .organizer import Organizer from . 
import utils __all__ = [ - "Norbert", + "Norbert", ] logger = logging.getLogger(__name__) class Norbert: - BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/" - LINK_RE = re.compile(r"^progtut/.*/(.*\.zip)$") + BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/" + LINK_RE = re.compile(r"^progtut/.*/(.*\.zip)$") - RETRY_ATTEMPTS = 5 - RETRY_DELAY = 1 # seconds + RETRY_ATTEMPTS = 5 + RETRY_DELAY = 1 # seconds - def __init__(self, base_path): - self.base_path = base_path + def __init__(self, base_path): + self.base_path = base_path - self._session = aiohttp.ClientSession() + self._session = aiohttp.ClientSession() - async def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True): - logging.info(f" Synchronizing to {to_dir} using the Norbert synchronizer.") + async def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True): + logging.info(f" Synchronizing to {to_dir} using the Norbert synchronizer.") - sync_path = pathlib.Path(self.base_path, to_dir) - orga = Organizer(self.base_path, sync_path) + sync_path = pathlib.Path(self.base_path, to_dir) + orga = Organizer(self.base_path, sync_path) - orga.clean_temp_dir() + orga.clean_temp_dir() - files = await self._crawl() - await self._download(orga, files, transform, unzip) + files = await self._crawl() + await self._download(orga, files, transform, unzip) - orga.clean_sync_dir() - orga.clean_temp_dir() + orga.clean_sync_dir() + orga.clean_temp_dir() - async def close(self): - await self._session.close() + async def close(self): + await self._session.close() - async def _crawl(self): - url = self.BASE_URL - async with self._session.get(url) as resp: - raw = await resp.read() - # replace undecodeable characters with a placeholder - text = raw.decode("utf-8", "replace") - soup = bs4.BeautifulSoup(text, "html.parser") + async def _crawl(self): + url = self.BASE_URL + async with self._session.get(url) as resp: + raw = await resp.read() + # replace undecodeable characters with a placeholder + text = raw.decode("utf-8", "replace") + soup = bs4.BeautifulSoup(text, "html.parser") - files = [] + files = [] - for found in soup.find_all("a", href=self.LINK_RE): - url = found["href"] - full_url = self.BASE_URL + url + for found in soup.find_all("a", href=self.LINK_RE): + url = found["href"] + full_url = self.BASE_URL + url - filename = re.search(self.LINK_RE, url).group(1) - path = pathlib.PurePath(filename) + filename = re.search(self.LINK_RE, url).group(1) + path = pathlib.PurePath(filename) - logger.debug(f"Found zip file {filename} at {full_url}") + logger.debug(f"Found zip file {filename} at {full_url}") - files.append((path, full_url)) + files.append((path, full_url)) - return files + return files - async def _download(self, orga, files, transform, unzip): - for path, url in sorted(files): - # Yes, we want the zip file contents - if unzip(path): - logger.debug(f"Downloading and unzipping {path}") - zip_path = utils.rename(path, path.stem) + async def _download(self, orga, files, transform, unzip): + for path, url in sorted(files): + # Yes, we want the zip file contents + if unzip(path): + logger.debug(f"Downloading and unzipping {path}") + zip_path = utils.rename(path, path.stem) - # Download zip file - temp_file = orga.temp_file() - await self._download_zip(url, temp_file) + # Download zip file + temp_file = orga.temp_file() + await self._download_zip(url, temp_file) - # Search the zip file for files to extract - temp_dir = orga.temp_dir() - with zipfile.ZipFile(temp_file, "r") as zf: - for info in 
zf.infolist(): - # Only interested in the files themselves, the directory - # structure is created automatically by orga.add_file() - if info.is_dir(): - continue + # Search the zip file for files to extract + temp_dir = orga.temp_dir() + with zipfile.ZipFile(temp_file, "r") as zf: + for info in zf.infolist(): + # Only interested in the files themselves, the directory + # structure is created automatically by orga.add_file() + if info.is_dir(): + continue - file_path = zip_path / pathlib.PurePath(info.filename) - logger.debug(f"Found {info.filename} at path {file_path}") + file_path = zip_path / pathlib.PurePath(info.filename) + logger.debug(f"Found {info.filename} at path {file_path}") - new_path = transform(file_path) - if new_path is not None: - # Extract to temp file and add, the usual deal - temp_file = orga.temp_file() - extracted_path = zf.extract(info, temp_dir) - extracted_path = pathlib.Path(extracted_path) - orga.add_file(extracted_path, new_path) + new_path = transform(file_path) + if new_path is not None: + # Extract to temp file and add, the usual deal + temp_file = orga.temp_file() + extracted_path = zf.extract(info, temp_dir) + extracted_path = pathlib.Path(extracted_path) + orga.add_file(extracted_path, new_path) - # No, we only want the zip file itself - else: - logger.debug(f"Only downloading {path}") + # No, we only want the zip file itself + else: + logger.debug(f"Only downloading {path}") - new_path = transform(path) - if new_path is not None: - temp_file = orga.temp_file() - await self._download_zip(url, temp_file) - orga.add_file(temp_file, new_path) + new_path = transform(path) + if new_path is not None: + temp_file = orga.temp_file() + await self._download_zip(url, temp_file) + orga.add_file(temp_file, new_path) - async def _download_zip(self, url, to_path): - for t in range(self.RETRY_ATTEMPTS): - try: - async with self._session.get(url) as resp: - await utils.stream_to_path(resp, to_path) - except aiohttp.client_exceptions.ServerDisconnectedError: - logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") - await asyncio.sleep(self.RETRY_DELAY) - else: - return - else: - logger.error(f"Could not download {url}") - raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") + async def _download_zip(self, url, to_path): + for t in range(self.RETRY_ATTEMPTS): + try: + async with self._session.get(url) as resp: + await utils.stream_to_path(resp, to_path) + except aiohttp.client_exceptions.ServerDisconnectedError: + logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") + await asyncio.sleep(self.RETRY_DELAY) + else: + return + else: + logger.error(f"Could not download {url}") + raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") diff --git a/PFERD/organizer.py b/PFERD/organizer.py index b0845f4..140409e 100644 --- a/PFERD/organizer.py +++ b/PFERD/organizer.py @@ -6,135 +6,135 @@ import shutil from . import utils __all__ = [ - "Organizer", + "Organizer", ] logger = logging.getLogger(__name__) class Organizer: - def __init__(self, base_dir, sync_dir): - """ - base_dir - the .tmp directory will be created here - sync_dir - synced files will be moved here - Both are expected to be concrete pathlib paths. - """ + def __init__(self, base_dir, sync_dir): + """ + base_dir - the .tmp directory will be created here + sync_dir - synced files will be moved here + Both are expected to be concrete pathlib paths. 
+ """ - self._base_dir = base_dir - self._sync_dir = sync_dir + self._base_dir = base_dir + self._sync_dir = sync_dir - self._temp_dir = pathlib.Path(self._base_dir, ".tmp") - self._temp_nr = 0 + self._temp_dir = pathlib.Path(self._base_dir, ".tmp") + self._temp_nr = 0 - # check if base/sync dir exist? + # check if base/sync dir exist? - self._added_files = set() + self._added_files = set() - def clean_temp_dir(self): - if self._temp_dir.exists(): - shutil.rmtree(self._temp_dir) - self._temp_dir.mkdir(exist_ok=True) - logger.debug(f"Cleaned temp dir: {self._temp_dir}") + def clean_temp_dir(self): + if self._temp_dir.exists(): + shutil.rmtree(self._temp_dir) + self._temp_dir.mkdir(exist_ok=True) + logger.debug(f"Cleaned temp dir: {self._temp_dir}") - def temp_dir(self): - nr = self._temp_nr - self._temp_nr += 1 - temp_dir = pathlib.Path(self._temp_dir, f"{nr:08}").resolve() - logger.debug(f"Produced new temp dir: {temp_dir}") - return temp_dir + def temp_dir(self): + nr = self._temp_nr + self._temp_nr += 1 + temp_dir = pathlib.Path(self._temp_dir, f"{nr:08}").resolve() + logger.debug(f"Produced new temp dir: {temp_dir}") + return temp_dir - def temp_file(self): - # generate the path to a new temp file in base_path/.tmp/ - # make sure no two paths are the same - nr = self._temp_nr - self._temp_nr += 1 - temp_file = pathlib.Path(self._temp_dir, f"{nr:08}.tmp").resolve() - logger.debug(f"Produced new temp file: {temp_file}") - return temp_file + def temp_file(self): + # generate the path to a new temp file in base_path/.tmp/ + # make sure no two paths are the same + nr = self._temp_nr + self._temp_nr += 1 + temp_file = pathlib.Path(self._temp_dir, f"{nr:08}.tmp").resolve() + logger.debug(f"Produced new temp file: {temp_file}") + return temp_file - def add_file(self, from_path, to_path): - if not from_path.exists(): - raise utils.FileNotFoundException(f"Could not add file at {from_path}") + def add_file(self, from_path, to_path): + if not from_path.exists(): + raise utils.FileNotFoundException(f"Could not add file at {from_path}") - # check if sync_dir/to_path is inside sync_dir? - to_path = pathlib.Path(self._sync_dir, to_path) + # check if sync_dir/to_path is inside sync_dir? + to_path = pathlib.Path(self._sync_dir, to_path) - if to_path.exists() and to_path.is_dir(): - if self._prompt_yes_no(f"Overwrite folder {to_path} with file?", default=False): - shutil.rmtree(to_path) - else: - logger.warn(f"Could not add file {to_path}") - return + if to_path.exists() and to_path.is_dir(): + if self._prompt_yes_no(f"Overwrite folder {to_path} with file?", default=False): + shutil.rmtree(to_path) + else: + logger.warn(f"Could not add file {to_path}") + return - if to_path.exists(): - if filecmp.cmp(from_path, to_path, shallow=False): - logger.info(f"Ignored {to_path}") + if to_path.exists(): + if filecmp.cmp(from_path, to_path, shallow=False): + logger.info(f"Ignored {to_path}") - # remember path for later reference - self._added_files.add(to_path.resolve()) - logger.debug(f"Added file {to_path.resolve()}") + # remember path for later reference + self._added_files.add(to_path.resolve()) + logger.debug(f"Added file {to_path.resolve()}") - # No further action needed, especially not overwriting symlinks... - return - else: - logger.info(f"Different file at {to_path}") - else: - logger.info(f"New file at {to_path}") + # No further action needed, especially not overwriting symlinks... 
+ return + else: + logger.info(f"Different file at {to_path}") + else: + logger.info(f"New file at {to_path}") - # copy the file from from_path to sync_dir/to_path - # If the file being replaced was a symlink, the link itself is overwritten, - # not the file the link points to. - to_path.parent.mkdir(parents=True, exist_ok=True) - from_path.replace(to_path) - logger.debug(f"Moved {from_path} to {to_path}") + # copy the file from from_path to sync_dir/to_path + # If the file being replaced was a symlink, the link itself is overwritten, + # not the file the link points to. + to_path.parent.mkdir(parents=True, exist_ok=True) + from_path.replace(to_path) + logger.debug(f"Moved {from_path} to {to_path}") - # remember path for later reference, after the new file was written - # This is necessary here because otherwise, resolve() would resolve the symlink too. - self._added_files.add(to_path.resolve()) - logger.debug(f"Added file {to_path.resolve()}") + # remember path for later reference, after the new file was written + # This is necessary here because otherwise, resolve() would resolve the symlink too. + self._added_files.add(to_path.resolve()) + logger.debug(f"Added file {to_path.resolve()}") - def clean_sync_dir(self): - self._clean_dir(self._sync_dir, remove_parent=False) - logger.debug(f"Cleaned sync dir: {self._sync_dir}") + def clean_sync_dir(self): + self._clean_dir(self._sync_dir, remove_parent=False) + logger.debug(f"Cleaned sync dir: {self._sync_dir}") - def _clean_dir(self, path, remove_parent=True): - for child in sorted(path.iterdir()): - logger.debug(f"Looking at {child.resolve()}") - if child.is_dir(): - self._clean_dir(child, remove_parent=True) - elif child.resolve() not in self._added_files: - if self._prompt_yes_no(f"Delete {child}?", default=False): - child.unlink() - logger.debug(f"Deleted {child}") + def _clean_dir(self, path, remove_parent=True): + for child in sorted(path.iterdir()): + logger.debug(f"Looking at {child.resolve()}") + if child.is_dir(): + self._clean_dir(child, remove_parent=True) + elif child.resolve() not in self._added_files: + if self._prompt_yes_no(f"Delete {child}?", default=False): + child.unlink() + logger.debug(f"Deleted {child}") - if remove_parent: - try: - path.rmdir() - except OSError: # directory not empty - pass + if remove_parent: + try: + path.rmdir() + except OSError: # directory not empty + pass - def _prompt_yes_no(self, question, default=None): - if default is True: - prompt = "[Y/n]" - elif default is False: - prompt = "[y/N]" - else: - prompt = "[y/n]" + def _prompt_yes_no(self, question, default=None): + if default is True: + prompt = "[Y/n]" + elif default is False: + prompt = "[y/N]" + else: + prompt = "[y/n]" - text = f"{question} {prompt} " - WRONG_REPLY = "Please reply with 'yes'/'y' or 'no'/'n'." + text = f"{question} {prompt} " + WRONG_REPLY = "Please reply with 'yes'/'y' or 'no'/'n'." 
- while True: - response = input(text).strip().lower() - if response in {"yes", "ye", "y"}: - return True - elif response in {"no", "n"}: - return False - elif response == "": - if default is None: - print(WRONG_REPLY) - else: - return default - else: - print(WRONG_REPLY) + while True: + response = input(text).strip().lower() + if response in {"yes", "ye", "y"}: + return True + elif response in {"no", "n"}: + return False + elif response == "": + if default is None: + print(WRONG_REPLY) + else: + return default + else: + print(WRONG_REPLY) # How to use: # diff --git a/PFERD/utils.py b/PFERD/utils.py index a4088f4..acdfe58 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -2,39 +2,39 @@ import os import pathlib __all__ = [ - "get_base_dir", - "move", - "rename", - "stream_to_path", - "OutOfTriesException", - "UnknownFileTypeException", - "FileNotFoundException", + "get_base_dir", + "move", + "rename", + "stream_to_path", + "OutOfTriesException", + "UnknownFileTypeException", + "FileNotFoundException", ] def get_base_dir(script_file): - return pathlib.Path(os.path.dirname(os.path.abspath(script_file))) + return pathlib.Path(os.path.dirname(os.path.abspath(script_file))) def move(path, from_folders, to_folders): - l = len(from_folders) - if path.parts[:l] == from_folders: - return pathlib.PurePath(*to_folders, *path.parts[l:]) + l = len(from_folders) + if path.parts[:l] == from_folders: + return pathlib.PurePath(*to_folders, *path.parts[l:]) def rename(path, to_name): - return pathlib.PurePath(*path.parts[:-1], to_name) + return pathlib.PurePath(*path.parts[:-1], to_name) async def stream_to_path(resp, to_path, chunk_size=1024**2): - with open(to_path, 'wb') as fd: - while True: - chunk = await resp.content.read(chunk_size) - if not chunk: - break - fd.write(chunk) + with open(to_path, 'wb') as fd: + while True: + chunk = await resp.content.read(chunk_size) + if not chunk: + break + fd.write(chunk) class OutOfTriesException(Exception): - pass + pass class UnknownFileTypeException(Exception): - pass + pass class FileNotFoundException(Exception): - pass + pass
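
For reference, a minimal usage sketch of the ILIAS synchronizer whose code is reindented above, based only on the signatures visible in this patch (ILIAS(base_path, cookie_file), synchronize(ref_id, to_dir, transform, filter), close(), and utils.get_base_dir); the ref_id, target directory and cookie file name are placeholders, not values from this repository:

# Hypothetical driver script; "1234567", "Analysis" and ".ilias_cookies.txt"
# are made-up placeholder values for illustration only.
import asyncio
import PFERD

base_dir = PFERD.get_base_dir(__file__)

async def main():
    ilias = PFERD.ILIAS(base_dir, ".ilias_cookies.txt")
    try:
        # Crawl fold_1234567 and mirror it into <base_dir>/Analysis
        await ilias.synchronize("1234567", "Analysis")
    finally:
        await ilias.close()

asyncio.get_event_loop().run_until_complete(main())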