From 2034c9d426bdf0ac08e71d70989351cf5eb7770d Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 24 Nov 2018 08:27:33 +0000 Subject: [PATCH] =?UTF-8?q?Add=20FfM=20(Fachschaft=20f=C3=BCr=20Mathematik?= =?UTF-8?q?)=20synchronizer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit moves exceptions and some other things into utils.py and renames files according to python's file naming guides (kinda). It also adds a new example config using the new FfM downloader. --- PFERD/__init__.py | 11 ++- PFERD/ffm.py | 79 +++++++++++++++++++ ...henticators.py => ilias_authenticators.py} | 31 ++------ PFERD/organizer.py | 10 +-- .../{ReadWriteLock.py => read_write_lock.py} | 0 PFERD/utils.py | 29 +++++++ example_config.py | 29 +++++++ 7 files changed, 156 insertions(+), 33 deletions(-) create mode 100644 PFERD/ffm.py rename PFERD/{IliasAuthenticators.py => ilias_authenticators.py} (88%) rename PFERD/{ReadWriteLock.py => read_write_lock.py} (100%) create mode 100644 PFERD/utils.py create mode 100644 example_config.py diff --git a/PFERD/__init__.py b/PFERD/__init__.py index a7d808f..b580005 100644 --- a/PFERD/__init__.py +++ b/PFERD/__init__.py @@ -1,9 +1,14 @@ -from .IliasAuthenticators import * +from .ffm import * +from .ilias_authenticators import * from .organizer import * +from .utils import * __all__ = ( - IliasAuthenticators.__all__ + - organizer.__all__ + ffm.__all__ + + ilias_authenticators.__all__ + + organizer.__all__ + + utils.__all__ + + [] ) LOG_FORMAT = "[%(levelname)s] %(message)s" diff --git a/PFERD/ffm.py b/PFERD/ffm.py new file mode 100644 index 0000000..1122b72 --- /dev/null +++ b/PFERD/ffm.py @@ -0,0 +1,79 @@ +# Fakultät für Mathematik (FfM) + +import aiohttp +import asyncio +import bs4 +import logging +import pathlib +import re + +from .organizer import Organizer +from . import utils + +__all__ = [ + "FfM", +] +logger = logging.getLogger(__name__) + +class FfM: + BASE_URL = "http://www.math.kit.edu/" + LINK_RE = re.compile(r"^https?://www.math.kit.edu/.*/(.*\.pdf)$") + + RETRY_ATTEMPTS = 5 + RETRY_DELAY = 1 # seconds + + def __init__(self, base_path): + self.base_path = base_path + + self._session = aiohttp.ClientSession() + + async def synchronize(self, urlpart, to_dir, transform=lambda x: x): + logging.info(f"Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.") + + sync_path = pathlib.Path(self.base_path, to_dir) + orga = Organizer(self.base_path, sync_path) + + orga.clean_temp_dir() + + await self._crawl(orga, urlpart, transform) + + orga.clean_sync_dir() + orga.clean_temp_dir() + + async def close(self): + await self._session.close() + + async def _crawl(self, orga, urlpart, transform): + url = self.BASE_URL + urlpart + async with self._session.get(url) as resp: + text = await resp.text() + soup = bs4.BeautifulSoup(text, "html.parser") + + for found in soup.find_all("a", href=self.LINK_RE): + url = found["href"] + filename = re.match(self.LINK_RE, url).group(1) + logger.debug(f"Found file {filename} at {url}") + + old_path = pathlib.PurePath(filename) + new_path = transform(old_path) + if new_path is None: + continue + logger.debug(f"Transformed from {old_path} to {new_path}") + + temp_path = orga.temp_file() + await self._download(url, temp_path) + orga.add_file(temp_path, new_path) + + async def _download(self, url, to_path): + for t in range(self.RETRY_ATTEMPTS): + try: + async with self._session.get(url) as resp: + await utils.stream_to_path(resp, to_path) + except aiohttp.client_exceptions.ServerDisconnectedError: + logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") + await asyncio.sleep(self.RETRY_DELAY) + else: + return + else: + logger.error(f"Could not download {url}") + raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") diff --git a/PFERD/IliasAuthenticators.py b/PFERD/ilias_authenticators.py similarity index 88% rename from PFERD/IliasAuthenticators.py rename to PFERD/ilias_authenticators.py index b1aca1f..120ef96 100644 --- a/PFERD/IliasAuthenticators.py +++ b/PFERD/ilias_authenticators.py @@ -15,21 +15,14 @@ import logging import time import urllib.parse -from .ReadWriteLock import ReadWriteLock +from .read_write_lock import ReadWriteLock +from . import utils __all__ = [ - "OutOfTriesException", - "UnknownFileTypeException", "ShibbolethAuthenticator", ] logger = logging.getLogger(__name__) -class OutOfTriesException(Exception): - pass - -class UnknownFileTypeException(Exception): - pass - class ShibbolethAuthenticator: ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php" @@ -72,7 +65,7 @@ class ShibbolethAuthenticator: await asyncio.sleep(self.RETRY_DELAY) logger.error(f"Could not POST {url} params:{params} data:{data}.") - raise OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") + raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") async def _get(self, url, params=None): for t in range(self.RETRY_ATTEMPTS): @@ -85,7 +78,7 @@ class ShibbolethAuthenticator: await asyncio.sleep(self.RETRY_DELAY) logger.error(f"Could not GET {url} params:{params}.") - raise OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") + raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") def _login_successful(self, soup): saml_response = soup.find("input", {"name": "SAMLResponse"}) @@ -188,39 +181,31 @@ class ShibbolethAuthenticator: else: await self._ensure_authenticated() - async def _stream_to_path(self, resp, to_path): - with open(to_path, 'wb') as fd: - while True: - chunk = await resp.content.read(self.CHUNK_SIZE) - if not chunk: - break - fd.write(chunk) - async def _download(self, url, params, to_path): for t in range(self.RETRY_ATTEMPTS): try: async with self._session.get(url, params=params) as resp: if resp.content_type == "application/pdf": # Yay, we got the file (as long as it's a PDF) - await self._stream_to_path(resp, to_path) + await utils.stream_to_path(resp, to_path) return True elif resp.content_type == "text/html": # Dangit, we're probably not logged in. text = await resp.text() soup = bs4.BeautifulSoup(text, "html.parser") if self._is_logged_in(soup): - raise UnknownFileTypeException(f"Attempting to download a web page (use get_webpage() instead).") + raise utils.UnknownFileTypeException(f"Attempting to download a web page (use get_webpage() instead).") return False else: # What *did* we get? - raise UnknownFileTypeException(f"Unknown file of type {resp.content_type}.") + raise utils.UnknownFileTypeException(f"Unknown file of type {resp.content_type}.") except aiohttp.client_exceptions.ServerDisconnectedError: logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") await asyncio.sleep(self.RETRY_DELAY) logger.error(f"Could not download {url} params:{params}.") - raise OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") + raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") async def download_file(self, file_id, to_path): params = {"target": file_id} diff --git a/PFERD/organizer.py b/PFERD/organizer.py index 6ec032a..95482b4 100644 --- a/PFERD/organizer.py +++ b/PFERD/organizer.py @@ -3,18 +3,14 @@ import logging import pathlib import shutil +from . import utils + __all__ = [ - "FileNotFoundException", "Organizer", ] logger = logging.getLogger(__name__) -class FileNotFoundException(Exception): - pass - class Organizer: - HASH_BUF_SIZE = 1024**2 - def __init__(self, base_dir, sync_dir): """ base_dir - the .tmp directory will be created here @@ -49,7 +45,7 @@ class Organizer: def add_file(self, from_path, to_path): if not from_path.exists(): - raise FileNotFoundException(f"Could not add file at {from_path}") + raise utils.FileNotFoundException(f"Could not add file at {from_path}") # check if sync_dir/to_path is inside sync_dir? to_path = pathlib.Path(self._sync_dir, to_path) diff --git a/PFERD/ReadWriteLock.py b/PFERD/read_write_lock.py similarity index 100% rename from PFERD/ReadWriteLock.py rename to PFERD/read_write_lock.py diff --git a/PFERD/utils.py b/PFERD/utils.py new file mode 100644 index 0000000..bd82468 --- /dev/null +++ b/PFERD/utils.py @@ -0,0 +1,29 @@ +import os + +__all__ = [ + "get_base_dir", + "stream_to_path", + "OutOfTriesException", + "UnknownFileTypeException", + "FileNotFoundException", +] + +def get_base_dir(script_file): + return os.path.dirname(os.path.abspath(script_file)) + +async def stream_to_path(resp, to_path, chunk_size=1024**2): + with open(to_path, 'wb') as fd: + while True: + chunk = await resp.content.read(chunk_size) + if not chunk: + break + fd.write(chunk) + +class OutOfTriesException(Exception): + pass + +class UnknownFileTypeException(Exception): + pass + +class FileNotFoundException(Exception): + pass diff --git a/example_config.py b/example_config.py new file mode 100644 index 0000000..02c8bb6 --- /dev/null +++ b/example_config.py @@ -0,0 +1,29 @@ +import PFERD +import asyncio +import logging +import pathlib + +logging.basicConfig(level=logging.INFO, format=PFERD.LOG_FORMAT) + +base_dir = PFERD.get_base_dir(__file__) + +def hm1(old_path): + if old_path.match("blatt*.pdf"): + return pathlib.PurePath("Blätter", old_path.name) + + return old_path + +def ana1(old_path): + if old_path.match("*zettel*.pdf"): + return pathlib.PurePath("Blätter", old_path.name) + + return old_path + +async def main(): + ffm = PFERD.FfM(base_dir) + await ffm.synchronize("iana2/lehre/hm1info2018w/de", "HM1", transform=hm1) + await ffm.synchronize("iana1/lehre/ana12018w/de", "Ana1", transform=ana1) + await ffm.close() + +if __name__ == "__main__": + asyncio.run(main())