From 2d9223b8e6f7ab74b78f347879c68890a919e93a Mon Sep 17 00:00:00 2001
From: Joscha
Date: Thu, 29 Nov 2018 09:28:44 +0000
Subject: [PATCH] Add norbert synchronizer

---
 PFERD/__init__.py  |   2 +
 PFERD/norbert.py   | 124 +++++++++++++++++++++++++++++++++++++++++++++
 PFERD/organizer.py |  13 +++++
 example_config.py  |   4 ++
 4 files changed, 143 insertions(+)
 create mode 100644 PFERD/norbert.py

diff --git a/PFERD/__init__.py b/PFERD/__init__.py
index 978aed7..3937db4 100644
--- a/PFERD/__init__.py
+++ b/PFERD/__init__.py
@@ -1,10 +1,12 @@
 from .ffm import *
 from .ilias import *
+from .norbert import *
 from .utils import *
 
 __all__ = (
     ffm.__all__ +
     ilias.__all__ +
+    norbert.__all__ +
     utils.__all__ +
     []
 )
diff --git a/PFERD/norbert.py b/PFERD/norbert.py
new file mode 100644
index 0000000..cf86eaa
--- /dev/null
+++ b/PFERD/norbert.py
@@ -0,0 +1,124 @@
+# Norbert's Prog-Tuts
+
+import aiohttp
+import asyncio
+import bs4
+import logging
+import pathlib
+import re
+import zipfile
+
+from .organizer import Organizer
+from . import utils
+
+__all__ = [
+    "Norbert",
+]
+logger = logging.getLogger(__name__)
+
+
+class Norbert:
+    BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/"
+    LINK_RE = re.compile(r"^progtut/.*/(.*\.zip)$")
+
+    RETRY_ATTEMPTS = 5
+    RETRY_DELAY = 1  # seconds
+
+    def __init__(self, base_path):
+        self.base_path = base_path
+
+        self._session = aiohttp.ClientSession()
+
+    async def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True):
+        logger.info(f"Synchronizing to {to_dir} using the Norbert synchronizer.")
+
+        sync_path = pathlib.Path(self.base_path, to_dir)
+        orga = Organizer(self.base_path, sync_path)
+
+        orga.clean_temp_dir()
+
+        files = await self._crawl()
+        await self._download(orga, files, transform, unzip)
+
+        orga.clean_sync_dir()
+        orga.clean_temp_dir()
+
+    async def close(self):
+        await self._session.close()
+
+    async def _crawl(self):
+        url = self.BASE_URL
+        async with self._session.get(url) as resp:
+            raw = await resp.read()
+            # Replace undecodable characters with a placeholder
+            text = raw.decode("utf-8", "replace")
+        soup = bs4.BeautifulSoup(text, "html.parser")
+
+        files = []
+
+        for found in soup.find_all("a", href=self.LINK_RE):
+            url = found["href"]
+            full_url = self.BASE_URL + url
+
+            filename = self.LINK_RE.search(url).group(1)
+            path = pathlib.PurePath(filename)
+
+            logger.debug(f"Found zip file {filename} at {full_url}")
+
+            files.append((path, full_url))
+
+        return files
+
+    async def _download(self, orga, files, transform, unzip):
+        for path, url in files:
+            # Yes, we want the zip file contents
+            if unzip(path):
+                logger.debug(f"Downloading and unzipping {path}")
+                zip_path = utils.rename(path, path.stem)
+
+                # Download the zip file
+                temp_file = orga.temp_file()
+                await self._download_zip(url, temp_file)
+
+                # Search the zip file for files to extract
+                temp_dir = orga.temp_dir()
+                with zipfile.ZipFile(temp_file, "r") as zf:
+                    for info in zf.infolist():
+                        # Only interested in the files themselves; the directory
+                        # structure is created automatically by orga.add_file()
+                        if info.is_dir():
+                            continue
+
+                        file_path = zip_path / pathlib.PurePath(info.filename)
+                        logger.debug(f"Found {info.filename} at path {file_path}")
+
+                        new_path = transform(file_path)
+                        if new_path is not None:
+                            # Extract to the temp dir and add, the usual deal
+                            extracted_path = zf.extract(info, temp_dir)
+                            extracted_path = pathlib.Path(extracted_path)
+                            orga.add_file(extracted_path, new_path)
+
+            # No, we only want the zip file itself
+            else:
logger.debug(f"Only downloading {path}") + + new_path = transform(path) + if new_path is not None: + temp_file = orga.temp_file() + await self._download_zip(url, temp_file) + orga.add_file(temp_file, new_path) + + async def _download_zip(self, url, to_path): + for t in range(self.RETRY_ATTEMPTS): + try: + async with self._session.get(url) as resp: + await utils.stream_to_path(resp, to_path) + except aiohttp.client_exceptions.ServerDisconnectedError: + logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s") + await asyncio.sleep(self.RETRY_DELAY) + else: + return + else: + logger.error(f"Could not download {url}") + raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") diff --git a/PFERD/organizer.py b/PFERD/organizer.py index 7f5c7ad..b0845f4 100644 --- a/PFERD/organizer.py +++ b/PFERD/organizer.py @@ -34,6 +34,13 @@ class Organizer: self._temp_dir.mkdir(exist_ok=True) logger.debug(f"Cleaned temp dir: {self._temp_dir}") + def temp_dir(self): + nr = self._temp_nr + self._temp_nr += 1 + temp_dir = pathlib.Path(self._temp_dir, f"{nr:08}").resolve() + logger.debug(f"Produced new temp dir: {temp_dir}") + return temp_dir + def temp_file(self): # generate the path to a new temp file in base_path/.tmp/ # make sure no two paths are the same @@ -50,6 +57,12 @@ class Organizer: # check if sync_dir/to_path is inside sync_dir? to_path = pathlib.Path(self._sync_dir, to_path) + if to_path.exists() and to_path.is_dir(): + if self._prompt_yes_no(f"Overwrite folder {to_path} with file?", default=False): + shutil.rmtree(to_path) + else: + logger.warn(f"Could not add file {to_path}") + return if to_path.exists(): if filecmp.cmp(from_path, to_path, shallow=False): diff --git a/example_config.py b/example_config.py index 2057a24..2d1be45 100644 --- a/example_config.py +++ b/example_config.py @@ -147,6 +147,7 @@ async def main(args): ffm = PFERD.FfM(base_dir) ilias = PFERD.ILIAS(base_dir, "cookie_jar") + norbert = PFERD.Norbert(base_dir) if not args or "gbi" in args: await ilias.synchronize("855240", "GBI", transform=gbi_transform, filter=gbi_filter) @@ -156,9 +157,12 @@ async def main(args): await ilias.synchronize("874938", "LA1", transform=la1_transform, filter=la1_filter) if not args or "prog" in args: await ilias.synchronize("851237", "Prog", transform=prog_transform, filter=prog_filter) + if not args or "norbert" in args: + await norbert.synchronize("Prog-Tut") await ffm.close() await ilias.close() + await norbert.close() if __name__ == "__main__": args = sys.argv[1:]