From dfddc9303968a68f613e1f73b136ac5d5a0bc0d9 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Thu, 25 Apr 2019 19:15:36 +0000
Subject: [PATCH] Move norbert from aiohttp to requests

Also fix streaming (when downloading) in the other classes.
---
 PFERD/__init__.py             |  4 +--
 PFERD/ffm.py                  |  2 +-
 PFERD/ilias_authenticators.py |  1 -
 PFERD/norbert.py              | 59 +++++++++++++----------------------
 4 files changed, 25 insertions(+), 41 deletions(-)

diff --git a/PFERD/__init__.py b/PFERD/__init__.py
index 59e2a85..d7db00b 100644
--- a/PFERD/__init__.py
+++ b/PFERD/__init__.py
@@ -1,12 +1,12 @@
 from .ffm import *
 from .ilias import *
-#from .norbert import *
+from .norbert import *
 from .utils import *
 
 __all__ = []
 __all__ += ffm.__all__
 __all__ += ilias.__all__
-#__all__ += norbert.__all__
+__all__ += norbert.__all__
 __all__ += utils.__all__
 
 LOG_FORMAT = "[%(levelname)s] %(message)s"
diff --git a/PFERD/ffm.py b/PFERD/ffm.py
index d039ea4..7150bbb 100644
--- a/PFERD/ffm.py
+++ b/PFERD/ffm.py
@@ -56,5 +56,5 @@ class FfM:
             orga.add_file(temp_path, new_path)
 
     def _download(self, url, to_path):
-        with self._session.get(url) as r:
+        with self._session.get(url, stream=True) as r:
             stream_to_path(r, to_path)
diff --git a/PFERD/ilias_authenticators.py b/PFERD/ilias_authenticators.py
index e923038..33766da 100644
--- a/PFERD/ilias_authenticators.py
+++ b/PFERD/ilias_authenticators.py
@@ -22,7 +22,6 @@ logger = logging.getLogger(__name__)
 
 class ShibbolethAuthenticator:
     ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php"
-    CHUNK_SIZE = 1024**2
 
     ALLOWED_CONTENT_TYPES = [
         "application/pdf",
diff --git a/PFERD/norbert.py b/PFERD/norbert.py
index 8e29dd2..8c7514b 100644
--- a/PFERD/norbert.py
+++ b/PFERD/norbert.py
@@ -1,15 +1,15 @@
 # Norberts Prog-Tuts
 
-import aiohttp
-import asyncio
-import bs4
 import logging
 import pathlib
 import re
 import zipfile
 
+import bs4
+import requests
+
 from .organizer import Organizer
-from . import utils
+from .utils import rename, stream_to_path
 
 __all__ = [
     "Norbert",
@@ -20,15 +20,12 @@ class Norbert:
     BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/"
     LINK_RE = re.compile(r"^progtut/.*/(.*\.zip)$")
 
-    RETRY_ATTEMPTS = 5
-    RETRY_DELAY = 1 # seconds
-
    def __init__(self, base_path):
         self.base_path = base_path
 
-        self._session = aiohttp.ClientSession()
+        self._session = requests.Session()
 
-    async def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True):
+    def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True):
         logging.info(f" Synchronizing to {to_dir} using the Norbert synchronizer.")
 
         sync_path = pathlib.Path(self.base_path, to_dir)
@@ -36,21 +33,20 @@ class Norbert:
 
         orga.clean_temp_dir()
 
-        files = await self._crawl()
-        await self._download(orga, files, transform, unzip)
+        files = self._crawl()
+        self._download(orga, files, transform, unzip)
 
         orga.clean_sync_dir()
         orga.clean_temp_dir()
 
-    async def close(self):
-        await self._session.close()
-
-    async def _crawl(self):
+    def _crawl(self):
         url = self.BASE_URL
-        async with self._session.get(url) as resp:
-            raw = await resp.read()
-            # replace undecodeable characters with a placeholder
-            text = raw.decode("utf-8", "replace")
+        r = self._session.get(url)
+
+        # replace undecodeable characters with a placeholder
+        #text = r.raw.decode("utf-8", "replace")
+
+        text = r.text
 
         soup = bs4.BeautifulSoup(text, "html.parser")
         files = []
@@ -63,21 +59,20 @@ class Norbert:
             path = pathlib.PurePath(filename)
 
             logger.debug(f"Found zip file {filename} at {full_url}")
-
             files.append((path, full_url))
 
         return files
 
-    async def _download(self, orga, files, transform, unzip):
+    def _download(self, orga, files, transform, unzip):
         for path, url in sorted(files):
             # Yes, we want the zip file contents
             if unzip(path):
                 logger.debug(f"Downloading and unzipping {path}")
-                zip_path = utils.rename(path, path.stem)
+                zip_path = rename(path, path.stem)
 
                 # Download zip file
                 temp_file = orga.temp_file()
-                await self._download_zip(url, temp_file)
+                self._download_zip(url, temp_file)
 
                 # Search the zip file for files to extract
                 temp_dir = orga.temp_dir()
@@ -106,19 +101,9 @@ class Norbert:
                 new_path = transform(path)
                 if new_path is not None:
                     temp_file = orga.temp_file()
-                    await self._download_zip(url, temp_file)
+                    self._download_zip(url, temp_file)
                     orga.add_file(temp_file, new_path)
 
-    async def _download_zip(self, url, to_path):
-        for t in range(self.RETRY_ATTEMPTS):
-            try:
-                async with self._session.get(url) as resp:
-                    await utils.stream_to_path(resp, to_path)
-            except aiohttp.client_exceptions.ServerDisconnectedError:
-                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
-                await asyncio.sleep(self.RETRY_DELAY)
-            else:
-                return
-        else:
-            logger.error(f"Could not download {url}")
-            raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
+    def _download_zip(self, url, to_path):
+        with self._session.get(url, stream=True) as r:
+            stream_to_path(r, to_path)
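
A note on the streaming fix: requests only avoids buffering the whole response
body in memory when the request is made with stream=True and the body is then
consumed incrementally, which is why the ffm.py and norbert.py downloads now
pass stream=True before handing the response to stream_to_path. The helper
itself lives in PFERD/utils.py and is not part of this diff, so the following
is only a minimal sketch of that pattern; the function body, chunk size and
example URL are assumptions, not the project's actual code.

    import requests


    def stream_to_path(response, to_path, chunk_size=1024 * 1024):
        # Hypothetical sketch: write the body to disk chunk by chunk. This only
        # helps if the request was made with stream=True; otherwise requests
        # has already buffered the full body before this function is called.
        with open(to_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)


    if __name__ == "__main__":
        session = requests.Session()
        # Using the response as a context manager releases the pooled
        # connection even if the body is not read to the end.
        with session.get("https://example.com/example.zip", stream=True) as r:
            r.raise_for_status()
            stream_to_path(r, "example.zip")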