Move norbert from aiohttp to requests

Also fix streaming (when downloading) in the other classes.
Joscha 2019-04-25 19:15:36 +00:00
parent f0c42ce8ec
commit dfddc93039
4 changed files with 25 additions and 41 deletions
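The core of the change: requests only streams a response body when the request is made with stream=True; otherwise the whole file is buffered in memory before stream_to_path ever sees it. The stream_to_path helper itself is not part of this diff; a minimal sketch of the pattern the new code relies on (the helper body and chunk size here are assumptions, not the repository's actual implementation):

    import requests

    def stream_to_path(response, to_path, chunk_size=1024 * 1024):
        # Write a streaming requests response to disk chunk by chunk.
        # Without stream=True on the request, requests downloads the
        # entire body into memory before iter_content can yield anything.
        with open(to_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)

    session = requests.Session()
    with session.get("https://example.com/file.zip", stream=True) as r:
        stream_to_path(r, "file.zip")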


@@ -1,12 +1,12 @@
 from .ffm import *
 from .ilias import *
-#from .norbert import *
+from .norbert import *
 from .utils import *

 __all__ = []
 __all__ += ffm.__all__
 __all__ += ilias.__all__
-#__all__ += norbert.__all__
+__all__ += norbert.__all__
 __all__ += utils.__all__

 LOG_FORMAT = "[%(levelname)s] %(message)s"


@@ -56,5 +56,5 @@ class FfM:
         orga.add_file(temp_path, new_path)

     def _download(self, url, to_path):
-        with self._session.get(url) as r:
+        with self._session.get(url, stream=True) as r:
             stream_to_path(r, to_path)


@@ -22,7 +22,6 @@ logger = logging.getLogger(__name__)
 class ShibbolethAuthenticator:
     ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php"
-    CHUNK_SIZE = 1024**2

     ALLOWED_CONTENT_TYPES = [
         "application/pdf",


@@ -1,15 +1,15 @@
 # Norberts Prog-Tuts

-import aiohttp
-import asyncio
-import bs4
 import logging
 import pathlib
 import re
 import zipfile
+
+import bs4
+import requests

 from .organizer import Organizer
-from . import utils
+from .utils import rename, stream_to_path

 __all__ = [
     "Norbert",
@@ -20,15 +20,12 @@ class Norbert:
     BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/"
     LINK_RE = re.compile(r"^progtut/.*/(.*\.zip)$")

-    RETRY_ATTEMPTS = 5
-    RETRY_DELAY = 1 # seconds
-
     def __init__(self, base_path):
         self.base_path = base_path
-        self._session = aiohttp.ClientSession()
+        self._session = requests.Session()

-    async def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True):
+    def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True):
         logging.info(f" Synchronizing to {to_dir} using the Norbert synchronizer.")

         sync_path = pathlib.Path(self.base_path, to_dir)
@@ -36,21 +33,20 @@ class Norbert:
         orga.clean_temp_dir()

-        files = await self._crawl()
-        await self._download(orga, files, transform, unzip)
+        files = self._crawl()
+        self._download(orga, files, transform, unzip)

         orga.clean_sync_dir()
         orga.clean_temp_dir()

-    async def close(self):
-        await self._session.close()
-
-    async def _crawl(self):
+    def _crawl(self):
         url = self.BASE_URL
-        async with self._session.get(url) as resp:
-            raw = await resp.read()
-            # replace undecodeable characters with a placeholder
-            text = raw.decode("utf-8", "replace")
+        r = self._session.get(url)
+
+        # replace undecodeable characters with a placeholder
+        #text = r.raw.decode("utf-8", "replace")
+        text = r.text

         soup = bs4.BeautifulSoup(text, "html.parser")
         files = []
@@ -63,21 +59,20 @@ class Norbert:
             path = pathlib.PurePath(filename)
             logger.debug(f"Found zip file {filename} at {full_url}")
             files.append((path, full_url))

         return files

-    async def _download(self, orga, files, transform, unzip):
+    def _download(self, orga, files, transform, unzip):
         for path, url in sorted(files):
             # Yes, we want the zip file contents
             if unzip(path):
                 logger.debug(f"Downloading and unzipping {path}")
-                zip_path = utils.rename(path, path.stem)
+                zip_path = rename(path, path.stem)

                 # Download zip file
                 temp_file = orga.temp_file()
-                await self._download_zip(url, temp_file)
+                self._download_zip(url, temp_file)

                 # Search the zip file for files to extract
                 temp_dir = orga.temp_dir()
@@ -106,19 +101,9 @@ class Norbert:
             new_path = transform(path)
             if new_path is not None:
                 temp_file = orga.temp_file()
-                await self._download_zip(url, temp_file)
+                self._download_zip(url, temp_file)
                 orga.add_file(temp_file, new_path)

-    async def _download_zip(self, url, to_path):
-        for t in range(self.RETRY_ATTEMPTS):
-            try:
-                async with self._session.get(url) as resp:
-                    await utils.stream_to_path(resp, to_path)
-            except aiohttp.client_exceptions.ServerDisconnectedError:
-                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
-                await asyncio.sleep(self.RETRY_DELAY)
-            else:
-                return
-        else:
-            logger.error(f"Could not download {url}")
-            raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
+    def _download_zip(self, url, to_path):
+        with self._session.get(url, stream=True) as r:
+            stream_to_path(r, to_path)
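With aiohttp gone, Norbert no longer needs an event loop or an explicit close() call; the requests session simply lives as long as the object. A hypothetical usage sketch of the now-synchronous API (the import path, base path, and callables are made up for illustration):

    # Hypothetical usage; the actual import path depends on the package layout.
    from norbert import Norbert

    n = Norbert(base_path="sync-root")
    n.synchronize(
        "Norbert",                    # subdirectory of base_path to sync into
        transform=lambda path: path,  # keep paths as-is (return None to skip a file)
        unzip=lambda path: True,      # extract every zip's contents
    )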