From 82adeb324f1f9f1f6722d4e7a25192c26d5e403c Mon Sep 17 00:00:00 2001
From: Joscha
Date: Thu, 25 Apr 2019 19:01:53 +0000
Subject: [PATCH] Move ffm stuff from aiohttp to requests

---
 PFERD/__init__.py |  4 ++--
 PFERD/ffm.py      | 53 +++++++++++++++--------------------------------
 2 files changed, 19 insertions(+), 38 deletions(-)

diff --git a/PFERD/__init__.py b/PFERD/__init__.py
index 232a361..59e2a85 100644
--- a/PFERD/__init__.py
+++ b/PFERD/__init__.py
@@ -1,10 +1,10 @@
-#from .ffm import *
+from .ffm import *
 from .ilias import *
 #from .norbert import *
 from .utils import *
 
 __all__ = []
-#__all__ += ffm.__all__
+__all__ += ffm.__all__
 __all__ += ilias.__all__
 #__all__ += norbert.__all__
 __all__ += utils.__all__
diff --git a/PFERD/ffm.py b/PFERD/ffm.py
index 54890e9..d039ea4 100644
--- a/PFERD/ffm.py
+++ b/PFERD/ffm.py
@@ -1,53 +1,44 @@
 # Fakultät für Mathematik (FfM)
 
-import aiohttp
-import asyncio
-import bs4
 import logging
 import pathlib
 import re
 
-from .organizer import Organizer
-from . import utils
+import bs4
+import requests
 
-__all__ = [
-    "FfM",
-]
+from .organizer import Organizer
+from .utils import stream_to_path
+
+__all__ = ["FfM"]
 
 logger = logging.getLogger(__name__)
 
 class FfM:
     BASE_URL = "http://www.math.kit.edu/"
     LINK_RE = re.compile(r"^https?://www.math.kit.edu/.*/(.*\.pdf)$")
 
-    RETRY_ATTEMPTS = 5
-    RETRY_DELAY = 1 # seconds
-
     def __init__(self, base_path):
         self.base_path = base_path
-        self._session = aiohttp.ClientSession()
+        self._session = requests.Session()
 
-    async def synchronize(self, urlpart, to_dir, transform=lambda x: x):
+    def synchronize(self, urlpart, to_dir, transform=lambda x: x):
         logging.info(f" Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.")
 
         sync_path = pathlib.Path(self.base_path, to_dir)
-        orga = Organizer(self.base_path, sync_path)
+        orga = Organizer(self.base_path, sync_path)
 
         orga.clean_temp_dir()
 
-        await self._crawl(orga, urlpart, transform)
+        self._crawl(orga, urlpart, transform)
 
         orga.clean_sync_dir()
         orga.clean_temp_dir()
 
-    async def close(self):
-        await self._session.close()
-
-    async def _crawl(self, orga, urlpart, transform):
+    def _crawl(self, orga, urlpart, transform):
         url = self.BASE_URL + urlpart
-        async with self._session.get(url) as resp:
-            text = await resp.text()
-            soup = bs4.BeautifulSoup(text, "html.parser")
+        r = self._session.get(url)
+        soup = bs4.BeautifulSoup(r.text, "html.parser")
 
         for found in soup.find_all("a", href=self.LINK_RE):
             url = found["href"]
@@ -61,19 +52,9 @@ class FfM:
             logger.debug(f"Transformed from {old_path} to {new_path}")
 
             temp_path = orga.temp_file()
-            await self._download(url, temp_path)
+            self._download(url, temp_path)
             orga.add_file(temp_path, new_path)
 
-    async def _download(self, url, to_path):
-        for t in range(self.RETRY_ATTEMPTS):
-            try:
-                async with self._session.get(url) as resp:
-                    await utils.stream_to_path(resp, to_path)
-            except aiohttp.client_exceptions.ServerDisconnectedError:
-                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
-                await asyncio.sleep(self.RETRY_DELAY)
-            else:
-                return
-        else:
-            logger.error(f"Could not download {url}")
-            raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
+    def _download(self, url, to_path):
+        with self._session.get(url) as r:
+            stream_to_path(r, to_path)
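
Note on the new _download: it delegates writing to stream_to_path from PFERD/utils.py, whose requests-side body is not part of this patch. A minimal sketch of what such a helper might look like, assuming it copies the body to disk via Response.iter_content (the signature and chunk size are assumptions, not the project's actual code):

    import requests

    def stream_to_path(response: requests.Response, to_path, chunk_size=1024 * 1024):
        # Copy the response body to to_path in fixed-size chunks, so a large
        # PDF is never built up as a single bytes object on the write path.
        with open(to_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)

One caveat: requests only streams lazily when the request is made with stream=True. Since _download calls self._session.get(url) without it, the full body is already buffered before iter_content yields anything, so the chunking above only bounds the write side.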