pferd/PFERD/ffm.py
Joscha 2034c9d426 Add FfM (Fachschaft für Mathematik) synchronizer
This commit moves exceptions and some other things into utils.py and
renames files according to python's file naming guides (kinda).

It also adds a new example config using the new FfM downloader.
2018-11-24 08:27:33 +00:00

80 lines
2.1 KiB
Python

# Fakultät für Mathematik (FfM)
import aiohttp
import asyncio
import bs4
import logging
import pathlib
import re
from .organizer import Organizer
from . import utils
# Public API of this module: only the synchronizer class is exported.
__all__ = ["FfM"]

# Module-level logger, named after this module so it can be configured
# independently of the root logger.
logger = logging.getLogger(__name__)
class FfM:
    """Synchronizer for PDF files linked from pages below ``BASE_URL``.

    A page is fetched, every ``<a>`` element whose ``href`` matches
    ``LINK_RE`` is collected, and each linked PDF is downloaded into a
    temporary file and then handed to an ``Organizer``.

    The instance owns an ``aiohttp.ClientSession``; call :meth:`close`
    when done with it.
    """

    BASE_URL = "http://www.math.kit.edu/"
    # Absolute links to PDF files on the site; group 1 is the file name
    # (last path segment). The dots of the host name are escaped so that
    # they only match a literal "." and not an arbitrary character.
    LINK_RE = re.compile(r"^https?://www\.math\.kit\.edu/.*/(.*\.pdf)$")

    RETRY_ATTEMPTS = 5
    RETRY_DELAY = 1 # seconds

    def __init__(self, base_path):
        """Create a synchronizer.

        Args:
            base_path: Directory that ``to_dir`` arguments of
                :meth:`synchronize` are resolved against; it is also
                passed to every ``Organizer`` this object creates.
        """
        self.base_path = base_path

        # NOTE(review): the session is created outside of a coroutine.
        # Presumably callers construct FfM while an event loop is
        # available; confirm against the aiohttp version in use.
        self._session = aiohttp.ClientSession()

    async def synchronize(self, urlpart, to_dir, transform=lambda x: x):
        """Download all PDFs linked from ``BASE_URL + urlpart`` into ``to_dir``.

        Args:
            urlpart: Path appended to ``BASE_URL`` to form the page URL.
            to_dir: Target directory, relative to ``base_path``.
            transform: Callable receiving the ``pathlib.PurePath`` of each
                found file name and returning the path to store it under,
                or ``None`` to skip that file. Defaults to the identity.

        Raises:
            utils.OutOfTriesException: If a file could not be downloaded
                within ``RETRY_ATTEMPTS`` attempts.
        """
        # Use the module logger (not the root ``logging`` functions) so
        # this message obeys the same configuration as the other ones.
        logger.info(f"Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.")

        sync_path = pathlib.Path(self.base_path, to_dir)
        orga = Organizer(self.base_path, sync_path)

        # Presumably removes leftovers of an earlier, aborted run.
        orga.clean_temp_dir()

        await self._crawl(orga, urlpart, transform)

        # Only reached when crawling succeeded; presumably tidies the
        # sync directory and then the temporary directory.
        orga.clean_sync_dir()
        orga.clean_temp_dir()

    async def close(self):
        """Close the underlying HTTP session."""
        await self._session.close()

    async def _crawl(self, orga, urlpart, transform):
        """Fetch the page at ``BASE_URL + urlpart`` and download its PDFs.

        Every link matching ``LINK_RE`` is passed through ``transform``;
        files for which it returns ``None`` are skipped, all others are
        downloaded to a temporary file and registered with ``orga``.
        Downloads happen one after another, in document order.
        """
        page_url = self.BASE_URL + urlpart

        async with self._session.get(page_url) as resp:
            text = await resp.text()
        soup = bs4.BeautifulSoup(text, "html.parser")

        for found in soup.find_all("a", href=self.LINK_RE):
            url = found["href"]
            # find_all() already filtered by LINK_RE, so this matches.
            filename = self.LINK_RE.match(url).group(1)
            logger.debug(f"Found file {filename} at {url}")

            old_path = pathlib.PurePath(filename)
            new_path = transform(old_path)
            if new_path is None:
                # The transform asked for this file to be skipped.
                continue
            logger.debug(f"Transformed from {old_path} to {new_path}")

            temp_path = orga.temp_file()
            await self._download(url, temp_path)
            orga.add_file(temp_path, new_path)

    async def _download(self, url, to_path):
        """Stream ``url`` into ``to_path``, retrying on server disconnects.

        Up to ``RETRY_ATTEMPTS`` attempts are made, waiting ``RETRY_DELAY``
        seconds between two attempts. Only ``ServerDisconnectedError`` is
        retried; any other exception propagates immediately.

        Raises:
            utils.OutOfTriesException: If every attempt failed.
        """
        last_error = None

        for attempt in range(1, self.RETRY_ATTEMPTS + 1):
            try:
                async with self._session.get(url) as resp:
                    await utils.stream_to_path(resp, to_path)
            except aiohttp.client_exceptions.ServerDisconnectedError as err:
                last_error = err
                if attempt < self.RETRY_ATTEMPTS:
                    logger.debug(f"Try {attempt} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                    await asyncio.sleep(self.RETRY_DELAY)
                else:
                    # No retry follows the final attempt, so neither
                    # announce one nor waste time sleeping.
                    logger.debug(f"Try {attempt} out of {self.RETRY_ATTEMPTS} failed")
            else:
                return

        logger.error(f"Could not download {url}")
        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.") from last_error