From 4b104b6252cb5ee97481c0842564922757482f85 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Fri, 21 May 2021 12:02:51 +0200
Subject: [PATCH] Try out some HTTP authentication handling

This is by no means final yet and will change a bit once the dl and cl
are changed, but it might serve as a first try. It is also wholly
untested.
---
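Usage sketch for review only (git am ignores everything between the --- marker
and the diffstat): roughly how a concrete crawler could drive the new hooks.
ExampleCrawler, _do_login() and _looks_logged_in() are made-up placeholders;
only prepare_request(), authenticate() and _authenticate() come from this patch.

    class ExampleCrawler(HttpCrawler):
        async def _authenticate(self) -> None:
            # Must only return normally if the login succeeded; otherwise
            # retry internally or raise a terminal exception.
            await self._do_login()  # placeholder for the real login routine

        async def fetch_page(self, url: str) -> str:
            # Remember which "authentication generation" this request saw ...
            auth_id = await self.prepare_request()
            async with self.session.get(url) as resp:
                text = await resp.text()
            if self._looks_logged_in(text):  # placeholder check
                return text

            # ... so authenticate() becomes a no-op if a concurrent request
            # already re-authenticated in the meantime.
            await self.authenticate(auth_id)

            async with self.session.get(url) as resp:
                return await resp.text()
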
+ """ + raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation") async def run(self) -> None: cookie_jar = aiohttp.CookieJar() diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 88732c0..0ca6565 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -5,14 +5,15 @@ from pathlib import PurePath from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union import aiohttp +from aiohttp import hdrs from bs4 import BeautifulSoup, Tag from rich.markup import escape from PFERD.authenticators import Authenticator from PFERD.config import Config from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical -from PFERD.logging import log -from PFERD.output_dir import Redownload +from PFERD.logging import ProgressBar, log +from PFERD.output_dir import FileSink, Redownload from PFERD.utils import soupify, url_set_query_param from .file_templates import link_template_plain, link_template_rich @@ -232,23 +233,24 @@ class KitIliasWebCrawler(HttpCrawler): page = IliasPage(await self._get_page(element.url), element.url, element) real_element = page.get_child_elements()[0] - async with dl as sink, self.session.get(real_element.url) as resp: - if resp.content_length: - bar.set_total(resp.content_length) - - async for data in resp.content.iter_chunked(1024): - sink.file.write(data) - bar.advance(len(data)) - - sink.done() + async with dl as sink: + await self._stream_from_url(real_element.url, sink, bar) async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None: dl = await self.download(element_path, mtime=element.mtime) if not dl: return - async with self.download_bar(element_path) as bar: - async with dl as sink, self.session.get(element.url) as resp: + async with self.download_bar(element_path) as bar, dl as sink: + await self._stream_from_url(element.url, sink, bar) + + async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: + async def try_stream() -> bool: + async with self.session.get(url, allow_redirects=False) as resp: + # Redirect means we weren't authenticated + if hdrs.LOCATION in resp.headers: + return False + if resp.content_length: bar.set_total(resp.content_length) @@ -257,22 +259,39 @@ class KitIliasWebCrawler(HttpCrawler): bar.advance(len(data)) sink.done() + return True - async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup: - # This function will retry itself a few times if it is not logged in - it won't handle - # connection errors - if retries_left < 0: - # TODO: Proper exception - raise RuntimeError("Get page failed too often") - print(url, "retries left", retries_left) + auth_id = await self.prepare_request() + if await try_stream(): + return + + await self.authenticate(auth_id) + + if not await try_stream(): + raise CrawlError("File streaming failed after authenticate()") + + async def _get_page(self, url: str) -> BeautifulSoup: + auth_id = await self.prepare_request() async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): return soup - await self._shibboleth_login.login(self.session) + # We weren't authenticated, so try to do that + await self.authenticate(auth_id) - return await self._get_page(url, retries_left - 1) + # Retry once after authenticating. If this fails, we will die. 
+        async with self.session.get(url) as request:
+            soup = soupify(await request.read())
+            if self._is_logged_in(soup):
+                return soup
+        raise CrawlError("get_page failed even after authenticating")
+
+    # We retry this because the login method in Shibboleth doesn't handle I/O errors itself.
+    # Shibboleth is quite reliable, so the retry is likely not critical here.
+    @_iorepeat(3, "Login")
+    async def _authenticate(self) -> None:
+        await self._shibboleth_login.login(self.session)
 
     @staticmethod
     def _is_logged_in(soup: BeautifulSoup) -> bool:
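
Not part of the patch, just a self-contained sketch of the coordination idea
behind prepare_request()/authenticate(): when several concurrent requests all
notice they are logged out, only the first one should actually re-authenticate,
because everyone else still holds the old authentication id. AuthCoordinator
and request() below are made-up names, not PFERD code.

    import asyncio


    class AuthCoordinator:
        """Stripped-down stand-in for the new HttpCrawler bookkeeping."""

        def __init__(self) -> None:
            self._authentication_id = 0
            self._authentication_lock = asyncio.Lock()
            self.logins = 0

        async def prepare_request(self) -> int:
            # Wait for any in-flight authentication, then record its generation
            async with self._authentication_lock:
                return self._authentication_id

        async def authenticate(self, current_id: int) -> None:
            async with self._authentication_lock:
                if current_id != self._authentication_id:
                    return  # somebody else re-authenticated in the meantime
                self.logins += 1  # stands in for the real _authenticate()
                self._authentication_id += 1


    async def main() -> None:
        coord = AuthCoordinator()

        async def request() -> None:
            auth_id = await coord.prepare_request()
            await asyncio.sleep(0)  # pretend to do HTTP and see a login page
            await coord.authenticate(auth_id)

        await asyncio.gather(*(request() for _ in range(10)))
        print(coord.logins)  # prints 1: the other nine see a newer id and skip the login


    asyncio.run(main())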