Try out some HTTP authentication handling

This is by no means final yet and will change a bit once the dl and cl
are changed, but it might serve as a first try. It is also wholly
untested.
I-Al-Istannen 2021-05-21 12:02:51 +02:00
parent 83d12fcf2d
commit 4b104b6252
2 changed files with 69 additions and 22 deletions


@@ -1,3 +1,4 @@
+import asyncio
 from abc import ABC, abstractmethod
 from contextlib import asynccontextmanager
 from datetime import datetime
@@ -265,6 +266,33 @@ class HttpCrawler(Crawler):
         self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
         self._output_dir.register_reserved(self.COOKIE_FILE)
 
+        self._authentication_id = 0
+        self._authentication_lock = asyncio.Lock()
+
+    async def prepare_request(self) -> int:
+        # We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
+        # This should reduce the amount of requests we make: If an authentication is in progress
+        # all future requests wait for authentication to complete.
+        async with self._authentication_lock:
+            return self._authentication_id
+
+    async def authenticate(self, current_id: int) -> None:
+        async with self._authentication_lock:
+            # Another thread successfully called authenticate in between.
+            # We do not want to perform auth again, so return here. We can
+            # assume auth succeeded as authenticate will throw an error if
+            # it failed.
+            if current_id != self._authentication_id:
+                return
+            await self._authenticate()
+            self._authentication_id += 1
+
+    async def _authenticate(self) -> None:
+        """
+        Performs authentication. This method must only return normally if authentication succeeded.
+        In all other cases it must either retry internally or throw a terminal exception.
+        """
+        raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
 
     async def run(self) -> None:
         cookie_jar = aiohttp.CookieJar()

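The two hooks added above work together: a caller snapshots the current authentication ID via prepare_request() before making a request, and passes that ID back to authenticate() if the request turned out to be unauthenticated. Only the task that still holds the current ID actually re-authenticates; everyone else returns immediately because the ID has already moved on. A minimal, self-contained sketch of the same pattern outside PFERD (AuthCoordinator, fetch and the fake login are illustrative names, not part of this commit):

import asyncio


class AuthCoordinator:
    """Standalone re-creation of the prepare_request()/authenticate() scheme above."""

    def __init__(self) -> None:
        self._authentication_id = 0
        self._authentication_lock = asyncio.Lock()

    async def prepare_request(self) -> int:
        # Wait for any in-flight authentication, then snapshot the current ID.
        async with self._authentication_lock:
            return self._authentication_id

    async def authenticate(self, current_id: int) -> None:
        async with self._authentication_lock:
            # Someone else already re-authenticated while we waited: nothing to do.
            if current_id != self._authentication_id:
                return
            await self._perform_login()
            self._authentication_id += 1

    async def _perform_login(self) -> None:
        # Stand-in for the expensive real login (e.g. Shibboleth).
        print("logging in")
        await asyncio.sleep(0.1)


async def fetch(coordinator: AuthCoordinator, n: int) -> None:
    auth_id = await coordinator.prepare_request()
    # Pretend the first attempt came back as "not logged in" and retry after auth.
    await coordinator.authenticate(auth_id)
    print(f"request {n} retried successfully")


async def main() -> None:
    coordinator = AuthCoordinator()
    # Ten concurrent "failed" requests still trigger only a single login.
    await asyncio.gather(*(fetch(coordinator, n) for n in range(10)))


asyncio.run(main())

With that coordination in the base class, the ILIAS crawler in the second file only has to override _authenticate() and call the two hooks around its requests, as the diff below shows.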

@@ -5,14 +5,15 @@ from pathlib import PurePath
 from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
 
 import aiohttp
+from aiohttp import hdrs
 from bs4 import BeautifulSoup, Tag
 from rich.markup import escape
 
 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
 from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
-from PFERD.logging import log
-from PFERD.output_dir import Redownload
+from PFERD.logging import ProgressBar, log
+from PFERD.output_dir import FileSink, Redownload
 from PFERD.utils import soupify, url_set_query_param
 
 from .file_templates import link_template_plain, link_template_rich
@@ -232,23 +233,24 @@ class KitIliasWebCrawler(HttpCrawler):
             page = IliasPage(await self._get_page(element.url), element.url, element)
             real_element = page.get_child_elements()[0]
 
-            async with dl as sink, self.session.get(real_element.url) as resp:
-                if resp.content_length:
-                    bar.set_total(resp.content_length)
-
-                async for data in resp.content.iter_chunked(1024):
-                    sink.file.write(data)
-                    bar.advance(len(data))
-
-                sink.done()
+            async with dl as sink:
+                await self._stream_from_url(real_element.url, sink, bar)
 
     async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
         if not dl:
             return
 
-        async with self.download_bar(element_path) as bar:
-            async with dl as sink, self.session.get(element.url) as resp:
+        async with self.download_bar(element_path) as bar, dl as sink:
+            await self._stream_from_url(element.url, sink, bar)
+
+    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
+        async def try_stream() -> bool:
+            async with self.session.get(url, allow_redirects=False) as resp:
+                # Redirect means we weren't authenticated
+                if hdrs.LOCATION in resp.headers:
+                    return False
+
                 if resp.content_length:
                     bar.set_total(resp.content_length)
@@ -257,22 +259,39 @@ class KitIliasWebCrawler(HttpCrawler):
                     bar.advance(len(data))
 
                 sink.done()
+                return True
+
+        auth_id = await self.prepare_request()
+        if await try_stream():
+            return
+
+        await self.authenticate(auth_id)
+
+        if not await try_stream():
+            raise CrawlError("File streaming failed after authenticate()")
 
-    async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup:
-        # This function will retry itself a few times if it is not logged in - it won't handle
-        # connection errors
-        if retries_left < 0:
-            # TODO: Proper exception
-            raise RuntimeError("Get page failed too often")
-
-        print(url, "retries left", retries_left)
+    async def _get_page(self, url: str) -> BeautifulSoup:
+        auth_id = await self.prepare_request()
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
                 return soup
-        await self._shibboleth_login.login(self.session)
-        return await self._get_page(url, retries_left - 1)
+
+        # We weren't authenticated, so try to do that
+        await self.authenticate(auth_id)
+
+        # Retry once after authenticating. If this fails, we will die.
+        async with self.session.get(url) as request:
+            soup = soupify(await request.read())
+            if self._is_logged_in(soup):
+                return soup
+        raise CrawlError("get_page failed even after authenticating")
+
+    # We repeat this as the login method in shibboleth doesn't handle I/O errors.
+    # Shibboleth is quite reliable as well, the repeat is likely not critical here.
+    @_iorepeat(3, "Login")
+    async def _authenticate(self) -> None:
+        await self._shibboleth_login.login(self.session)
 
     @staticmethod
     def _is_logged_in(soup: BeautifulSoup) -> bool:
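One detail worth noting in _stream_from_url: with allow_redirects=False aiohttp hands back the 3xx response itself instead of following it, so the presence of a Location header is what signals that the server bounced us (presumably to a login page). A tiny sketch isolating just that check (the helper name is illustrative, not from this commit):

import aiohttp
from aiohttp import hdrs


async def was_redirected(session: aiohttp.ClientSession, url: str) -> bool:
    # With allow_redirects=False the 3xx response is returned as-is,
    # so a Location header means we were redirected rather than served the file.
    async with session.get(url, allow_redirects=False) as resp:
        return hdrs.LOCATION in resp.headers

_get_page, by contrast, inspects the returned page with _is_logged_in() to make the same "are we authenticated?" decision before deciding whether to call authenticate() and retry.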