mirror of https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Try out some HTTP authentication handling
This is by no means final yet and will change a bit once the dl and cl are changed, but it might serve as a first try. It is also wholly untested.
This commit is contained in:
parent 83d12fcf2d
commit 4b104b6252
@@ -1,3 +1,4 @@
+import asyncio
 from abc import ABC, abstractmethod
 from contextlib import asynccontextmanager
 from datetime import datetime
@@ -265,6 +266,33 @@ class HttpCrawler(Crawler):
         self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
         self._output_dir.register_reserved(self.COOKIE_FILE)
+        self._authentication_id = 0
+        self._authentication_lock = asyncio.Lock()
+
+    async def prepare_request(self) -> int:
+        # We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
+        # This should reduce the number of requests we make: if an authentication is in progress,
+        # all future requests wait for it to complete.
+        async with self._authentication_lock:
+            return self._authentication_id
+
+    async def authenticate(self, current_id: int) -> None:
+        async with self._authentication_lock:
+            # Another task successfully called authenticate in between.
+            # We do not want to perform auth again, so return here. We can
+            # assume auth succeeded, as authenticate will throw an error if
+            # it failed.
+            if current_id != self._authentication_id:
+                return
+            await self._authenticate()
+            self._authentication_id += 1
+
+    async def _authenticate(self) -> None:
+        """
+        Performs authentication. This method must only return normally if authentication succeeded.
+        In all other cases it must either retry internally or throw a terminal exception.
+        """
+        raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
+
     async def run(self) -> None:
         cookie_jar = aiohttp.CookieJar()
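The hunk above is the heart of the change: prepare_request() hands out the current authentication id under a lock, and authenticate(current_id) only performs a fresh login if that id is still current, so many concurrently failing requests trigger at most one login. Below is a minimal standalone sketch of the same id-and-lock idea (toy names, not PFERD's API) that can be run to confirm that five concurrent "unauthenticated" tasks cause exactly one login:

# Toy demonstration of the authentication-id pattern above; illustrative only, not PFERD code.
import asyncio


class ToyCrawler:
    def __init__(self) -> None:
        self._authentication_id = 0
        self._authentication_lock = asyncio.Lock()
        self.logins = 0

    async def prepare_request(self) -> int:
        # Waits for any login in progress, then reports which "generation" we are in.
        async with self._authentication_lock:
            return self._authentication_id

    async def authenticate(self, current_id: int) -> None:
        async with self._authentication_lock:
            if current_id != self._authentication_id:
                return  # someone else already re-authenticated for us
            await self._authenticate()
            self._authentication_id += 1

    async def _authenticate(self) -> None:
        await asyncio.sleep(0.01)  # stand-in for talking to the login server
        self.logins += 1


async def worker(crawler: ToyCrawler) -> None:
    auth_id = await crawler.prepare_request()
    # Pretend the request came back as "not logged in" and ask for re-authentication.
    await crawler.authenticate(auth_id)


async def main() -> None:
    crawler = ToyCrawler()
    await asyncio.gather(*(worker(crawler) for _ in range(5)))
    print(crawler.logins)  # prints 1, not 5


asyncio.run(main())

Because _authentication_id is only incremented while the lock is held, a task that read the old id can never sneak a second _authenticate() call in between.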
@@ -5,14 +5,15 @@ from pathlib import PurePath
 from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
 
 import aiohttp
+from aiohttp import hdrs
 from bs4 import BeautifulSoup, Tag
 from rich.markup import escape
 
 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
 from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
-from PFERD.logging import log
-from PFERD.output_dir import Redownload
+from PFERD.logging import ProgressBar, log
+from PFERD.output_dir import FileSink, Redownload
 from PFERD.utils import soupify, url_set_query_param
 
 from .file_templates import link_template_plain, link_template_rich
@@ -232,23 +233,24 @@ class KitIliasWebCrawler(HttpCrawler):
         page = IliasPage(await self._get_page(element.url), element.url, element)
         real_element = page.get_child_elements()[0]
 
-        async with dl as sink, self.session.get(real_element.url) as resp:
-            if resp.content_length:
-                bar.set_total(resp.content_length)
-
-            async for data in resp.content.iter_chunked(1024):
-                sink.file.write(data)
-                bar.advance(len(data))
-
-            sink.done()
+        async with dl as sink:
+            await self._stream_from_url(real_element.url, sink, bar)
 
     async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
         if not dl:
             return
 
-        async with self.download_bar(element_path) as bar:
-            async with dl as sink, self.session.get(element.url) as resp:
+        async with self.download_bar(element_path) as bar, dl as sink:
+            await self._stream_from_url(element.url, sink, bar)
+
+    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
+        async def try_stream() -> bool:
+            async with self.session.get(url, allow_redirects=False) as resp:
+                # Redirect means we weren't authenticated
+                if hdrs.LOCATION in resp.headers:
+                    return False
+
                 if resp.content_length:
                     bar.set_total(resp.content_length)
 
@@ -257,22 +259,39 @@ class KitIliasWebCrawler(HttpCrawler):
                     bar.advance(len(data))
 
                 sink.done()
+            return True
 
-    async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup:
-        # This function will retry itself a few times if it is not logged in - it won't handle
-        # connection errors
-        if retries_left < 0:
-            # TODO: Proper exception
-            raise RuntimeError("Get page failed too often")
-        print(url, "retries left", retries_left)
+        auth_id = await self.prepare_request()
+        if await try_stream():
+            return
+
+        await self.authenticate(auth_id)
+
+        if not await try_stream():
+            raise CrawlError("File streaming failed after authenticate()")
+
+    async def _get_page(self, url: str) -> BeautifulSoup:
+        auth_id = await self.prepare_request()
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
                 return soup
 
-        await self._shibboleth_login.login(self.session)
-
-        return await self._get_page(url, retries_left - 1)
+        # We weren't authenticated, so try to do that
+        await self.authenticate(auth_id)
+
+        # Retry once after authenticating. If this fails, we will die.
+        async with self.session.get(url) as request:
+            soup = soupify(await request.read())
+            if self._is_logged_in(soup):
+                return soup
+        raise CrawlError("get_page failed even after authenticating")
+
+    # We repeat this as the login method in shibboleth doesn't handle I/O errors.
+    # Shibboleth is quite reliable as well, so the repeat is likely not critical here.
+    @_iorepeat(3, "Login")
+    async def _authenticate(self) -> None:
+        await self._shibboleth_login.login(self.session)
 
     @staticmethod
     def _is_logged_in(soup: BeautifulSoup) -> bool:
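On the ILIAS side, file downloads now detect a missing session the way the diff's comment describes: the request is made with allow_redirects=False, and a Location header in the response is taken to mean the server bounced us to the login page. A generic aiohttp sketch of that check, outside of PFERD (the helper name is made up for illustration):

# Illustrative sketch of redirect-based login detection with aiohttp; not PFERD code.
from typing import Optional

import aiohttp
from aiohttp import hdrs


async def fetch_if_authenticated(session: aiohttp.ClientSession, url: str) -> Optional[bytes]:
    # Don't follow redirects: a Location header means we were sent to a login page.
    async with session.get(url, allow_redirects=False) as resp:
        if hdrs.LOCATION in resp.headers:
            return None  # caller should authenticate() and retry exactly once
        return await resp.read()

In the diff this check lives inside try_stream(), which also streams the body into the download sink and drives the progress bar, while _get_page() makes the same decision from the parsed HTML via _is_logged_in(); both retry exactly once after authenticate() and then raise a CrawlError.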