	Try out some HTTP authentication handling
This is by no means final yet and will change a bit once the dl and cl are changed, but it might serve as a first try. It is also wholly untested.
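At a high level, the diff below adds a per-crawler asyncio.Lock and an authentication "generation" counter to HttpCrawler: prepare_request() returns the current counter value before a request is made, and authenticate(auth_id) re-runs _authenticate() only if that counter has not changed in the meantime, so several concurrent requests that all notice an expired session trigger a single login. KitIliasWebCrawler then overrides _authenticate() with its Shibboleth login and wires the new pair into page fetching and file streaming.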
@@ -1,3 +1,4 @@
+import asyncio
 from abc import ABC, abstractmethod
 from contextlib import asynccontextmanager
 from datetime import datetime
@@ -265,6 +266,33 @@ class HttpCrawler(Crawler):
 
         self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
         self._output_dir.register_reserved(self.COOKIE_FILE)
+        self._authentication_id = 0
+        self._authentication_lock = asyncio.Lock()
+
+    async def prepare_request(self) -> int:
+        # We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
+        # This should reduce the amount of requests we make: If an authentication is in progress
+        # all future requests wait for authentication to complete.
+        async with self._authentication_lock:
+            return self._authentication_id
+
+    async def authenticate(self, current_id: int) -> None:
+        async with self._authentication_lock:
+            # Another thread successfully called authenticate in between.
+            # We do not want to perform auth again, so return here. We can
+            # assume auth succeeded as authenticate will throw an error if
+            # it failed.
+            if current_id != self._authentication_id:
+                return
+            await self._authenticate()
+            self._authentication_id += 1
+
+    async def _authenticate(self) -> None:
+        """
+        Performs authentication. This method must only return normally if authentication succeeded.
+        In all other cases it must either retry internally or throw a terminal exception.
+        """
+        raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
 
     async def run(self) -> None:
         cookie_jar = aiohttp.CookieJar()
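As a rough illustration of how a subclass is expected to drive this pair (not part of the commit: ExampleCrawler, _do_login() and the 401 check are made up for this sketch, and self.session is assumed to exist as it does in the ILIAS crawler below):

class ExampleCrawler(HttpCrawler):
    async def _authenticate(self) -> None:
        # Must only return normally if the login actually succeeded.
        await self._do_login()  # hypothetical login helper

    async def fetch(self, url: str) -> bytes:
        # Remember which authentication "generation" we saw before the request.
        auth_id = await self.prepare_request()
        async with self.session.get(url) as resp:
            if resp.status != 401:  # assumption: 401 means "not logged in"
                return await resp.read()

        # Re-authenticate (a no-op if another task already did) and retry once.
        await self.authenticate(auth_id)
        async with self.session.get(url) as resp:
            return await resp.read()

Because authenticate() compares current_id against the counter while holding the lock, only the first of several concurrently failing requests actually runs _authenticate(); the rest see the bumped counter and return immediately.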
@@ -5,14 +5,15 @@ from pathlib import PurePath
 from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
 
 import aiohttp
+from aiohttp import hdrs
 from bs4 import BeautifulSoup, Tag
 from rich.markup import escape
 
 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
 from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
-from PFERD.logging import log
-from PFERD.output_dir import Redownload
+from PFERD.logging import ProgressBar, log
+from PFERD.output_dir import FileSink, Redownload
 from PFERD.utils import soupify, url_set_query_param
 
 from .file_templates import link_template_plain, link_template_rich
@@ -232,23 +233,24 @@ class KitIliasWebCrawler(HttpCrawler):
             page = IliasPage(await self._get_page(element.url), element.url, element)
             real_element = page.get_child_elements()[0]
 
-            async with dl as sink, self.session.get(real_element.url) as resp:
-                if resp.content_length:
-                    bar.set_total(resp.content_length)
-
-                async for data in resp.content.iter_chunked(1024):
-                    sink.file.write(data)
-                    bar.advance(len(data))
-
-                sink.done()
+            async with dl as sink:
+                await self._stream_from_url(real_element.url, sink, bar)
 
     async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
         if not dl:
             return
 
-        async with self.download_bar(element_path) as bar:
-            async with dl as sink, self.session.get(element.url) as resp:
+        async with self.download_bar(element_path) as bar, dl as sink:
+            await self._stream_from_url(element.url, sink, bar)
+
+    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
+        async def try_stream() -> bool:
+            async with self.session.get(url, allow_redirects=False) as resp:
+                # Redirect means we weren't authenticated
+                if hdrs.LOCATION in resp.headers:
+                    return False
+
                 if resp.content_length:
                     bar.set_total(resp.content_length)
@@ -257,22 +259,39 @@ class KitIliasWebCrawler(HttpCrawler):
                     bar.advance(len(data))
 
                 sink.done()
+            return True
 
-    async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup:
-        # This function will retry itself a few times if it is not logged in - it won't handle
-        # connection errors
-        if retries_left < 0:
-            # TODO: Proper exception
-            raise RuntimeError("Get page failed too often")
-        print(url, "retries left", retries_left)
+        auth_id = await self.prepare_request()
+        if await try_stream():
+            return
+
+        await self.authenticate(auth_id)
+
+        if not await try_stream():
+            raise CrawlError("File streaming failed after authenticate()")
+
+    async def _get_page(self, url: str) -> BeautifulSoup:
+        auth_id = await self.prepare_request()
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
                 return soup
 
-        await self._shibboleth_login.login(self.session)
+        # We weren't authenticated, so try to do that
+        await self.authenticate(auth_id)
 
-        return await self._get_page(url, retries_left - 1)
+        # Retry once after authenticating. If this fails, we will die.
+        async with self.session.get(url) as request:
+            soup = soupify(await request.read())
+            if self._is_logged_in(soup):
+                return soup
+        raise CrawlError("get_page failed even after authenticating")
+
+    # We repeat this as the login method in shibboleth doesn't handle I/O errors.
+    # Shibboleth is quite reliable as well, the repeat is likely not critical here.
+    @_iorepeat(3, "Login")
+    async def _authenticate(self) -> None:
+        await self._shibboleth_login.login(self.session)
 
     @staticmethod
     def _is_logged_in(soup: BeautifulSoup) -> bool:
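The redirect check in try_stream() works because aiohttp does not follow redirects when allow_redirects=False is passed, so a Location header on the response means we were bounced towards a login page instead of receiving the file itself. A standalone sketch of just that detection (the helper name and its usage are illustrative, not from the commit):

import aiohttp
from aiohttp import hdrs


async def is_login_redirect(session: aiohttp.ClientSession, url: str) -> bool:
    # With allow_redirects=False the 3xx response is returned as-is,
    # so a Location header signals a redirect (e.g. to a login page).
    async with session.get(url, allow_redirects=False) as resp:
        return hdrs.LOCATION in resp.headers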