Try out some HTTP authentication handling

This is by no means final yet and will change a bit once the dl and cl
are changed, but it might serve as a first try. It is also wholly
untested.
I-Al-Istannen
2021-05-21 12:02:51 +02:00
parent 83d12fcf2d
commit 4b104b6252
2 changed files with 69 additions and 22 deletions
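The core idea of the change: fetch a resource with redirects disabled, treat a Location header as "we are not authenticated", authenticate, and retry exactly once. A minimal, self-contained sketch of that pattern using plain aiohttp (the fetch_with_reauth helper, AuthError, and the injected login callback are made-up names for illustration; only the redirect check and the single retry mirror the diff below):

import aiohttp
from aiohttp import hdrs
from typing import Awaitable, Callable, Optional


class AuthError(Exception):
    pass


async def fetch_with_reauth(
    session: aiohttp.ClientSession,
    url: str,
    login: Callable[[aiohttp.ClientSession], Awaitable[None]],
) -> bytes:
    async def try_fetch() -> Optional[bytes]:
        # allow_redirects=False: a Location header means we were bounced
        # towards a login page instead of receiving the resource itself.
        async with session.get(url, allow_redirects=False) as resp:
            if hdrs.LOCATION in resp.headers:
                return None
            return await resp.read()

    data = await try_fetch()
    if data is not None:
        return data

    # Not (or no longer) authenticated: log in and retry exactly once.
    await login(session)

    data = await try_fetch()
    if data is None:
        raise AuthError(f"Still redirected to a login page after re-authenticating: {url}")
    return data

Retrying only once keeps a broken login from looping forever; the diff applies the same single-retry rule both to file streaming (_stream_from_url) and to page fetching (_get_page).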

@@ -5,14 +5,15 @@ from pathlib import PurePath
from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
import aiohttp
from aiohttp import hdrs
from bs4 import BeautifulSoup, Tag
from rich.markup import escape
from PFERD.authenticators import Authenticator
from PFERD.config import Config
from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
from PFERD.logging import log
from PFERD.output_dir import Redownload
from PFERD.logging import ProgressBar, log
from PFERD.output_dir import FileSink, Redownload
from PFERD.utils import soupify, url_set_query_param
from .file_templates import link_template_plain, link_template_rich
@@ -232,23 +233,24 @@ class KitIliasWebCrawler(HttpCrawler):
            page = IliasPage(await self._get_page(element.url), element.url, element)
            real_element = page.get_child_elements()[0]
            async with dl as sink, self.session.get(real_element.url) as resp:
                if resp.content_length:
                    bar.set_total(resp.content_length)
                async for data in resp.content.iter_chunked(1024):
                    sink.file.write(data)
                    bar.advance(len(data))
                sink.done()
            async with dl as sink:
                await self._stream_from_url(real_element.url, sink, bar)

    async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
        dl = await self.download(element_path, mtime=element.mtime)
        if not dl:
            return
        async with self.download_bar(element_path) as bar:
            async with dl as sink, self.session.get(element.url) as resp:
        async with self.download_bar(element_path) as bar, dl as sink:
            await self._stream_from_url(element.url, sink, bar)

    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
        async def try_stream() -> bool:
            async with self.session.get(url, allow_redirects=False) as resp:
                # Redirect means we weren't authenticated
                if hdrs.LOCATION in resp.headers:
                    return False
                if resp.content_length:
                    bar.set_total(resp.content_length)
@@ -257,22 +259,39 @@ class KitIliasWebCrawler(HttpCrawler):
                    bar.advance(len(data))
                sink.done()
            return True

    async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup:
        # This function will retry itself a few times if it is not logged in - it won't handle
        # connection errors
        if retries_left < 0:
            # TODO: Proper exception
            raise RuntimeError("Get page failed too often")
        print(url, "retries left", retries_left)
        auth_id = await self.prepare_request()
        if await try_stream():
            return
        await self.authenticate(auth_id)
        if not await try_stream():
            raise CrawlError("File streaming failed after authenticate()")

    async def _get_page(self, url: str) -> BeautifulSoup:
        auth_id = await self.prepare_request()
        async with self.session.get(url) as request:
            soup = soupify(await request.read())
            if self._is_logged_in(soup):
                return soup
        await self._shibboleth_login.login(self.session)
        # We weren't authenticated, so try to do that
        await self.authenticate(auth_id)
        return await self._get_page(url, retries_left - 1)
        # Retry once after authenticating. If this fails, we will die.
        async with self.session.get(url) as request:
            soup = soupify(await request.read())
            if self._is_logged_in(soup):
                return soup
        raise CrawlError("get_page failed even after authenticating")

    # We repeat this as the login method in shibboleth doesn't handle I/O errors.
    # Shibboleth is quite reliable as well, the repeat is likely not critical here.
    @_iorepeat(3, "Login")
    async def _authenticate(self) -> None:
        await self._shibboleth_login.login(self.session)

    @staticmethod
    def _is_logged_in(soup: BeautifulSoup) -> bool: