From 3453bbc99135f2c7af236f82c40f304ad1ab6148 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Wed, 29 Oct 2025 13:02:18 +0100
Subject: [PATCH] Add basic auth to KIT-IPD crawler

---
 CHANGELOG.md                   |  1 +
 CONFIG.md                      |  1 +
 PFERD/cli/command_kit_ipd.py   | 11 +++++++++++
 PFERD/crawl/__init__.py        |  2 +-
 PFERD/crawl/kit_ipd_crawler.py | 27 +++++++++++++++++++++++++--
 5 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4fef0e1..729299e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ ambiguous situations.
 
 ## Added
 - Store the description when using the `internet-shortcut` link format
+- Support for basic auth with the kit-ipd crawler
 
 ## Fixed
 - Event loop errors on Windows with Python 3.14
diff --git a/CONFIG.md b/CONFIG.md
index 4bf082f..b87f75c 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -153,6 +153,7 @@ requests is likely a good idea.
 - `link_regex`: A regex that is matched against the `href` part of links. If it
   matches, the given link is downloaded as a file. This is used to extract
   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`)
+- `auth`: Name of auth section to use for basic authentication. (Optional)
 
 ### The `ilias-web` crawler
 
diff --git a/PFERD/cli/command_kit_ipd.py b/PFERD/cli/command_kit_ipd.py
index 589d9a3..a80af03 100644
--- a/PFERD/cli/command_kit_ipd.py
+++ b/PFERD/cli/command_kit_ipd.py
@@ -20,6 +20,11 @@ GROUP.add_argument(
     metavar="REGEX",
     help="href-matching regex to identify downloadable files",
 )
+GROUP.add_argument(
+    "--basic-auth",
+    action="store_true",
+    help="enable basic authentication",
+)
 GROUP.add_argument(
     "target",
     type=str,
@@ -50,5 +55,11 @@ def load(
     if args.link_regex:
         section["link_regex"] = str(args.link_regex)
 
+    if args.basic_auth:
+        section["auth"] = "auth:kit-ipd"
+        parser["auth:kit-ipd"] = {}
+        auth_section = parser["auth:kit-ipd"]
+        auth_section["type"] = "simple"
+
 
 SUBPARSER.set_defaults(command=load)
diff --git a/PFERD/crawl/__init__.py b/PFERD/crawl/__init__.py
index 6032c97..9ba6a37 100644
--- a/PFERD/crawl/__init__.py
+++ b/PFERD/crawl/__init__.py
@@ -22,5 +22,5 @@ CRAWLERS: dict[str, CrawlerConstructor] = {
     "local": lambda n, s, c, a: LocalCrawler(n, LocalCrawlerSection(s), c),
     "ilias-web": lambda n, s, c, a: IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a),
     "kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
-    "kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
+    "kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c, a),
 }
diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
index 165a661..4dad8f0 100644
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -8,8 +8,10 @@ from re import Pattern
 from typing import Any, Optional, Union, cast
 from urllib.parse import urljoin
 
+import aiohttp
 from bs4 import BeautifulSoup, Tag
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink
@@ -33,6 +35,15 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
         regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$")
         return re.compile(regex)
 
+    def basic_auth(self, authenticators: dict[str, Authenticator]) -> Optional[Authenticator]:
+        value: Optional[str] = self.s.get("auth")
+        if value is None:
+            return None
+        auth = authenticators.get(value)
+        if auth is None:
+            self.invalid_value("auth", value, "No such auth section exists")
+        return auth
+
 
 @dataclass
 class KitIpdFile:
@@ -60,12 +71,19 @@ class KitIpdCrawler(HttpCrawler):
         name: str,
         section: KitIpdCrawlerSection,
         config: Config,
+        authenticators: dict[str, Authenticator],
     ):
         super().__init__(name, section, config)
         self._url = section.target()
         self._file_regex = section.link_regex()
+        self._authenticator = section.basic_auth(authenticators)
+        self._basic_auth: Optional[aiohttp.BasicAuth] = None
 
     async def _run(self) -> None:
+        if self._authenticator:
+            username, password = await self._authenticator.credentials()
+            self._basic_auth = aiohttp.BasicAuth(username, password)
+
         maybe_cl = await self.crawl(PurePath("."))
         if not maybe_cl:
             return
@@ -160,9 +178,14 @@ class KitIpdCrawler(HttpCrawler):
         return urljoin(url, cast(str, link_tag.get("href")))
 
     async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
-        async with self.session.get(url, allow_redirects=False) as resp:
+        async with self.session.get(url, allow_redirects=False, auth=self._basic_auth) as resp:
             if resp.status == 403:
                 raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
 
+            if resp.status == 401:
+                raise CrawlError("Received a 401. Do you maybe need credentials?")
+            if resp.status >= 400:
+                raise CrawlError(f"Received HTTP {resp.status} when trying to download {url!r}")
+
             if resp.content_length:
                 bar.set_total(resp.content_length)
@@ -175,7 +198,7 @@
         self._add_etag_to_report(path, resp.headers.get("ETag"))
 
     async def get_page(self) -> tuple[BeautifulSoup, str]:
-        async with self.session.get(self._url) as request:
+        async with self.session.get(self._url, auth=self._basic_auth) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
             # hack enables those pages to be crawled, and should hopefully not
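
For reference, a config file using the new `auth` option could look roughly like the sketch below. This is only an illustration derived from the patch, not part of it: the section names, target URL, and username are made up, and it assumes the `simple` authenticator prompts interactively for any credentials left out of the config.

    [crawl:ipd-protected]
    type = kit-ipd
    # Hypothetical course page; replace with the real URL
    target = https://example.kit.edu/some/protected/page
    # New option from this patch: name of the auth section to use for basic auth
    auth = auth:ipd-protected

    [auth:ipd-protected]
    type = simple
    # Placeholder username; the password is left out so it can be asked for at runtime
    username = jane.doe

The `--basic-auth` CLI flag added above generates an equivalent pair of sections automatically, naming them `crawl:kit-ipd` and `auth:kit-ipd`.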