Add basic auth to KIT-IPD crawler

Author: I-Al-Istannen
Date: 2025-10-29 13:02:18 +01:00
Parent: bd7b384e8f
Commit: 3453bbc991
5 changed files with 39 additions and 3 deletions

View File

@@ -24,6 +24,7 @@ ambiguous situations.
 ## Added
 - Store the description when using the `internet-shortcut` link format
+- Support for basic auth with the kit-ipd crawler
 ## Fixed
 - Event loop errors on Windows with Python 3.14

View File

@@ -153,6 +153,7 @@ requests is likely a good idea.
 - `link_regex`: A regex that is matched against the `href` part of links. If it
   matches, the given link is downloaded as a file. This is used to extract
   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`)
+- `auth`: Name of auth section to use for basic authentication. (Optional)
 ### The `ilias-web` crawler
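
For context on the `auth` option documented above for the kit-ipd crawler, a config wired up this way might look roughly like the sketch below. This is illustrative, not taken from the commit: the `[crawl:ipd]` section name, the target URL, and the general `[crawl:...]`/`type` layout are assumptions about PFERD's usual INI config, while the `auth` option and the `simple` auth type come from this diff. The sketch parses the snippet with Python's standard `configparser` and reads the new option back.

```python
# Hypothetical config using the new `auth` option. The section name
# "crawl:ipd", the URL, and the [crawl:...]/type layout are assumptions;
# the `auth` key and the "simple" auth type come from this commit.
import configparser

EXAMPLE_CONFIG = """
[crawl:ipd]
type = kit-ipd
target = https://example.kit.edu/some/lecture
link_regex = ^.*?[^/]+\\.(pdf|zip|c|cpp|java)$
auth = auth:kit-ipd

[auth:kit-ipd]
type = simple
"""

parser = configparser.ConfigParser()
parser.read_string(EXAMPLE_CONFIG)

# This is the name that KitIpdCrawlerSection.basic_auth() looks up in the
# dict of configured authenticators.
assert parser["crawl:ipd"]["auth"] == "auth:kit-ipd"
assert parser["auth:kit-ipd"]["type"] == "simple"
```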

View File

@@ -20,6 +20,11 @@ GROUP.add_argument(
metavar="REGEX", metavar="REGEX",
help="href-matching regex to identify downloadable files", help="href-matching regex to identify downloadable files",
) )
GROUP.add_argument(
"--basic-auth",
action="store_true",
help="enable basic authentication",
)
GROUP.add_argument( GROUP.add_argument(
"target", "target",
type=str, type=str,
@@ -50,5 +55,11 @@ def load(
     if args.link_regex:
         section["link_regex"] = str(args.link_regex)
+    if args.basic_auth:
+        section["auth"] = "auth:kit-ipd"
+        parser["auth:kit-ipd"] = {}
+        auth_section = parser["auth:kit-ipd"]
+        auth_section["type"] = "simple"
 SUBPARSER.set_defaults(command=load)

View File

@@ -22,5 +22,5 @@ CRAWLERS: dict[str, CrawlerConstructor] = {
"local": lambda n, s, c, a: LocalCrawler(n, LocalCrawlerSection(s), c), "local": lambda n, s, c, a: LocalCrawler(n, LocalCrawlerSection(s), c),
"ilias-web": lambda n, s, c, a: IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a), "ilias-web": lambda n, s, c, a: IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a),
"kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a), "kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
"kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c), "kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c, a),
} }

View File

@@ -8,8 +8,10 @@ from re import Pattern
 from typing import Any, Optional, Union, cast
 from urllib.parse import urljoin
+import aiohttp
 from bs4 import BeautifulSoup, Tag
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink
@@ -33,6 +35,15 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
         regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$")
         return re.compile(regex)
+    def basic_auth(self, authenticators: dict[str, Authenticator]) -> Optional[Authenticator]:
+        value: Optional[str] = self.s.get("auth")
+        if value is None:
+            return None
+        auth = authenticators.get(value)
+        if auth is None:
+            self.invalid_value("auth", value, "No such auth section exists")
+        return auth
 @dataclass
 class KitIpdFile:
@@ -60,12 +71,19 @@ class KitIpdCrawler(HttpCrawler):
         name: str,
         section: KitIpdCrawlerSection,
         config: Config,
+        authenticators: dict[str, Authenticator],
     ):
         super().__init__(name, section, config)
         self._url = section.target()
         self._file_regex = section.link_regex()
+        self._authenticator = section.basic_auth(authenticators)
+        self._basic_auth: Optional[aiohttp.BasicAuth] = None
     async def _run(self) -> None:
+        if self._authenticator:
+            username, password = await self._authenticator.credentials()
+            self._basic_auth = aiohttp.BasicAuth(username, password)
         maybe_cl = await self.crawl(PurePath("."))
         if not maybe_cl:
             return
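
A design note on the constructor change above: the crawler only records which authenticator is configured, and the actual username and password are requested once at the start of `_run`, then cached as a single `aiohttp.BasicAuth` object for all later requests. A rough standalone sketch of that pattern follows; `PromptAuthenticator` and `SketchCrawler` are stand-ins invented for illustration, and only `aiohttp.BasicAuth` plus the await-at-run-time shape mirror the diff.

```python
# Standalone sketch of "resolve credentials lazily, then reuse one BasicAuth
# object for all requests". PromptAuthenticator is a stand-in, not PFERD's
# Authenticator class.
import asyncio
from typing import Optional

import aiohttp


class PromptAuthenticator:
    async def credentials(self) -> tuple[str, str]:
        # A real implementation might prompt, read a keyring, etc.
        return "user", "secret"


class SketchCrawler:
    def __init__(self, authenticator: Optional[PromptAuthenticator]) -> None:
        self._authenticator = authenticator
        self._basic_auth: Optional[aiohttp.BasicAuth] = None

    async def run(self) -> None:
        # Credentials are only requested when a crawl actually starts.
        if self._authenticator:
            username, password = await self._authenticator.credentials()
            self._basic_auth = aiohttp.BasicAuth(username, password)
        # ... subsequent requests would pass auth=self._basic_auth ...


asyncio.run(SketchCrawler(PromptAuthenticator()).run())
```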
@@ -160,9 +178,14 @@ class KitIpdCrawler(HttpCrawler):
         return urljoin(url, cast(str, link_tag.get("href")))
     async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
-        async with self.session.get(url, allow_redirects=False) as resp:
+        async with self.session.get(url, allow_redirects=False, auth=self._basic_auth) as resp:
             if resp.status == 403:
                 raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
+            if resp.status == 401:
+                raise CrawlError("Received a 401. Do you maybe need credentials?")
+            if resp.status >= 400:
+                raise CrawlError(f"Received HTTP {resp.status} when trying to download {url!r}")
             if resp.content_length:
                 bar.set_total(resp.content_length)
@@ -175,7 +198,7 @@ class KitIpdCrawler(HttpCrawler):
         self._add_etag_to_report(path, resp.headers.get("ETag"))
     async def get_page(self) -> tuple[BeautifulSoup, str]:
-        async with self.session.get(self._url) as request:
+        async with self.session.get(self._url, auth=self._basic_auth) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
             # hack enables those pages to be crawled, and should hopefully not
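
The two request-side changes above thread the cached `aiohttp.BasicAuth` into every GET via `auth=`; when no auth section is configured the value stays `None`, which aiohttp treats as "send no Authorization header", so unauthenticated crawls behave as before. Below is a self-contained sketch of that request and error-handling shape, with a placeholder exception and URL rather than PFERD's `CrawlError`.

```python
# Sketch of an optional-basic-auth download helper. DownloadError and the
# example URL are placeholders; only the aiohttp calls and the 401/403/>=400
# handling mirror the diff above.
import asyncio
from typing import Optional

import aiohttp


class DownloadError(Exception):
    pass


async def fetch(url: str, basic_auth: Optional[aiohttp.BasicAuth]) -> bytes:
    async with aiohttp.ClientSession() as session:
        # auth=None means no Authorization header is sent.
        async with session.get(url, allow_redirects=False, auth=basic_auth) as resp:
            if resp.status == 403:
                raise DownloadError("Received a 403. Are you within the KIT network/VPN?")
            if resp.status == 401:
                raise DownloadError("Received a 401. Do you maybe need credentials?")
            if resp.status >= 400:
                raise DownloadError(f"Received HTTP {resp.status} when trying to download {url!r}")
            return await resp.read()


# Example (needs a reachable URL and valid credentials):
# asyncio.run(fetch("https://example.org/file.pdf", aiohttp.BasicAuth("user", "secret")))
```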