From 3453bbc99135f2c7af236f82c40f304ad1ab6148 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Wed, 29 Oct 2025 13:02:18 +0100
Subject: [PATCH] Add basic auth to KIT-IPD crawler

---
 CHANGELOG.md                   |  1 +
 CONFIG.md                      |  1 +
 PFERD/cli/command_kit_ipd.py   | 11 +++++++++++
 PFERD/crawl/__init__.py        |  2 +-
 PFERD/crawl/kit_ipd_crawler.py | 27 +++++++++++++++++++++++++--
 5 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4fef0e1..729299e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ ambiguous situations.
 
 ## Added
 - Store the description when using the `internet-shortcut` link format
+- Support for basic auth with the kit-ipd crawler
 
 ## Fixed
 - Event loop errors on Windows with Python 3.14
diff --git a/CONFIG.md b/CONFIG.md
index 4bf082f..b87f75c 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -153,6 +153,7 @@ requests is likely a good idea.
 - `link_regex`: A regex that is matched against the `href` part of links. If it
   matches, the given link is downloaded as a file. This is used to extract
   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`)
+- `auth`: Name of auth section to use for basic authentication. (Optional)
 
 ### The `ilias-web` crawler
 
diff --git a/PFERD/cli/command_kit_ipd.py b/PFERD/cli/command_kit_ipd.py
index 589d9a3..a80af03 100644
--- a/PFERD/cli/command_kit_ipd.py
+++ b/PFERD/cli/command_kit_ipd.py
@@ -20,6 +20,11 @@ GROUP.add_argument(
     metavar="REGEX",
     help="href-matching regex to identify downloadable files",
 )
+GROUP.add_argument(
+    "--basic-auth",
+    action="store_true",
+    help="enable basic authentication",
+)
 GROUP.add_argument(
     "target",
     type=str,
@@ -50,5 +55,11 @@ def load(
     if args.link_regex:
         section["link_regex"] = str(args.link_regex)
 
+    if args.basic_auth:
+        section["auth"] = "auth:kit-ipd"
+        parser["auth:kit-ipd"] = {}
+        auth_section = parser["auth:kit-ipd"]
+        auth_section["type"] = "simple"
+
 
 SUBPARSER.set_defaults(command=load)
diff --git a/PFERD/crawl/__init__.py b/PFERD/crawl/__init__.py
index 6032c97..9ba6a37 100644
--- a/PFERD/crawl/__init__.py
+++ b/PFERD/crawl/__init__.py
@@ -22,5 +22,5 @@ CRAWLERS: dict[str, CrawlerConstructor] = {
     "local": lambda n, s, c, a: LocalCrawler(n, LocalCrawlerSection(s), c),
     "ilias-web": lambda n, s, c, a: IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a),
     "kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
-    "kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
+    "kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c, a),
 }
diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
index 165a661..4dad8f0 100644
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -8,8 +8,10 @@ from re import Pattern
 from typing import Any, Optional, Union, cast
 from urllib.parse import urljoin
 
+import aiohttp
 from bs4 import BeautifulSoup, Tag
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink
@@ -33,6 +35,15 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
         regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$")
         return re.compile(regex)
 
+    def basic_auth(self, authenticators: dict[str, Authenticator]) -> Optional[Authenticator]:
+        value: Optional[str] = self.s.get("auth")
+        if value is None:
+            return None
+        auth = authenticators.get(value)
+        if auth is None:
+            self.invalid_value("auth", value, "No such auth section exists")
+        return auth
+
 
 @dataclass
 class KitIpdFile:
@@ -60,12 +71,19 @@ class KitIpdCrawler(HttpCrawler):
         name: str,
         section: KitIpdCrawlerSection,
         config: Config,
+        authenticators: dict[str, Authenticator],
     ):
         super().__init__(name, section, config)
         self._url = section.target()
         self._file_regex = section.link_regex()
+        self._authenticator = section.basic_auth(authenticators)
+        self._basic_auth: Optional[aiohttp.BasicAuth] = None
 
     async def _run(self) -> None:
+        if self._authenticator:
+            username, password = await self._authenticator.credentials()
+            self._basic_auth = aiohttp.BasicAuth(username, password)
+
         maybe_cl = await self.crawl(PurePath("."))
         if not maybe_cl:
             return
@@ -160,9 +178,14 @@ class KitIpdCrawler(HttpCrawler):
         return urljoin(url, cast(str, link_tag.get("href")))
 
     async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
-        async with self.session.get(url, allow_redirects=False) as resp:
+        async with self.session.get(url, allow_redirects=False, auth=self._basic_auth) as resp:
             if resp.status == 403:
                 raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
 
+            if resp.status == 401:
+                raise CrawlError("Received a 401. Do you maybe need credentials?")
+            if resp.status >= 400:
+                raise CrawlError(f"Received HTTP {resp.status} when trying to download {url!r}")
+
             if resp.content_length:
                 bar.set_total(resp.content_length)
@@ -175,7 +198,7 @@
         self._add_etag_to_report(path, resp.headers.get("ETag"))
 
     async def get_page(self) -> tuple[BeautifulSoup, str]:
-        async with self.session.get(self._url) as request:
+        async with self.session.get(self._url, auth=self._basic_auth) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
             # hack enables those pages to be crawled, and should hopefully not
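
For reference, a config file using the new `auth` option could look roughly like the sketch below. This is only an illustration derived from the patch, not part of it: the section names, target URL, and username are made up, and it assumes the `simple` authenticator prompts interactively for any credentials left out of the config.

    [crawl:ipd-protected]
    type = kit-ipd
    # Hypothetical course page; replace with the real URL
    target = https://example.kit.edu/some/protected/page
    # New option from this patch: name of the auth section to use for basic auth
    auth = auth:ipd-protected

    [auth:ipd-protected]
    type = simple
    # Placeholder username; the password is left out so it can be asked for at runtime
    username = jane.doe

The `--basic-auth` CLI flag added above generates an equivalent pair of sections automatically, naming them `crawl:kit-ipd` and `auth:kit-ipd`.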