Add basic auth to KIT-IPD crawler

Author: I-Al-Istannen
Date: 2025-10-29 13:02:18 +01:00
Parent: bd7b384e8f
Commit: 3453bbc991
5 changed files with 39 additions and 3 deletions

View File

@@ -24,6 +24,7 @@ ambiguous situations.
 ## Added
 - Store the description when using the `internet-shortcut` link format
+- Support for basic auth with the kit-ipd crawler
 ## Fixed
 - Event loop errors on Windows with Python 3.14

View File

@@ -153,6 +153,7 @@ requests is likely a good idea.
 - `link_regex`: A regex that is matched against the `href` part of links. If it
   matches, the given link is downloaded as a file. This is used to extract
   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`)
+- `auth`: Name of auth section to use for basic authentication. (Optional)
 ### The `ilias-web` crawler
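
For context on the `auth` option documented above for the kit-ipd crawler, a config wired up this way might look roughly like the sketch below. This is illustrative, not taken from the commit: the `[crawl:ipd]` section name, the target URL, and the general `[crawl:...]`/`type` layout are assumptions about PFERD's usual INI config, while the `auth` option and the `simple` auth type come from this diff. The sketch parses the snippet with Python's standard `configparser` and reads the new option back.

```python
# Hypothetical config using the new `auth` option. The section name
# "crawl:ipd", the URL, and the [crawl:...]/type layout are assumptions;
# the `auth` key and the "simple" auth type come from this commit.
import configparser

EXAMPLE_CONFIG = """
[crawl:ipd]
type = kit-ipd
target = https://example.kit.edu/some/lecture
link_regex = ^.*?[^/]+\\.(pdf|zip|c|cpp|java)$
auth = auth:kit-ipd

[auth:kit-ipd]
type = simple
"""

parser = configparser.ConfigParser()
parser.read_string(EXAMPLE_CONFIG)

# This is the name that KitIpdCrawlerSection.basic_auth() looks up in the
# dict of configured authenticators.
assert parser["crawl:ipd"]["auth"] == "auth:kit-ipd"
assert parser["auth:kit-ipd"]["type"] == "simple"
```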

View File

@@ -20,6 +20,11 @@ GROUP.add_argument(
metavar="REGEX", metavar="REGEX",
help="href-matching regex to identify downloadable files", help="href-matching regex to identify downloadable files",
) )
GROUP.add_argument(
"--basic-auth",
action="store_true",
help="enable basic authentication",
)
GROUP.add_argument( GROUP.add_argument(
"target", "target",
type=str, type=str,
@@ -50,5 +55,11 @@ def load(
     if args.link_regex:
         section["link_regex"] = str(args.link_regex)
+    if args.basic_auth:
+        section["auth"] = "auth:kit-ipd"
+        parser["auth:kit-ipd"] = {}
+        auth_section = parser["auth:kit-ipd"]
+        auth_section["type"] = "simple"
 SUBPARSER.set_defaults(command=load)

View File

@@ -22,5 +22,5 @@ CRAWLERS: dict[str, CrawlerConstructor] = {
"local": lambda n, s, c, a: LocalCrawler(n, LocalCrawlerSection(s), c), "local": lambda n, s, c, a: LocalCrawler(n, LocalCrawlerSection(s), c),
"ilias-web": lambda n, s, c, a: IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a), "ilias-web": lambda n, s, c, a: IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a),
"kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a), "kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
"kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c), "kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c, a),
} }

View File

@@ -8,8 +8,10 @@ from re import Pattern
 from typing import Any, Optional, Union, cast
 from urllib.parse import urljoin
+import aiohttp
 from bs4 import BeautifulSoup, Tag
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink
@@ -33,6 +35,15 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
         regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$")
         return re.compile(regex)
+    def basic_auth(self, authenticators: dict[str, Authenticator]) -> Optional[Authenticator]:
+        value: Optional[str] = self.s.get("auth")
+        if value is None:
+            return None
+        auth = authenticators.get(value)
+        if auth is None:
+            self.invalid_value("auth", value, "No such auth section exists")
+        return auth
 @dataclass
 class KitIpdFile:
@@ -60,12 +71,19 @@ class KitIpdCrawler(HttpCrawler):
         name: str,
         section: KitIpdCrawlerSection,
         config: Config,
+        authenticators: dict[str, Authenticator],
     ):
         super().__init__(name, section, config)
         self._url = section.target()
         self._file_regex = section.link_regex()
+        self._authenticator = section.basic_auth(authenticators)
+        self._basic_auth: Optional[aiohttp.BasicAuth] = None
     async def _run(self) -> None:
+        if self._authenticator:
+            username, password = await self._authenticator.credentials()
+            self._basic_auth = aiohttp.BasicAuth(username, password)
         maybe_cl = await self.crawl(PurePath("."))
         if not maybe_cl:
             return
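
A design note on the constructor change above: the crawler only records which authenticator is configured, and the actual username and password are requested once at the start of `_run`, then cached as a single `aiohttp.BasicAuth` object for all later requests. A rough standalone sketch of that pattern follows; `PromptAuthenticator` and `SketchCrawler` are stand-ins invented for illustration, and only `aiohttp.BasicAuth` plus the await-at-run-time shape mirror the diff.

```python
# Standalone sketch of "resolve credentials lazily, then reuse one BasicAuth
# object for all requests". PromptAuthenticator is a stand-in, not PFERD's
# Authenticator class.
import asyncio
from typing import Optional

import aiohttp


class PromptAuthenticator:
    async def credentials(self) -> tuple[str, str]:
        # A real implementation might prompt, read a keyring, etc.
        return "user", "secret"


class SketchCrawler:
    def __init__(self, authenticator: Optional[PromptAuthenticator]) -> None:
        self._authenticator = authenticator
        self._basic_auth: Optional[aiohttp.BasicAuth] = None

    async def run(self) -> None:
        # Credentials are only requested when a crawl actually starts.
        if self._authenticator:
            username, password = await self._authenticator.credentials()
            self._basic_auth = aiohttp.BasicAuth(username, password)
        # ... subsequent requests would pass auth=self._basic_auth ...


asyncio.run(SketchCrawler(PromptAuthenticator()).run())
```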
@@ -160,9 +178,14 @@ class KitIpdCrawler(HttpCrawler):
         return urljoin(url, cast(str, link_tag.get("href")))
     async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
-        async with self.session.get(url, allow_redirects=False) as resp:
+        async with self.session.get(url, allow_redirects=False, auth=self._basic_auth) as resp:
             if resp.status == 403:
                 raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
+            if resp.status == 401:
+                raise CrawlError("Received a 401. Do you maybe need credentials?")
+            if resp.status >= 400:
+                raise CrawlError(f"Received HTTP {resp.status} when trying to download {url!r}")
             if resp.content_length:
                 bar.set_total(resp.content_length)
@@ -175,7 +198,7 @@ class KitIpdCrawler(HttpCrawler):
         self._add_etag_to_report(path, resp.headers.get("ETag"))
     async def get_page(self) -> tuple[BeautifulSoup, str]:
-        async with self.session.get(self._url) as request:
+        async with self.session.get(self._url, auth=self._basic_auth) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
             # hack enables those pages to be crawled, and should hopefully not
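
The two request-side changes above thread the cached `aiohttp.BasicAuth` into every GET via `auth=`; when no auth section is configured the value stays `None`, which aiohttp treats as "send no Authorization header", so unauthenticated crawls behave as before. Below is a self-contained sketch of that request and error-handling shape, with a placeholder exception and URL rather than PFERD's `CrawlError`.

```python
# Sketch of an optional-basic-auth download helper. DownloadError and the
# example URL are placeholders; only the aiohttp calls and the 401/403/>=400
# handling mirror the diff above.
import asyncio
from typing import Optional

import aiohttp


class DownloadError(Exception):
    pass


async def fetch(url: str, basic_auth: Optional[aiohttp.BasicAuth]) -> bytes:
    async with aiohttp.ClientSession() as session:
        # auth=None means no Authorization header is sent.
        async with session.get(url, allow_redirects=False, auth=basic_auth) as resp:
            if resp.status == 403:
                raise DownloadError("Received a 403. Are you within the KIT network/VPN?")
            if resp.status == 401:
                raise DownloadError("Received a 401. Do you maybe need credentials?")
            if resp.status >= 400:
                raise DownloadError(f"Received HTTP {resp.status} when trying to download {url!r}")
            return await resp.read()


# Example (needs a reachable URL and valid credentials):
# asyncio.run(fetch("https://example.org/file.pdf", aiohttp.BasicAuth("user", "secret")))
```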