Add basic auth to KIT-IPD crawler
@@ -24,6 +24,7 @@ ambiguous situations.
 
 ## Added
 - Store the description when using the `internet-shortcut` link format
+- Support for basic auth with the kit-ipd crawler
 
 ## Fixed
 - Event loop errors on Windows with Python 3.14
@@ -153,6 +153,7 @@ requests is likely a good idea.
 - `link_regex`: A regex that is matched against the `href` part of links. If it
   matches, the given link is downloaded as a file. This is used to extract
   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`)
+- `auth`: Name of auth section to use for basic authentication. (Optional)
 
 ### The `ilias-web` crawler
 
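For reference, a minimal config sketch wiring the new `auth` option to a `simple` auth section. The crawl-section name and target URL below are placeholders; only the `auth = auth:kit-ipd` reference and the `type = simple` auth section mirror what this commit generates.

```ini
; Hypothetical crawl section; adjust the name and target to your course page.
[crawl:kit-ipd-example]
type = kit-ipd
target = https://example.kit.edu/lehre/some-course
; Point the crawler at a basic-auth authenticator by section name.
auth = auth:kit-ipd

[auth:kit-ipd]
type = simple
```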
@@ -20,6 +20,11 @@ GROUP.add_argument(
     metavar="REGEX",
     help="href-matching regex to identify downloadable files",
 )
+GROUP.add_argument(
+    "--basic-auth",
+    action="store_true",
+    help="enable basic authentication",
+)
 GROUP.add_argument(
     "target",
     type=str,
@@ -50,5 +55,11 @@ def load(
     if args.link_regex:
         section["link_regex"] = str(args.link_regex)
 
+    if args.basic_auth:
+        section["auth"] = "auth:kit-ipd"
+        parser["auth:kit-ipd"] = {}
+        auth_section = parser["auth:kit-ipd"]
+        auth_section["type"] = "simple"
+
 
 SUBPARSER.set_defaults(command=load)
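To see what the `--basic-auth` branch above adds to the generated configuration, here is a rough standalone sketch using `configparser`; the crawl-section name and target are made up, while the `auth:kit-ipd` section and its `type = simple` entry come straight from the diff.

```python
# Sketch: reproduce the config sections the CLI glue creates when
# --basic-auth is passed. The section name "crawl:kit-ipd" and the target
# URL are placeholders, not taken from this commit.
from configparser import ConfigParser

parser = ConfigParser()
parser["crawl:kit-ipd"] = {"type": "kit-ipd", "target": "https://example.invalid/course"}
section = parser["crawl:kit-ipd"]

basic_auth = True  # stands in for args.basic_auth
if basic_auth:
    section["auth"] = "auth:kit-ipd"
    parser["auth:kit-ipd"] = {}
    auth_section = parser["auth:kit-ipd"]
    auth_section["type"] = "simple"

# Dump the resulting config, roughly as it would appear in a pferd.cfg file.
for name in parser.sections():
    print(f"[{name}]")
    for key, value in parser[name].items():
        print(f"{key} = {value}")
    print()
```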
@@ -22,5 +22,5 @@ CRAWLERS: dict[str, CrawlerConstructor] = {
     "local": lambda n, s, c, a: LocalCrawler(n, LocalCrawlerSection(s), c),
     "ilias-web": lambda n, s, c, a: IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a),
     "kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
-    "kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
+    "kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c, a),
 }
@@ -8,8 +8,10 @@ from re import Pattern
 from typing import Any, Optional, Union, cast
 from urllib.parse import urljoin
 
+import aiohttp
 from bs4 import BeautifulSoup, Tag
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink
@@ -33,6 +35,15 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
         regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$")
         return re.compile(regex)
 
+    def basic_auth(self, authenticators: dict[str, Authenticator]) -> Optional[Authenticator]:
+        value: Optional[str] = self.s.get("auth")
+        if value is None:
+            return None
+        auth = authenticators.get(value)
+        if auth is None:
+            self.invalid_value("auth", value, "No such auth section exists")
+        return auth
+
 
 @dataclass
 class KitIpdFile:
@@ -60,12 +71,19 @@ class KitIpdCrawler(HttpCrawler):
         name: str,
         section: KitIpdCrawlerSection,
         config: Config,
+        authenticators: dict[str, Authenticator],
     ):
         super().__init__(name, section, config)
         self._url = section.target()
         self._file_regex = section.link_regex()
+        self._authenticator = section.basic_auth(authenticators)
+        self._basic_auth: Optional[aiohttp.BasicAuth] = None
 
     async def _run(self) -> None:
+        if self._authenticator:
+            username, password = await self._authenticator.credentials()
+            self._basic_auth = aiohttp.BasicAuth(username, password)
+
         maybe_cl = await self.crawl(PurePath("."))
         if not maybe_cl:
             return
@@ -160,9 +178,14 @@ class KitIpdCrawler(HttpCrawler):
         return urljoin(url, cast(str, link_tag.get("href")))
 
     async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
-        async with self.session.get(url, allow_redirects=False) as resp:
+        async with self.session.get(url, allow_redirects=False, auth=self._basic_auth) as resp:
             if resp.status == 403:
                 raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
+            if resp.status == 401:
+                raise CrawlError("Received a 401. Do you maybe need credentials?")
+            if resp.status >= 400:
+                raise CrawlError(f"Received HTTP {resp.status} when trying to download {url!r}")
+
             if resp.content_length:
                 bar.set_total(resp.content_length)
 
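The request pattern above, in isolation: an aiohttp GET with optional HTTP basic auth plus explicit handling of 401 and other error responses. This is a minimal sketch with a placeholder URL and credentials, not code from this commit.

```python
import asyncio

import aiohttp


async def fetch(url: str, username: str, password: str) -> bytes:
    # Same idea as the crawler: build a BasicAuth object once, pass it to GET.
    basic_auth = aiohttp.BasicAuth(username, password)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, allow_redirects=False, auth=basic_auth) as resp:
            if resp.status == 401:
                raise RuntimeError("Received a 401. Do you maybe need credentials?")
            if resp.status >= 400:
                raise RuntimeError(f"Received HTTP {resp.status} when trying to download {url!r}")
            return await resp.read()


# Usage (placeholders): asyncio.run(fetch("https://example.invalid/file.pdf", "user", "secret"))
```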
@@ -175,7 +198,7 @@ class KitIpdCrawler(HttpCrawler):
             self._add_etag_to_report(path, resp.headers.get("ETag"))
 
     async def get_page(self) -> tuple[BeautifulSoup, str]:
-        async with self.session.get(self._url) as request:
+        async with self.session.get(self._url, auth=self._basic_auth) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
             # hack enables those pages to be crawled, and should hopefully not