Mirror of https://github.com/Garmelon/PFERD.git, synced 2025-11-03 22:23:41 +01:00
	Add basic auth to KIT-IPD crawler
@@ -24,6 +24,7 @@ ambiguous situations.
 
 ## Added
 - Store the description when using the `internet-shortcut` link format
+- Support for basic auth with the kit-ipd crawler
 
 ## Fixed
 - Event loop errors on Windows with Python 3.14
@@ -153,6 +153,7 @@ requests is likely a good idea.
 - `link_regex`: A regex that is matched against the `href` part of links. If it
   matches, the given link is downloaded as a file. This is used to extract
   files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`)
+- `auth`: Name of auth section to use for basic authentication. (Optional)
 
 ### The `ilias-web` crawler
 
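For reference, a minimal sketch of the config shape this option implies, built with Python's configparser the same way the CLI code below does. Only the `auth` key and the `type = simple` auth section are taken from this commit; the crawler section name, target URL, and output file name are illustrative placeholders.

```python
# Sketch of the config implied by this commit; not PFERD's own code.
from configparser import ConfigParser

parser = ConfigParser()

# Hypothetical kit-ipd crawler section; "target" and "link_regex" predate this commit.
parser["crawl:ipd"] = {
    "type": "kit-ipd",
    "target": "https://example.kit.edu/some-lecture",  # placeholder URL
    "auth": "auth:kit-ipd",  # new option: name of the auth section to use
}

# The referenced auth section; the CLI creates it with type "simple".
parser["auth:kit-ipd"] = {
    "type": "simple",
}

with open("pferd.cfg", "w", encoding="utf-8") as f:
    parser.write(f)
```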
@@ -20,6 +20,11 @@ GROUP.add_argument(
     metavar="REGEX",
     help="href-matching regex to identify downloadable files",
 )
+GROUP.add_argument(
+    "--basic-auth",
+    action="store_true",
+    help="enable basic authentication",
+)
 GROUP.add_argument(
     "target",
     type=str,
@@ -50,5 +55,11 @@ def load(
     if args.link_regex:
         section["link_regex"] = str(args.link_regex)
 
+    if args.basic_auth:
+        section["auth"] = "auth:kit-ipd"
+        parser["auth:kit-ipd"] = {}
+        auth_section = parser["auth:kit-ipd"]
+        auth_section["type"] = "simple"
+
 
 SUBPARSER.set_defaults(command=load)
@@ -22,5 +22,5 @@ CRAWLERS: dict[str, CrawlerConstructor] = {
     "local": lambda n, s, c, a: LocalCrawler(n, LocalCrawlerSection(s), c),
     "ilias-web": lambda n, s, c, a: IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a),
     "kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
-    "kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
+    "kit-ipd": lambda n, s, c, a: KitIpdCrawler(n, KitIpdCrawlerSection(s), c, a),
 }
@@ -8,8 +8,10 @@ from re import Pattern
 from typing import Any, Optional, Union, cast
 from urllib.parse import urljoin
 
+import aiohttp
 from bs4 import BeautifulSoup, Tag
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink
@@ -33,6 +35,15 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
         regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$")
         return re.compile(regex)
 
+    def basic_auth(self, authenticators: dict[str, Authenticator]) -> Optional[Authenticator]:
+        value: Optional[str] = self.s.get("auth")
+        if value is None:
+            return None
+        auth = authenticators.get(value)
+        if auth is None:
+            self.invalid_value("auth", value, "No such auth section exists")
+        return auth
+
 
 @dataclass
 class KitIpdFile:
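The new `basic_auth` helper is a lookup of an optional `auth` value in the registry of named authenticators. A standalone sketch of that pattern, using a stand-in authenticator class and a plain exception instead of PFERD's `Authenticator` and `invalid_value`:

```python
from typing import Optional


class FakeAuthenticator:
    """Stand-in for PFERD's Authenticator, used only for illustration."""

    def __init__(self, username: str, password: str) -> None:
        self._username = username
        self._password = password

    async def credentials(self) -> tuple[str, str]:
        # PFERD authenticators expose credentials asynchronously.
        return self._username, self._password


def resolve_auth(
    value: Optional[str],
    authenticators: dict[str, FakeAuthenticator],
) -> Optional[FakeAuthenticator]:
    # No "auth" key configured: basic auth stays disabled.
    if value is None:
        return None
    auth = authenticators.get(value)
    # An unknown section name is a configuration error.
    if auth is None:
        raise ValueError(f"No such auth section exists: {value!r}")
    return auth


auths = {"auth:kit-ipd": FakeAuthenticator("user", "pass")}
assert resolve_auth(None, auths) is None
assert resolve_auth("auth:kit-ipd", auths) is auths["auth:kit-ipd"]
```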
@@ -60,12 +71,19 @@ class KitIpdCrawler(HttpCrawler):
         name: str,
         section: KitIpdCrawlerSection,
         config: Config,
+        authenticators: dict[str, Authenticator],
     ):
         super().__init__(name, section, config)
         self._url = section.target()
         self._file_regex = section.link_regex()
+        self._authenticator = section.basic_auth(authenticators)
+        self._basic_auth: Optional[aiohttp.BasicAuth] = None
 
     async def _run(self) -> None:
+        if self._authenticator:
+            username, password = await self._authenticator.credentials()
+            self._basic_auth = aiohttp.BasicAuth(username, password)
+
         maybe_cl = await self.crawl(PurePath("."))
         if not maybe_cl:
             return
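A standalone sketch (not PFERD code) of the mechanism wired up here: credentials are resolved once, wrapped in `aiohttp.BasicAuth`, and handed to each request via the `auth` keyword, which also accepts `None` for the unauthenticated case. The URL and credentials below are placeholders.

```python
import asyncio
from typing import Optional

import aiohttp


async def fetch_with_optional_basic_auth(
    url: str,
    credentials: Optional[tuple[str, str]],
) -> bytes:
    # Mirrors the crawler's setup: _basic_auth stays None unless an
    # authenticator is configured, and aiohttp happily accepts auth=None.
    basic_auth: Optional[aiohttp.BasicAuth] = None
    if credentials is not None:
        username, password = credentials
        basic_auth = aiohttp.BasicAuth(username, password)

    async with aiohttp.ClientSession() as session:
        async with session.get(url, auth=basic_auth) as resp:
            if resp.status == 401:
                raise RuntimeError("Received a 401. Do you maybe need credentials?")
            if resp.status >= 400:
                raise RuntimeError(f"Received HTTP {resp.status} for {url!r}")
            return await resp.read()


if __name__ == "__main__":
    data = asyncio.run(
        fetch_with_optional_basic_auth("https://example.org/", ("user", "pass"))
    )
    print(len(data))
```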
@@ -160,9 +178,14 @@ class KitIpdCrawler(HttpCrawler):
         return urljoin(url, cast(str, link_tag.get("href")))
 
     async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
-        async with self.session.get(url, allow_redirects=False) as resp:
+        async with self.session.get(url, allow_redirects=False, auth=self._basic_auth) as resp:
             if resp.status == 403:
                 raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
+            if resp.status == 401:
+                raise CrawlError("Received a 401. Do you maybe need credentials?")
+            if resp.status >= 400:
+                raise CrawlError(f"Received HTTP {resp.status} when trying to download {url!r}")
+
             if resp.content_length:
                 bar.set_total(resp.content_length)
 
@@ -175,7 +198,7 @@ class KitIpdCrawler(HttpCrawler):
             self._add_etag_to_report(path, resp.headers.get("ETag"))
 
     async def get_page(self) -> tuple[BeautifulSoup, str]:
-        async with self.session.get(self._url) as request:
+        async with self.session.get(self._url, auth=self._basic_auth) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
             # hack enables those pages to be crawled, and should hopefully not