From 66730773977a2602aebd5396efc1c6d8bd7b0ad7 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Thu, 21 Oct 2021 12:01:41 +0200
Subject: [PATCH] Add kit-ipd crawler

---
 CHANGELOG.md                   |   1 +
 CONFIG.md                      |   7 ++
 PFERD/cli/__init__.py          |   1 +
 PFERD/cli/command_kit_ipd.py   |  46 +++++++++++
 PFERD/crawl/__init__.py        |   3 +
 PFERD/crawl/kit_ipd_crawler.py | 138 +++++++++++++++++++++++++++++++++
 6 files changed, 196 insertions(+)
 create mode 100644 PFERD/cli/command_kit_ipd.py
 create mode 100644 PFERD/crawl/kit_ipd_crawler.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1ac3a8d..cca4839 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,7 @@ ambiguous situations.
 ### Added
 - `--skip` command line option
 - Support for ILIAS booking objects
+- A KIT IPD crawler
 
 ### Changed
 - Using multiple path segments on left side of `-name->` now results in an
diff --git a/CONFIG.md b/CONFIG.md
index 19afbd2..06b9246 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -136,6 +136,13 @@ crawler simulate a slower, network-based crawler.
   requests. (Default: `0.0`)
 - `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
 
+### The `kit-ipd` crawler
+
+This crawler crawls a KIT IPD page by URL. The root page can be crawled from
+outside the KIT network, so you will be informed about any new/deleted files,
+but downloading the files themselves requires you to be within it. Adding a
+small delay between requests is likely a good idea.
+
 ### The `kit-ilias-web` crawler
 
 This crawler crawls the KIT ILIAS instance.
diff --git a/PFERD/cli/__init__.py b/PFERD/cli/__init__.py
index d70ecd9..efa8f00 100644
--- a/PFERD/cli/__init__.py
+++ b/PFERD/cli/__init__.py
@@ -9,4 +9,5 @@
 
 from . import command_local  # noqa: F401 imported but unused
 from . import command_kit_ilias_web  # noqa: F401 imported but unused
+from . import command_kit_ipd  # noqa: F401 imported but unused
 from .parser import PARSER, ParserLoadError, load_default_section  # noqa: F401 imported but unused
diff --git a/PFERD/cli/command_kit_ipd.py b/PFERD/cli/command_kit_ipd.py
new file mode 100644
index 0000000..480cc9b
--- /dev/null
+++ b/PFERD/cli/command_kit_ipd.py
@@ -0,0 +1,46 @@
+import argparse
+import configparser
+from pathlib import Path
+
+from ..logging import log
+from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler
+
+SUBPARSER = SUBPARSERS.add_parser(
+    "kit-ipd",
+    parents=[CRAWLER_PARSER],
+)
+
+GROUP = SUBPARSER.add_argument_group(
+    title="kit ipd crawler arguments",
+    description="arguments for the 'kit-ipd' crawler",
+)
+GROUP.add_argument(
+    "target",
+    type=str,
+    metavar="TARGET",
+    help="url to crawl"
+)
+GROUP.add_argument(
+    "output",
+    type=Path,
+    metavar="OUTPUT",
+    help="output directory"
+)
+
+
+def load(
+        args: argparse.Namespace,
+        parser: configparser.ConfigParser,
+) -> None:
+    log.explain("Creating config for command 'kit-ipd'")
+
+    parser["crawl:kit-ipd"] = {}
+    section = parser["crawl:kit-ipd"]
+    load_crawler(args, section)
+
+    section["type"] = "kit-ipd"
+    section["target"] = str(args.target)
+    section["output_dir"] = str(args.output)
+
+
+SUBPARSER.set_defaults(command=load)
diff --git a/PFERD/crawl/__init__.py b/PFERD/crawl/__init__.py
index 7eb2fb1..1f8bd59 100644
--- a/PFERD/crawl/__init__.py
+++ b/PFERD/crawl/__init__.py
@@ -5,6 +5,7 @@ from ..auth import Authenticator
 from ..config import Config
 from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401
 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
+from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection
 from .local_crawler import LocalCrawler, LocalCrawlerSection
 
 CrawlerConstructor = Callable[[
@@ -19,4 +20,6 @@ CRAWLERS: Dict[str, CrawlerConstructor] = {
         LocalCrawler(n, LocalCrawlerSection(s), c),
     "kit-ilias-web": lambda n, s, c, a:
         KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
+    "kit-ipd": lambda n, s, c, a:
+        KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
 }
diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
new file mode 100644
index 0000000..4d4addd
--- /dev/null
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -0,0 +1,138 @@
+import os
+from dataclasses import dataclass
+from pathlib import PurePath
+from typing import List, Set, Union
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup, Tag
+
+from ..config import Config
+from ..logging import ProgressBar, log
+from ..output_dir import FileSink
+from ..utils import soupify
+from .crawler import CrawlError
+from .http_crawler import HttpCrawler, HttpCrawlerSection
+
+
+class KitIpdCrawlerSection(HttpCrawlerSection):
+    def target(self) -> str:
+        target = self.s.get("target")
+        if not target:
+            self.missing_value("target")
+
+        if not target.startswith("https://"):
+            self.invalid_value("target", target, "Should be a URL")
+
+        return target
+
+
+@dataclass
+class KitIpdFile:
+    name: str
+    url: str
+
+
+@dataclass
+class KitIpdFolder:
+    name: str
+    files: List[KitIpdFile]
+
+
+class KitIpdCrawler(HttpCrawler):
+
+    def __init__(
+            self,
+            name: str,
+            section: KitIpdCrawlerSection,
+            config: Config,
+    ):
+        super().__init__(name, section, config)
+        self._url = section.target()
+
+    async def _run(self) -> None:
+        maybe_cl = await self.crawl(PurePath("."))
+        if not maybe_cl:
+            return
+
+        folders: List[KitIpdFolder] = []
+
+        async with maybe_cl:
+            folder_tags = await self._fetch_folder_tags()
+            folders = [self._extract_folder(tag) for tag in folder_tags]
+
+        tasks = [self._crawl_folder(folder) for folder in folders]
+
+        await self.gather(tasks)
+
+    async def _crawl_folder(self, folder: KitIpdFolder) -> None:
+        path = PurePath(folder.name)
+        if not await self.crawl(path):
+            return
+
+        tasks = [self._download_file(path, file) for file in folder.files]
+
+        await self.gather(tasks)
+
+    async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
+        element_path = parent / file.name
+        maybe_dl = await self.download(element_path)
+        if not maybe_dl:
+            return
+
+        async with maybe_dl as (bar, sink):
+            await self._stream_from_url(file.url, sink, bar)
+
+    async def _fetch_folder_tags(self) -> Set[Tag]:
+        page = await self.get_page()
+        elements: List[Tag] = self._find_file_links(page)
+        folder_tags: Set[Tag] = set()
+
+        for element in elements:
+            enclosing_data: Tag = element.findParent(name="td")
+            label: Tag = enclosing_data.findPreviousSibling(name="td")
+            folder_tags.add(label)
+
+        return folder_tags
+
+    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
+        name = folder_tag.getText().strip()
+        files: List[KitIpdFile] = []
+
+        container: Tag = folder_tag.findNextSibling(name="td")
+        for link in self._find_file_links(container):
+            files.append(self._extract_file(link))
+
+        log.explain_topic(f"Found folder {name!r}")
+        for file in files:
+            log.explain(f"Found file {file.name!r}")
+
+        return KitIpdFolder(name, files)
+
+    def _extract_file(self, link: Tag) -> KitIpdFile:
+        name = link.getText().strip()
+        url = self._abs_url_from_link(link)
+        _, extension = os.path.splitext(url)
+        return KitIpdFile(name + extension, url)
+
+    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
+        return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x})
+
+    def _abs_url_from_link(self, link_tag: Tag) -> str:
+        return urljoin(self._url, link_tag.get("href"))
+
+    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
+        async with self.session.get(url, allow_redirects=False) as resp:
+            if resp.status == 403:
+                raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
+            if resp.content_length:
+                bar.set_total(resp.content_length)
+
+            async for data in resp.content.iter_chunked(1024):
+                sink.file.write(data)
+                bar.advance(len(data))
+
+            sink.done()
+
+    async def get_page(self) -> BeautifulSoup:
+        async with self.session.get(self._url) as request:
+            return soupify(await request.read())
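
For reference, a minimal usage sketch once this patch is applied, based on the
CLI arguments defined in command_kit_ipd.py and the config keys its `load`
function writes. This is not part of the patch; the URL, output directory and
section name are placeholders.

    # Via the new subcommand:
    pferd kit-ipd "https://example.ipd.kit.edu/some-lecture-page" ./ipd-files

    # Or as an equivalent crawler section in a PFERD config file:
    [crawl:my-ipd-lecture]
    type = kit-ipd
    target = https://example.ipd.kit.edu/some-lecture-page
    output_dir = ./ipd-files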