mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Add kit-ipd crawler
This commit is contained in:
parent
742632ed8d
commit
6673077397
@ -27,6 +27,7 @@ ambiguous situations.
|
|||||||
### Added
|
### Added
|
||||||
- `--skip` command line option
|
- `--skip` command line option
|
||||||
- Support for ILIAS booking objects
|
- Support for ILIAS booking objects
|
||||||
|
- A KIT IPD crawler
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- Using multiple path segments on left side of `-name->` now results in an
|
- Using multiple path segments on left side of `-name->` now results in an
|
||||||
|
@ -136,6 +136,13 @@ crawler simulate a slower, network-based crawler.
|
|||||||
requests. (Default: `0.0`)
|
requests. (Default: `0.0`)
|
||||||
- `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
|
- `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
|
||||||
|
|
||||||
|
### The `kit-ipd` crawler
|
||||||
|
|
||||||
|
This crawler crawls a KIT IPD page by URL. The root page can be crawled from
|
||||||
|
outside the KIT network so you will be informed about any new/deleted files,
|
||||||
|
but downloading files requires you to be within. Adding a small delay between
|
||||||
|
requests is likely a good idea.
|
||||||
|
|
||||||
### The `kit-ilias-web` crawler
|
### The `kit-ilias-web` crawler
|
||||||
|
|
||||||
This crawler crawls the KIT ILIAS instance.
|
This crawler crawls the KIT ILIAS instance.
|
||||||
|
@ -9,4 +9,5 @@
|
|||||||
|
|
||||||
from . import command_local # noqa: F401 imported but unused
|
from . import command_local # noqa: F401 imported but unused
|
||||||
from . import command_kit_ilias_web # noqa: F401 imported but unused
|
from . import command_kit_ilias_web # noqa: F401 imported but unused
|
||||||
|
from . import command_kit_ipd # noqa: F401 imported but unused
|
||||||
from .parser import PARSER, ParserLoadError, load_default_section # noqa: F401 imported but unused
|
from .parser import PARSER, ParserLoadError, load_default_section # noqa: F401 imported but unused
|
||||||
|
46
PFERD/cli/command_kit_ipd.py
Normal file
46
PFERD/cli/command_kit_ipd.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
import argparse
|
||||||
|
import configparser
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from ..logging import log
|
||||||
|
from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler
|
||||||
|
|
||||||
|
# Register the "kit-ipd" subcommand on the shared CLI parser; it inherits the
# generic crawler options from CRAWLER_PARSER.
SUBPARSER = SUBPARSERS.add_parser(
    "kit-ipd",
    parents=[CRAWLER_PARSER],
)

# Positional arguments specific to the kit-ipd crawler.
GROUP = SUBPARSER.add_argument_group(
    title="kit ipd crawler arguments",
    description="arguments for the 'kit-ipd' crawler",
)
GROUP.add_argument(
    "target",
    type=str,
    metavar="TARGET",
    help="url to crawl"
)
GROUP.add_argument(
    "output",
    type=Path,
    metavar="OUTPUT",
    help="output directory"
)
|
||||||
|
|
||||||
|
|
||||||
|
def load(
    args: argparse.Namespace,
    parser: configparser.ConfigParser,
) -> None:
    """Translate the parsed `kit-ipd` CLI arguments into a config section.

    Creates (or overwrites) the ``crawl:kit-ipd`` section on *parser* and
    fills it with the crawler type, target URL and output directory.
    """
    log.explain("Creating config for command 'kit-ipd'")

    # Bug fix: the section was created as "crawl:kit-ipd" but then looked up
    # as "crawl:ipd", which raised a KeyError before any value was stored.
    # Use a single name for both operations.
    section_name = "crawl:kit-ipd"
    parser[section_name] = {}
    section = parser[section_name]
    load_crawler(args, section)

    section["type"] = "kit-ipd"
    section["target"] = str(args.target)
    section["output_dir"] = str(args.output)


SUBPARSER.set_defaults(command=load)
|
@ -5,6 +5,7 @@ from ..auth import Authenticator
|
|||||||
from ..config import Config
|
from ..config import Config
|
||||||
from .crawler import Crawler, CrawlError, CrawlerSection # noqa: F401
|
from .crawler import Crawler, CrawlError, CrawlerSection # noqa: F401
|
||||||
from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
|
from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
|
||||||
|
from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection
|
||||||
from .local_crawler import LocalCrawler, LocalCrawlerSection
|
from .local_crawler import LocalCrawler, LocalCrawlerSection
|
||||||
|
|
||||||
CrawlerConstructor = Callable[[
|
CrawlerConstructor = Callable[[
|
||||||
@ -19,4 +20,6 @@ CRAWLERS: Dict[str, CrawlerConstructor] = {
|
|||||||
LocalCrawler(n, LocalCrawlerSection(s), c),
|
LocalCrawler(n, LocalCrawlerSection(s), c),
|
||||||
"kit-ilias-web": lambda n, s, c, a:
|
"kit-ilias-web": lambda n, s, c, a:
|
||||||
KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
|
KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
|
||||||
|
"kit-ipd": lambda n, s, c, a:
|
||||||
|
KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
|
||||||
}
|
}
|
||||||
|
138
PFERD/crawl/kit_ipd_crawler.py
Normal file
138
PFERD/crawl/kit_ipd_crawler.py
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import PurePath
|
||||||
|
from typing import List, Set, Union
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
from ..config import Config
|
||||||
|
from ..logging import ProgressBar, log
|
||||||
|
from ..output_dir import FileSink
|
||||||
|
from ..utils import soupify
|
||||||
|
from .crawler import CrawlError
|
||||||
|
from .http_crawler import HttpCrawler, HttpCrawlerSection
|
||||||
|
|
||||||
|
|
||||||
|
class KitIpdCrawlerSection(HttpCrawlerSection):
    """Config section for the kit-ipd crawler."""

    def target(self) -> str:
        """Return the mandatory target URL, validating that it uses https."""
        url = self.s.get("target")
        if not url:
            self.missing_value("target")
        if not url.startswith("https://"):
            self.invalid_value("target", url, "Should be a URL")
        return url
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class KitIpdFile:
    """A single downloadable file found on a KIT IPD page."""

    # Display name; the crawler appends the URL's file extension to it.
    name: str
    # URL the file is downloaded from (absolute, resolved against the page URL).
    url: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class KitIpdFolder:
    """A folder on a KIT IPD page, grouping the files listed under one label."""

    # Folder label text, used as the output directory name.
    name: str
    # Files contained in this folder.
    files: List[KitIpdFile]
|
||||||
|
|
||||||
|
|
||||||
|
class KitIpdCrawler(HttpCrawler):
    """Crawler for KIT IPD course pages.

    Fetches the page at the configured target URL, groups the file links it
    finds into folders (one per table label), and downloads each file into a
    matching output subdirectory. The crawl/download/gather primitives come
    from the HttpCrawler base class.
    """

    def __init__(
            self,
            name: str,
            section: KitIpdCrawlerSection,
            config: Config,
    ):
        super().__init__(name, section, config)
        # Root URL of the IPD page; validated as https by the section.
        self._url = section.target()

    async def _run(self) -> None:
        # Crawl the virtual root first; bail out if it is skipped/excluded.
        maybe_cl = await self.crawl(PurePath("."))
        if not maybe_cl:
            return

        folders: List[KitIpdFolder] = []

        # Parse the page inside the root's context manager, but download the
        # folders afterwards so the root crawl bar closes before downloads start.
        async with maybe_cl:
            folder_tags = await self._fetch_folder_tags()
            folders = [self._extract_folder(tag) for tag in folder_tags]

        tasks = [self._crawl_folder(folder) for folder in folders]

        await self.gather(tasks)

    async def _crawl_folder(self, folder: KitIpdFolder) -> None:
        # One output subdirectory per folder; download all its files concurrently.
        path = PurePath(folder.name)
        if not await self.crawl(path):
            return

        tasks = [self._download_file(path, file) for file in folder.files]

        await self.gather(tasks)

    async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
        element_path = parent / file.name
        # download() returns a falsy value when the file is skipped (e.g. unchanged).
        maybe_dl = await self.download(element_path)
        if not maybe_dl:
            return

        async with maybe_dl as (bar, sink):
            await self._stream_from_url(file.url, sink, bar)

    async def _fetch_folder_tags(self) -> Set[Tag]:
        """Collect the <td> label cells that act as folder headers.

        Each file link sits in a <td>; its preceding sibling <td> is the
        folder label. A set deduplicates labels shared by multiple links.
        """
        page = await self.get_page()
        elements: List[Tag] = self._find_file_links(page)
        folder_tags: Set[Tag] = set()

        for element in elements:
            enclosing_data: Tag = element.findParent(name="td")
            label: Tag = enclosing_data.findPreviousSibling(name="td")
            folder_tags.add(label)

        return folder_tags

    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
        # Folder name is the label cell's text; its files live in the next <td>.
        name = folder_tag.getText().strip()
        files: List[KitIpdFile] = []

        container: Tag = folder_tag.findNextSibling(name="td")
        for link in self._find_file_links(container):
            files.append(self._extract_file(link))

        log.explain_topic(f"Found folder {name!r}")
        for file in files:
            log.explain(f"Found file {file.name!r}")

        return KitIpdFolder(name, files)

    def _extract_file(self, link: Tag) -> KitIpdFile:
        # The link text has no extension, so append the one from the URL.
        name = link.getText().strip()
        url = self._abs_url_from_link(link)
        _, extension = os.path.splitext(url)
        return KitIpdFile(name + extension, url)

    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
        # Anchors whose href contains "intern" — presumably the KIT-internal
        # download area; confirm against an actual IPD page.
        return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x})

    def _abs_url_from_link(self, link_tag: Tag) -> str:
        # Resolve a possibly-relative href against the page's root URL.
        return urljoin(self._url, link_tag.get("href"))

    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
        """Stream *url* into *sink* in 1 KiB chunks, updating the progress bar.

        Raises CrawlError on HTTP 403, which indicates the client is outside
        the KIT network/VPN. Redirects are not followed.
        """
        async with self.session.get(url, allow_redirects=False) as resp:
            if resp.status == 403:
                raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
            if resp.content_length:
                bar.set_total(resp.content_length)

            async for data in resp.content.iter_chunked(1024):
                sink.file.write(data)
                bar.advance(len(data))

            # Mark the download as complete so the sink keeps the file.
            sink.done()

    async def get_page(self) -> BeautifulSoup:
        # Fetch and parse the root page.
        async with self.session.get(self._url) as request:
            return soupify(await request.read())
|
Loading…
Reference in New Issue
Block a user