Mirror of https://github.com/Garmelon/PFERD.git (synced 2023-12-21 10:23:01 +01:00)
Add kit-ipd crawler
This commit is contained in:
parent 742632ed8d
commit 6673077397
@@ -27,6 +27,7 @@ ambiguous situations.
### Added
- `--skip` command line option
- Support for ILIAS booking objects
- A KIT IPD crawler

### Changed
- Using multiple path segments on left side of `-name->` now results in an
@@ -136,6 +136,13 @@ crawler simulate a slower, network-based crawler.
  requests. (Default: `0.0`)
- `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
### The `kit-ipd` crawler

This crawler crawls a KIT IPD page by URL. The root page can be crawled from
outside the KIT network, so you will be informed about any new/deleted files,
but downloading files requires you to be within the network. Adding a short
delay between requests is likely a good idea.
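As a minimal sketch (the URL and directory below are placeholders), the section this crawler reads can be generated with Python's `configparser`; the section name `crawl:kit-ipd` and the keys `type`, `target` and `output_dir` are the ones the new `kit-ipd` CLI command writes:

```python
import configparser

config = configparser.ConfigParser()
config["crawl:kit-ipd"] = {
    "type": "kit-ipd",
    # Placeholder values; use the real IPD page URL and your output directory.
    "target": "https://example.kit.edu/some/ipd/page",
    "output_dir": "kit-ipd-files",
}

# Write the section to a config file (the file name here is illustrative).
with open("pferd.cfg", "w") as f:
    config.write(f)
```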
### The `kit-ilias-web` crawler

This crawler crawls the KIT ILIAS instance.
@@ -9,4 +9,5 @@

from . import command_local  # noqa: F401 imported but unused
from . import command_kit_ilias_web  # noqa: F401 imported but unused
from . import command_kit_ipd  # noqa: F401 imported but unused
from .parser import PARSER, ParserLoadError, load_default_section  # noqa: F401 imported but unused
PFERD/cli/command_kit_ipd.py (new file, 46 lines)
@@ -0,0 +1,46 @@
import argparse
import configparser
from pathlib import Path

from ..logging import log
from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler

SUBPARSER = SUBPARSERS.add_parser(
    "kit-ipd",
    parents=[CRAWLER_PARSER],
)

GROUP = SUBPARSER.add_argument_group(
    title="kit ipd crawler arguments",
    description="arguments for the 'kit-ipd' crawler",
)
GROUP.add_argument(
    "target",
    type=str,
    metavar="TARGET",
    help="url to crawl"
)
GROUP.add_argument(
    "output",
    type=Path,
    metavar="OUTPUT",
    help="output directory"
)


def load(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    log.explain("Creating config for command 'kit-ipd'")

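    # Fabricate the same config section a user would write by hand, then hand
    # it to load_crawler for the generic crawler options.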
parser["crawl:kit-ipd"] = {}
|
||||
section = parser["crawl:ipd"]
|
||||
load_crawler(args, section)
|
||||
|
||||
section["type"] = "kit-ipd"
|
||||
section["target"] = str(args.target)
|
||||
section["output_dir"] = str(args.output)
|
||||
|
||||
|
||||
SUBPARSER.set_defaults(command=load)
|
@@ -5,6 +5,7 @@ from ..auth import Authenticator
from ..config import Config
from .crawler import Crawler, CrawlError, CrawlerSection  # noqa: F401
from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection
from .local_crawler import LocalCrawler, LocalCrawlerSection

CrawlerConstructor = Callable[[
@@ -19,4 +20,6 @@ CRAWLERS: Dict[str, CrawlerConstructor] = {
        LocalCrawler(n, LocalCrawlerSection(s), c),
    "kit-ilias-web": lambda n, s, c, a:
        KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
    "kit-ipd": lambda n, s, c, a:
        KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
}
PFERD/crawl/kit_ipd_crawler.py (new file, 138 lines)
@@ -0,0 +1,138 @@
import os
from dataclasses import dataclass
from pathlib import PurePath
from typing import List, Set, Union
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag

from ..config import Config
from ..logging import ProgressBar, log
from ..output_dir import FileSink
from ..utils import soupify
from .crawler import CrawlError
from .http_crawler import HttpCrawler, HttpCrawlerSection


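# Config section for the kit-ipd crawler: only a https:// "target" URL is
# required on top of the generic HTTP crawler options.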
class KitIpdCrawlerSection(HttpCrawlerSection):
    def target(self) -> str:
        target = self.s.get("target")
        if not target:
            self.missing_value("target")

        if not target.startswith("https://"):
            self.invalid_value("target", target, "Should be a URL")

        return target


@dataclass
class KitIpdFile:
    name: str
    url: str


@dataclass
class KitIpdFolder:
    name: str
    files: List[KitIpdFile]


class KitIpdCrawler(HttpCrawler):

    def __init__(
            self,
            name: str,
            section: KitIpdCrawlerSection,
            config: Config,
    ):
        super().__init__(name, section, config)
        self._url = section.target()

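    # Entry point: crawl the root page once, group its links into folders,
    # then crawl the folders (and download their files) concurrently.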
    async def _run(self) -> None:
        maybe_cl = await self.crawl(PurePath("."))
        if not maybe_cl:
            return

        folders: List[KitIpdFolder] = []

        async with maybe_cl:
            folder_tags = await self._fetch_folder_tags()
            folders = [self._extract_folder(tag) for tag in folder_tags]

        tasks = [self._crawl_folder(folder) for folder in folders]

        await self.gather(tasks)

    async def _crawl_folder(self, folder: KitIpdFolder) -> None:
        path = PurePath(folder.name)
        if not await self.crawl(path):
            return

        tasks = [self._download_file(path, file) for file in folder.files]

        await self.gather(tasks)

    async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
        element_path = parent / file.name
        maybe_dl = await self.download(element_path)
        if not maybe_dl:
            return

        async with maybe_dl as (bar, sink):
            await self._stream_from_url(file.url, sink, bar)

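    # Each download link sits in a <td>; the immediately preceding <td> holds
    # the folder label, so those label cells are collected as "folder tags".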
    async def _fetch_folder_tags(self) -> Set[Tag]:
        page = await self.get_page()
        elements: List[Tag] = self._find_file_links(page)
        folder_tags: Set[Tag] = set()

        for element in elements:
            enclosing_data: Tag = element.findParent(name="td")
            label: Tag = enclosing_data.findPreviousSibling(name="td")
            folder_tags.add(label)

        return folder_tags

    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
        name = folder_tag.getText().strip()
        files: List[KitIpdFile] = []

        container: Tag = folder_tag.findNextSibling(name="td")
        for link in self._find_file_links(container):
            files.append(self._extract_file(link))

        log.explain_topic(f"Found folder {name!r}")
        for file in files:
            log.explain(f"Found file {file.name!r}")

        return KitIpdFolder(name, files)

    def _extract_file(self, link: Tag) -> KitIpdFile:
        name = link.getText().strip()
        url = self._abs_url_from_link(link)
        _, extension = os.path.splitext(url)
        return KitIpdFile(name + extension, url)

    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
        return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x})

    def _abs_url_from_link(self, link_tag: Tag) -> str:
        return urljoin(self._url, link_tag.get("href"))

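    # Stream the response body into the sink without following redirects; a 403
    # most likely means the client is outside the KIT network/VPN, so raise a
    # descriptive CrawlError instead.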
    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
        async with self.session.get(url, allow_redirects=False) as resp:
            if resp.status == 403:
                raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
            if resp.content_length:
                bar.set_total(resp.content_length)

            async for data in resp.content.iter_chunked(1024):
                sink.file.write(data)
                bar.advance(len(data))

            sink.done()

    async def get_page(self) -> BeautifulSoup:
        async with self.session.get(self._url) as request:
            return soupify(await request.read())
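To make the link-extraction logic above more concrete, here is a small, self-contained sketch that runs the same BeautifulSoup queries as `_find_file_links` and `_fetch_folder_tags` on a hypothetical HTML fragment; the actual layout of a KIT IPD page may differ:

```python
from bs4 import BeautifulSoup

# Hypothetical fragment shaped like the tables the crawler expects: a label
# cell followed by a cell holding download links whose href contains "intern".
html = """
<table>
  <tr>
    <td>Lecture Slides</td>
    <td>
      <a href="intern/slides01.pdf">Slides 1</a>
      <a href="intern/slides02.pdf">Slides 2</a>
    </td>
  </tr>
</table>
"""

soup = BeautifulSoup(html, "html.parser")

# Same query as _find_file_links: anchors whose href contains "intern".
links = soup.findAll(name="a", attrs={"href": lambda x: x and "intern" in x})

for link in links:
    # Same walk as _fetch_folder_tags: enclosing <td>, then its label sibling.
    enclosing_data = link.findParent(name="td")
    label = enclosing_data.findPreviousSibling(name="td")
    print(label.getText().strip(), "->", link.get("href"))

# Prints:
# Lecture Slides -> intern/slides01.pdf
# Lecture Slides -> intern/slides02.pdf
```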