Add kit-ipd crawler

This commit is contained in:
I-Al-Istannen 2021-10-21 12:01:41 +02:00
parent 742632ed8d
commit 6673077397
6 changed files with 196 additions and 0 deletions

View File

@@ -27,6 +27,7 @@ ambiguous situations.
### Added
- `--skip` command line option
- Support for ILIAS booking objects
- A KIT IPD crawler
### Changed
- Using multiple path segments on left side of `-name->` now results in an

View File

@@ -136,6 +136,13 @@ crawler simulate a slower, network-based crawler.
requests. (Default: `0.0`)
- `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
### The `kit-ipd` crawler
This crawler crawls a KIT IPD page by URL. The root page can be crawled from
outside the KIT network, so you will be informed about any new/deleted files,
but downloading files requires you to be within it. Adding a small delay
between requests is likely a good idea.
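
A minimal example section, with hypothetical values for `target` and
`output_dir`; the delay option's exact name is assumed here to be
`delay_between_requests`, following the generic crawler options above:

```ini
[crawl:kit-ipd]
type = kit-ipd
# Hypothetical course page; the target must be an https:// URL.
target = https://example.kit.edu/lehre/some-course
output_dir = kit-ipd-files
# Be polite to the server; option name assumed from the generic options above.
delay_between_requests = 0.5
```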
### The `kit-ilias-web` crawler
This crawler crawls the KIT ILIAS instance.

View File

@@ -9,4 +9,5 @@
from . import command_local # noqa: F401 imported but unused
from . import command_kit_ilias_web # noqa: F401 imported but unused
from . import command_kit_ipd # noqa: F401 imported but unused
from .parser import PARSER, ParserLoadError, load_default_section # noqa: F401 imported but unused

View File

@@ -0,0 +1,46 @@
import argparse
import configparser
from pathlib import Path

from ..logging import log
from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler

SUBPARSER = SUBPARSERS.add_parser(
    "kit-ipd",
    parents=[CRAWLER_PARSER],
)

GROUP = SUBPARSER.add_argument_group(
    title="kit ipd crawler arguments",
    description="arguments for the 'kit-ipd' crawler",
)
GROUP.add_argument(
    "target",
    type=str,
    metavar="TARGET",
    help="url to crawl"
)
GROUP.add_argument(
    "output",
    type=Path,
    metavar="OUTPUT",
    help="output directory"
)


def load(
    args: argparse.Namespace,
    parser: configparser.ConfigParser,
) -> None:
    log.explain("Creating config for command 'kit-ipd'")

    parser["crawl:kit-ipd"] = {}
    section = parser["crawl:kit-ipd"]
    load_crawler(args, section)

    section["type"] = "kit-ipd"
    section["target"] = str(args.target)
    section["output_dir"] = str(args.output)


SUBPARSER.set_defaults(command=load)
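
For illustration (not part of the diff): assuming the project's console entry
point is named `pferd`, the subcommand takes the two positional arguments
defined above, which `load` converts into an in-memory `[crawl:kit-ipd]`
section like the one sketched in the CONFIG.md excerpt:

```console
$ pferd kit-ipd https://example.kit.edu/lehre/some-course ./downloads
```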

View File

@@ -5,6 +5,7 @@ from ..auth import Authenticator
from ..config import Config
from .crawler import Crawler, CrawlError, CrawlerSection # noqa: F401
from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection
from .local_crawler import LocalCrawler, LocalCrawlerSection
CrawlerConstructor = Callable[[
@@ -19,4 +20,6 @@ CRAWLERS: Dict[str, CrawlerConstructor] = {
        LocalCrawler(n, LocalCrawlerSection(s), c),
    "kit-ilias-web": lambda n, s, c, a:
        KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
    "kit-ipd": lambda n, s, c, a:
        KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
}
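
A hypothetical sketch (not from this commit) of how this registry might be
used; `section` and `config` stand in for real parsed objects. Note that the
`kit-ipd` lambda accepts the fourth argument (presumably an `Authenticator`)
but never passes it on, since this crawler needs no login:

```python
# Hypothetical lookup and construction via the CRAWLERS registry.
constructor = CRAWLERS["kit-ipd"]
crawler = constructor("crawl:kit-ipd", section, config, None)  # authenticator unused
```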

View File

@@ -0,0 +1,138 @@
import os
from dataclasses import dataclass
from pathlib import PurePath
from typing import List, Set, Union
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag

from ..config import Config
from ..logging import ProgressBar, log
from ..output_dir import FileSink
from ..utils import soupify
from .crawler import CrawlError
from .http_crawler import HttpCrawler, HttpCrawlerSection


class KitIpdCrawlerSection(HttpCrawlerSection):
    def target(self) -> str:
        target = self.s.get("target")
        if not target:
            self.missing_value("target")
        if not target.startswith("https://"):
            self.invalid_value("target", target, "Should be a URL")
        return target


@dataclass
class KitIpdFile:
    name: str
    url: str


@dataclass
class KitIpdFolder:
    name: str
    files: List[KitIpdFile]


class KitIpdCrawler(HttpCrawler):
    def __init__(
        self,
        name: str,
        section: KitIpdCrawlerSection,
        config: Config,
    ):
        super().__init__(name, section, config)
        self._url = section.target()

    async def _run(self) -> None:
        maybe_cl = await self.crawl(PurePath("."))
        if not maybe_cl:
            return

        folders: List[KitIpdFolder] = []

        async with maybe_cl:
            folder_tags = await self._fetch_folder_tags()
            folders = [self._extract_folder(tag) for tag in folder_tags]

        tasks = [self._crawl_folder(folder) for folder in folders]
        await self.gather(tasks)

    async def _crawl_folder(self, folder: KitIpdFolder) -> None:
        path = PurePath(folder.name)
        if not await self.crawl(path):
            return

        tasks = [self._download_file(path, file) for file in folder.files]
        await self.gather(tasks)

    async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
        element_path = parent / file.name
        maybe_dl = await self.download(element_path)
        if not maybe_dl:
            return

        async with maybe_dl as (bar, sink):
            await self._stream_from_url(file.url, sink, bar)

    async def _fetch_folder_tags(self) -> Set[Tag]:
        page = await self.get_page()
        elements: List[Tag] = self._find_file_links(page)
        folder_tags: Set[Tag] = set()

        # Each file link sits in a table cell; the cell before it holds the
        # folder label, which identifies the folder.
        for element in elements:
            enclosing_data: Tag = element.findParent(name="td")
            label: Tag = enclosing_data.findPreviousSibling(name="td")
            folder_tags.add(label)

        return folder_tags

    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
        name = folder_tag.getText().strip()
        files: List[KitIpdFile] = []

        container: Tag = folder_tag.findNextSibling(name="td")
        for link in self._find_file_links(container):
            files.append(self._extract_file(link))

        log.explain_topic(f"Found folder {name!r}")
        for file in files:
            log.explain(f"Found file {file.name!r}")

        return KitIpdFolder(name, files)

    def _extract_file(self, link: Tag) -> KitIpdFile:
        name = link.getText().strip()
        url = self._abs_url_from_link(link)
        # Use the link text as the name and append the extension from the URL.
        _, extension = os.path.splitext(url)
        return KitIpdFile(name + extension, url)

    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
        # File links are identified by "intern" appearing in their href.
        return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x})

    def _abs_url_from_link(self, link_tag: Tag) -> str:
        return urljoin(self._url, link_tag.get("href"))

    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
        async with self.session.get(url, allow_redirects=False) as resp:
            if resp.status == 403:
                raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
            if resp.content_length:
                bar.set_total(resp.content_length)

            async for data in resp.content.iter_chunked(1024):
                sink.file.write(data)
                bar.advance(len(data))

            sink.done()

    async def get_page(self) -> BeautifulSoup:
        async with self.session.get(self._url) as request:
            return soupify(await request.read())
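
To make the traversal above concrete, here is a small self-contained sketch
(not part of the commit) running the same BeautifulSoup logic on hypothetical
IPD-style markup: file links are `<a>` tags whose `href` contains `intern`,
the `<td>` before the link's cell is the folder label, and the file name is
the link text plus the extension taken from the URL:

```python
import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Hypothetical stand-in for a real IPD page.
html = """
<table><tr>
  <td>Lecture Slides</td>
  <td><a href="intern/material/slides01.pdf">Slides 01</a></td>
</tr></table>
"""

soup = BeautifulSoup(html, "html.parser")
link = soup.find("a", attrs={"href": lambda x: x and "intern" in x})

# Folder label: previous <td> sibling of the cell enclosing the link.
folder = link.find_parent("td").find_previous_sibling("td").get_text().strip()
# File name: link text plus the extension of the resolved URL.
_, ext = os.path.splitext(urljoin("https://example.kit.edu/", link.get("href")))

print(folder)                         # Lecture Slides
print(link.get_text().strip() + ext)  # Slides 01.pdf
```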