mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Add kit-ipd crawler
This commit is contained in:
parent
742632ed8d
commit
6673077397
@ -27,6 +27,7 @@ ambiguous situations.
|
|||||||
### Added
|
### Added
|
||||||
- `--skip` command line option
|
- `--skip` command line option
|
||||||
- Support for ILIAS booking objects
|
- Support for ILIAS booking objects
|
||||||
|
- A KIT IPD crawler
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- Using multiple path segments on left side of `-name->` now results in an
|
- Using multiple path segments on left side of `-name->` now results in an
|
||||||
|
@ -136,6 +136,13 @@ crawler simulate a slower, network-based crawler.
|
|||||||
requests. (Default: `0.0`)
|
requests. (Default: `0.0`)
|
||||||
- `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
|
- `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
|
||||||
|
|
||||||
|
### The `kit-ipd` crawler
|
||||||
|
|
||||||
|
This crawler crawls a KIT IPD page by URL. The root page can be crawled from
|
||||||
|
outside the KIT network so you will be informed about any new/deleted files,
|
||||||
|
but downloading files requires you to be within. Adding a small delay between
|
||||||
|
requests is likely a good idea.
|
||||||
|
|
||||||
### The `kit-ilias-web` crawler
|
### The `kit-ilias-web` crawler
|
||||||
|
|
||||||
This crawler crawls the KIT ILIAS instance.
|
This crawler crawls the KIT ILIAS instance.
|
||||||
|
@ -9,4 +9,5 @@
|
|||||||
|
|
||||||
from . import command_local # noqa: F401 imported but unused
|
from . import command_local # noqa: F401 imported but unused
|
||||||
from . import command_kit_ilias_web # noqa: F401 imported but unused
|
from . import command_kit_ilias_web # noqa: F401 imported but unused
|
||||||
|
from . import command_kit_ipd # noqa: F401 imported but unused
|
||||||
from .parser import PARSER, ParserLoadError, load_default_section # noqa: F401 imported but unused
|
from .parser import PARSER, ParserLoadError, load_default_section # noqa: F401 imported but unused
|
||||||
|
46
PFERD/cli/command_kit_ipd.py
Normal file
46
PFERD/cli/command_kit_ipd.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
import argparse
|
||||||
|
import configparser
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from ..logging import log
|
||||||
|
from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler
|
||||||
|
|
||||||
|
# Register the "kit-ipd" subcommand on the shared CLI parser; it inherits the
# generic crawler options from CRAWLER_PARSER.
SUBPARSER = SUBPARSERS.add_parser(
    "kit-ipd",
    parents=[CRAWLER_PARSER],
)

# Positional arguments specific to the kit-ipd crawler.
GROUP = SUBPARSER.add_argument_group(
    title="kit ipd crawler arguments",
    description="arguments for the 'kit-ipd' crawler",
)
GROUP.add_argument(
    "target",
    type=str,
    metavar="TARGET",
    help="url to crawl"
)
GROUP.add_argument(
    "output",
    type=Path,
    metavar="OUTPUT",
    help="output directory"
)
|
||||||
|
|
||||||
|
|
||||||
|
def load(
    args: argparse.Namespace,
    parser: configparser.ConfigParser,
) -> None:
    """Translate the parsed `kit-ipd` CLI arguments into a config section.

    Creates (or overwrites) the ``crawl:kit-ipd`` section on *parser* and
    fills it with the crawler type, target URL and output directory.
    """
    log.explain("Creating config for command 'kit-ipd'")

    # Bug fix: the section was created as "crawl:kit-ipd" but then looked up
    # as "crawl:ipd", which raised a KeyError before any value was stored.
    # Use a single name for both operations.
    section_name = "crawl:kit-ipd"
    parser[section_name] = {}
    section = parser[section_name]
    load_crawler(args, section)

    section["type"] = "kit-ipd"
    section["target"] = str(args.target)
    section["output_dir"] = str(args.output)


SUBPARSER.set_defaults(command=load)
|
@ -5,6 +5,7 @@ from ..auth import Authenticator
|
|||||||
from ..config import Config
|
from ..config import Config
|
||||||
from .crawler import Crawler, CrawlError, CrawlerSection # noqa: F401
|
from .crawler import Crawler, CrawlError, CrawlerSection # noqa: F401
|
||||||
from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
|
from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
|
||||||
|
from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection
|
||||||
from .local_crawler import LocalCrawler, LocalCrawlerSection
|
from .local_crawler import LocalCrawler, LocalCrawlerSection
|
||||||
|
|
||||||
CrawlerConstructor = Callable[[
|
CrawlerConstructor = Callable[[
|
||||||
@ -19,4 +20,6 @@ CRAWLERS: Dict[str, CrawlerConstructor] = {
|
|||||||
LocalCrawler(n, LocalCrawlerSection(s), c),
|
LocalCrawler(n, LocalCrawlerSection(s), c),
|
||||||
"kit-ilias-web": lambda n, s, c, a:
|
"kit-ilias-web": lambda n, s, c, a:
|
||||||
KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
|
KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
|
||||||
|
"kit-ipd": lambda n, s, c, a:
|
||||||
|
KitIpdCrawler(n, KitIpdCrawlerSection(s), c),
|
||||||
}
|
}
|
||||||
|
138
PFERD/crawl/kit_ipd_crawler.py
Normal file
138
PFERD/crawl/kit_ipd_crawler.py
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import PurePath
|
||||||
|
from typing import List, Set, Union
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
from ..config import Config
|
||||||
|
from ..logging import ProgressBar, log
|
||||||
|
from ..output_dir import FileSink
|
||||||
|
from ..utils import soupify
|
||||||
|
from .crawler import CrawlError
|
||||||
|
from .http_crawler import HttpCrawler, HttpCrawlerSection
|
||||||
|
|
||||||
|
|
||||||
|
class KitIpdCrawlerSection(HttpCrawlerSection):
    """Config section for the kit-ipd crawler."""

    def target(self) -> str:
        """Return the mandatory target URL, validating that it uses https."""
        url = self.s.get("target")
        if not url:
            self.missing_value("target")
        if not url.startswith("https://"):
            self.invalid_value("target", url, "Should be a URL")
        return url
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class KitIpdFile:
    """A single downloadable file found on a KIT IPD page."""

    # Display name; the crawler appends the URL's file extension to it.
    name: str
    # URL the file is downloaded from (absolute, resolved against the page URL).
    url: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class KitIpdFolder:
    """A folder on a KIT IPD page, grouping the files listed under one label."""

    # Folder label text, used as the output directory name.
    name: str
    # Files contained in this folder.
    files: List[KitIpdFile]
|
||||||
|
|
||||||
|
|
||||||
|
class KitIpdCrawler(HttpCrawler):
    """Crawler for KIT IPD course pages.

    Fetches the page at the configured target URL, groups the file links it
    finds into folders (one per table label), and downloads each file into a
    matching output subdirectory. The crawl/download/gather primitives come
    from the HttpCrawler base class.
    """

    def __init__(
            self,
            name: str,
            section: KitIpdCrawlerSection,
            config: Config,
    ):
        super().__init__(name, section, config)
        # Root URL of the IPD page; validated as https by the section.
        self._url = section.target()

    async def _run(self) -> None:
        # Crawl the virtual root first; bail out if it is skipped/excluded.
        maybe_cl = await self.crawl(PurePath("."))
        if not maybe_cl:
            return

        folders: List[KitIpdFolder] = []

        # Parse the page inside the root's context manager, but download the
        # folders afterwards so the root crawl bar closes before downloads start.
        async with maybe_cl:
            folder_tags = await self._fetch_folder_tags()
            folders = [self._extract_folder(tag) for tag in folder_tags]

        tasks = [self._crawl_folder(folder) for folder in folders]

        await self.gather(tasks)

    async def _crawl_folder(self, folder: KitIpdFolder) -> None:
        # One output subdirectory per folder; download all its files concurrently.
        path = PurePath(folder.name)
        if not await self.crawl(path):
            return

        tasks = [self._download_file(path, file) for file in folder.files]

        await self.gather(tasks)

    async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
        element_path = parent / file.name
        # download() returns a falsy value when the file is skipped (e.g. unchanged).
        maybe_dl = await self.download(element_path)
        if not maybe_dl:
            return

        async with maybe_dl as (bar, sink):
            await self._stream_from_url(file.url, sink, bar)

    async def _fetch_folder_tags(self) -> Set[Tag]:
        """Collect the <td> label cells that act as folder headers.

        Each file link sits in a <td>; its preceding sibling <td> is the
        folder label. A set deduplicates labels shared by multiple links.
        """
        page = await self.get_page()
        elements: List[Tag] = self._find_file_links(page)
        folder_tags: Set[Tag] = set()

        for element in elements:
            enclosing_data: Tag = element.findParent(name="td")
            label: Tag = enclosing_data.findPreviousSibling(name="td")
            folder_tags.add(label)

        return folder_tags

    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
        # Folder name is the label cell's text; its files live in the next <td>.
        name = folder_tag.getText().strip()
        files: List[KitIpdFile] = []

        container: Tag = folder_tag.findNextSibling(name="td")
        for link in self._find_file_links(container):
            files.append(self._extract_file(link))

        log.explain_topic(f"Found folder {name!r}")
        for file in files:
            log.explain(f"Found file {file.name!r}")

        return KitIpdFolder(name, files)

    def _extract_file(self, link: Tag) -> KitIpdFile:
        # The link text has no extension, so append the one from the URL.
        name = link.getText().strip()
        url = self._abs_url_from_link(link)
        _, extension = os.path.splitext(url)
        return KitIpdFile(name + extension, url)

    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
        # Anchors whose href contains "intern" — presumably the KIT-internal
        # download area; confirm against an actual IPD page.
        return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x})

    def _abs_url_from_link(self, link_tag: Tag) -> str:
        # Resolve a possibly-relative href against the page's root URL.
        return urljoin(self._url, link_tag.get("href"))

    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
        """Stream *url* into *sink* in 1 KiB chunks, updating the progress bar.

        Raises CrawlError on HTTP 403, which indicates the client is outside
        the KIT network/VPN. Redirects are not followed.
        """
        async with self.session.get(url, allow_redirects=False) as resp:
            if resp.status == 403:
                raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
            if resp.content_length:
                bar.set_total(resp.content_length)

            async for data in resp.content.iter_chunked(1024):
                sink.file.write(data)
                bar.advance(len(data))

            # Mark the download as complete so the sink keeps the file.
            sink.done()

    async def get_page(self) -> BeautifulSoup:
        # Fetch and parse the root page.
        async with self.session.get(self._url) as request:
            return soupify(await request.read())
|
Loading…
Reference in New Issue
Block a user