From 13b8c3d9c6c59ab2714e2670506d89c5a2cb6eb6 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 2 Nov 2021 09:30:46 +0100 Subject: [PATCH] Add regex option to config and CLI parser --- CONFIG.md | 7 ++++++- LICENSE | 3 ++- PFERD/cli/command_kit_ipd.py | 8 ++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 8ccaa50..569780d 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -138,11 +138,16 @@ crawler simulate a slower, network-based crawler. ### The `kit-ipd` crawler -This crawler crawls a KIT ipd page by url. The root page can be crawled from +This crawler crawls a KIT-IPD page by url. The root page can be crawled from outside the KIT network so you will be informed about any new/deleted files, but downloading files requires you to be within. Adding a short delay between requests is likely a good idea. +- `target`: URL to a KIT-IPD page +- `link_regex`: A regex that is matched against the `href` part of links. If it + matches, the given link is downloaded as a file. This is used to extract + files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|java)$`) + ### The `kit-ilias-web` crawler This crawler crawls the KIT ILIAS instance. 
diff --git a/LICENSE b/LICENSE index c096c4a..fe2293f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,5 @@ -Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim, thelukasprobst +Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, + TheChristophe, Scriptim, thelukasprobst, Toorero Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/cli/command_kit_ipd.py b/PFERD/cli/command_kit_ipd.py index c4c593f..b53e67e 100644 --- a/PFERD/cli/command_kit_ipd.py +++ b/PFERD/cli/command_kit_ipd.py @@ -14,6 +14,12 @@ GROUP = SUBPARSER.add_argument_group( title="kit ipd crawler arguments", description="arguments for the 'kit-ipd' crawler", ) +GROUP.add_argument( + "--link-regex", + type=str, + metavar="REGEX", + help="href-matching regex to identify downloadable files" +) GROUP.add_argument( "target", type=str, @@ -41,6 +47,8 @@ def load( section["type"] = "kit-ipd" section["target"] = str(args.target) section["output_dir"] = str(args.output) + if args.link_regex: + section["link_regex"] = str(args.link_regex) SUBPARSER.set_defaults(command=load)