mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Add regex option to config and CLI parser
This commit is contained in:
parent
88afe64a92
commit
13b8c3d9c6
@ -138,11 +138,16 @@ crawler simulate a slower, network-based crawler.
|
|||||||
|
|
||||||
### The `kit-ipd` crawler
|
### The `kit-ipd` crawler
|
||||||
|
|
||||||
This crawler crawls a KIT ipd page by url. The root page can be crawled from
|
This crawler crawls a KIT-IPD page by URL. The root page can be crawled from
|
||||||
outside the KIT network so you will be informed about any new/deleted files,
|
outside the KIT network so you will be informed about any new/deleted files,
|
||||||
but downloading files requires you to be within. Adding a short delay between
|
but downloading files requires you to be within. Adding a short delay between
|
||||||
requests is likely a good idea.
|
requests is likely a good idea.
|
||||||
|
|
||||||
|
- `target`: URL to a KIT-IPD page
|
||||||
|
- `link_regex`: A regex that is matched against the `href` part of links. If it
|
||||||
|
matches, the given link is downloaded as a file. This is used to extract
|
||||||
|
files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|java)$`)
|
||||||
|
|
||||||
### The `kit-ilias-web` crawler
|
### The `kit-ilias-web` crawler
|
||||||
|
|
||||||
This crawler crawls the KIT ILIAS instance.
|
This crawler crawls the KIT ILIAS instance.
|
||||||
|
3
LICENSE
3
LICENSE
@ -1,4 +1,5 @@
|
|||||||
Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim, thelukasprobst
|
Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw,
|
||||||
|
TheChristophe, Scriptim, thelukasprobst, Toorero
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
this software and associated documentation files (the "Software"), to deal in
|
this software and associated documentation files (the "Software"), to deal in
|
||||||
|
@ -14,6 +14,12 @@ GROUP = SUBPARSER.add_argument_group(
|
|||||||
title="kit ipd crawler arguments",
|
title="kit ipd crawler arguments",
|
||||||
description="arguments for the 'kit-ipd' crawler",
|
description="arguments for the 'kit-ipd' crawler",
|
||||||
)
|
)
|
||||||
|
GROUP.add_argument(
|
||||||
|
"--link-regex",
|
||||||
|
type=str,
|
||||||
|
metavar="REGEX",
|
||||||
|
help="href-matching regex to identify downloadable files"
|
||||||
|
)
|
||||||
GROUP.add_argument(
|
GROUP.add_argument(
|
||||||
"target",
|
"target",
|
||||||
type=str,
|
type=str,
|
||||||
@ -41,6 +47,8 @@ def load(
|
|||||||
section["type"] = "kit-ipd"
|
section["type"] = "kit-ipd"
|
||||||
section["target"] = str(args.target)
|
section["target"] = str(args.target)
|
||||||
section["output_dir"] = str(args.output)
|
section["output_dir"] = str(args.output)
|
||||||
|
if args.link_regex:
|
||||||
|
section["link_regex"] = str(args.link_regex)
|
||||||
|
|
||||||
|
|
||||||
SUBPARSER.set_defaults(command=load)
|
SUBPARSER.set_defaults(command=load)
|
||||||
|
Loading…
Reference in New Issue
Block a user