Mirror of https://github.com/Garmelon/PFERD.git (synced 2023-12-21 10:23:01 +01:00)

Overhaul config and CLI option names

commit 61430c8739 (parent eb8b915813)

CONFIG.md (138 changed lines)
@@ -1,10 +1,11 @@
 # Config file format
 
 A config file consists of sections. A section begins with a `[section]` header,
-which is followed by a list of `key = value` or `key: value` pairs. Comments
-must be on their own line and start with `#` or `;`. Multiline values must be
-indented beyond their key. For more details and some examples on the format, see
-the [configparser documentation][1] ([basic interpolation][2] is enabled).
+which is followed by a list of `key = value` pairs. Comments must be on their
+own line and start with `#`. Multiline values must be indented beyond their key.
+Boolean values can be `yes` or `no`. For more details and some examples on the
+format, see the [configparser documentation][1] ([basic interpolation][2] is
+enabled).
 
 [1]: <https://docs.python.org/3/library/configparser.html#supported-ini-file-structure> "Supported INI File Structure"
 [2]: <https://docs.python.org/3/library/configparser.html#configparser.BasicInterpolation> "BasicInterpolation"
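To illustrate the format rules in the new text above, here is a minimal sketch of a config file. The section and key names are invented; only the syntax (sections, `key = value` pairs, `#` comments on their own line, indented multiline values, `yes`/`no` booleans, basic interpolation) is what matters:

```
# A comment on its own line
[section]
key = value
multiline = first line
    second line, indented beyond its key
enabled = yes
# Basic interpolation substitutes other values from the same section
derived = %(key)s-suffix
```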
@@ -15,21 +16,19 @@ This section contains global configuration values. It can also be used to set
 default values for the other sections.
 
 - `working_dir`: The directory PFERD operates in. Set to an absolute path to
-  make PFERD operate the same regardless of where it is executed. All other
+  make PFERD operate the same regardless of where it is executed from. All other
   paths in the config file are interpreted relative to this path. If this path
   is relative, it is interpreted relative to the script's working dir. `~` is
   expanded to the current user's home directory. (Default: `.`)
 - `explain`: Whether PFERD should log and explain its actions and decisions in
   detail. (Default: `no`)
-- `status`: Whether PFERD should print status updates while crawling. (Default:
-  `yes`)
+- `status`: Whether PFERD should print status updates (like `Crawled ...`,
+  `Added ...`) while running a crawler. (Default: `yes`)
 - `report`: Whether PFERD should print a report of added, changed and deleted
   local files for all crawlers before exiting. (Default: `yes`)
-- `share_cookies`: Whether crawlers should share cookies where applicable. By
-  default, crawlers are isolated and don't interact with each other. This
-  includes their cookies. However, in situations where multiple crawlers crawl
-  the same website using the same account, sharing cookies between crawlers can
-  make sense. (Default: `yes`)
+- `share_cookies`: Whether crawlers should share cookies where applicable. For
+  example, some crawlers share cookies if they crawl the same website using the
+  same account. (Default: `yes`)
 
 ## The `crawl:*` sections
 
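A short, hypothetical example of the global options above (the section header is not visible in this hunk; it is assumed here to be configparser's `DEFAULT` section, and the directory is made up):

```
[DEFAULT]
working_dir = ~/sync
explain = no
status = yes
report = yes
share_cookies = yes
```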
@@ -42,17 +41,17 @@ courses or lecture websites.
 
 Each crawl section represents an instance of a specific type of crawler. The
 `type` option is used to specify the crawler type. The crawler's name is usually
-used as the name for the output directory. New crawlers can be created simply by
-adding a new crawl section to the config file.
+used as the output directory. New crawlers can be created simply by adding a new
+crawl section to the config file.
 
 Depending on a crawler's type, it may have different options. For more details,
-see the type's documentation below. The following options are common to all
-crawlers:
+see the type's [documentation](#crawler-types) below. The following options are
+common to all crawlers:
 
-- `type`: The types are specified in [this section](#crawler-types).
+- `type`: The available types are specified in [this section](#crawler-types).
 - `output_dir`: The directory the crawler synchronizes files to. A crawler will
   never place any files outside of this directory. (Default: the crawler's name)
-- `redownload`: When to download again a file that is already present locally.
+- `redownload`: When to download a file that is already present locally.
   (Default: `never-smart`)
     - `never`: If a file is present locally, it is not downloaded again.
     - `never-smart`: Like `never`, but PFERD tries to detect if an already
@@ -62,8 +61,8 @@ crawlers:
     - `always-smart`: Like `always`, but PFERD tries to avoid unnecessary
       downloads via some (unreliable) heuristics.
 - `on_conflict`: What to do when the local and remote versions of a file or
-  directory differ. Includes the cases where a file is replaced by a directory
-  or a directory by a file. (Default: `prompt`)
+  directory differ, including when a file is replaced by a directory or a
+  directory by a file. (Default: `prompt`)
     - `prompt`: Always ask the user before overwriting or deleting local files
       and directories.
     - `local-first`: Always keep the local file or directory. Equivalent to
@@ -75,14 +74,13 @@ crawlers:
       remote file is different.
 - `transform`: Rules for renaming and excluding certain files and directories.
   For more details, see [this section](#transformation-rules). (Default: empty)
-- `max_concurrent_tasks`: The maximum number of concurrent tasks (such as
-  crawling or downloading). (Default: 1)
-- `max_concurrent_downloads`: How many of those tasks can be download tasks at
-  the same time. Must not be greater than `max_concurrent_tasks`. When not set,
-  this is the same as `max_concurrent_tasks`. (Optional)
-- `delay_between_tasks`: Time (in seconds) that the crawler should wait between
+- `tasks`: The maximum number of concurrent tasks (such as crawling or
+  downloading). (Default: `1`)
+- `downloads`: How many of those tasks can be download tasks at the same time.
+  Must not be greater than `tasks`. (Default: Same as `tasks`)
+- `task_delay`: Time (in seconds) that the crawler should wait between
   subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary
-  load for the crawl target. (Default: 0.0)
+  load for the crawl target. (Default: `0.0`)
 - `windows_paths`: Whether PFERD should find alternative names for paths that
   are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
 
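Putting the renamed common options together, a crawl section might look like the following sketch (crawler name, type and values are invented; the option names are the ones documented above):

```
[crawl:example]
type = some-crawler-type
output_dir = example
redownload = never-smart
on_conflict = prompt
tasks = 2
downloads = 1
task_delay = 0.5
windows_paths = no
```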
@@ -101,6 +99,8 @@ password = bar
 [crawl:something]
 type = some-complex-crawler
 auth = auth:example
+on_conflict = no-delete
+tasks = 3
 ```
 
 ## The `auth:*` sections
@@ -109,12 +109,12 @@ Sections whose names start with `auth:` are used to configure authenticators. An
 authenticator provides a username and a password to one or more crawlers.
 
 Authenticators work similar to crawlers: A section represents an authenticator
-instance, whose name is the rest of the section name. The type is specified by
+instance whose name is the rest of the section name. The type is specified by
 the `type` option.
 
 Depending on an authenticator's type, it may have different options. For more
-details, see the type's documentation below. The only option common to all
-authenticators is `type`:
+details, see the type's [documentation](#authenticator-types) below. The only
+option common to all authenticators is `type`:
 
 - `type`: The types are specified in [this section](#authenticator-types).
 
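To make the crawler/authenticator relationship concrete, here is a hedged sketch in which one authenticator serves two crawlers. All names are invented; `simple` is the authenticator type documented further below, and since no password is given it would be prompted for via the terminal:

```
[auth:uni]
type = simple
username = jdoe

[crawl:lectures]
type = some-crawler-type
auth = auth:uni

[crawl:seminars]
type = some-crawler-type
auth = auth:uni
```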
@@ -127,28 +127,47 @@ testing different setups. The various delay options are meant to make the
 crawler simulate a slower, network-based crawler.
 
 - `target`: Path to the local directory to crawl. (Required)
-- `crawl_delay`: Maximum artificial delay (in seconds) to simulate for crawl
-  requests. (Default: 0.0)
-- `download_delay`: Maximum artificial delay (in seconds) to simulate for
-  download requests. (Default: 0.0)
+- `crawl_delay`: Artificial delay (in seconds) to simulate for crawl requests.
+  (Default: `0.0`)
+- `download_delay`: Artificial delay (in seconds) to simulate for download
+  requests. (Default: `0.0`)
 - `download_speed`: Download speed (in bytes per second) to simulate. (Optional)
 
-### The `kit-ilias` crawler
+### The `kit-ilias-web` crawler
 
-This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor SCC-Server, so you should be nice and use reasonable delays and concurrent requests.
-- `target`: The ILIAS element to crawl. Can be:
-  - `desktop` if you want to crawl your personal desktop
-  - `<course id>` if you want to crawl the course with the given id
-  - `<url>` if you want to crawl a given element by URL (preferably the permanent URL linked at the bottom of an ILIAS page)
-- `tfa_auth`: Like `auth` but only used for two-factor authentication
-- `link_file_redirect_delay`: PFERD will create local HTML for external links.
-  If this property is set to a non-negative value it configures the amount of seconds after which the local HTML
-  file will redirect you to the link target.
-- `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link
-  target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional
-  HTML page instead.
-- `videos`: If this is set to false, PFERD will not crawl or download any videos.
-- `http_timeout`: The timeout for http requests
+This crawler crawls the KIT ILIAS instance.
+
+ILIAS is not great at handling too many concurrent requests. To avoid
+unnecessary load, please limit `tasks` to `1`.
+
+There is a spike in ILIAS usage at the beginning of lectures, so please don't
+run PFERD during those times.
+
+If you're automatically running PFERD periodically (e. g. via cron or a systemd
+timer), please randomize the start time or at least don't use the full hour. For
+systemd timers, this can be accomplished using the `RandomizedDelaySec` option.
+Also, please schedule the script to run in periods of low activity. Running the
+script once per day should be fine.
+
+- `target`: The ILIAS element to crawl. (Required)
+    - `desktop`: Crawl your personal desktop
+    - `<course id>`: Crawl the course with the given id
+    - `<url>`: Crawl a given element by URL (preferably the permanent URL linked
+      at the bottom of its ILIAS page)
+- `auth`: Name of auth section to use for login. (Required)
+- `tfa_auth`: Name of auth section to use for two-factor authentication. Only
+  uses the auth section's password. (Default: Anonymous `tfa` authenticator)
+- `links`: How to represent external links. (Default: `fancy`)
+    - `ignore`: Don't download links.
+    - `plaintext`: A text file containing only the URL.
+    - `fancy`: A HTML file looking like the ILIAS link element.
+    - `internet-shortcut`: An internet shortcut file (`.url` file).
+- `link_redirect_delay`: Time (in seconds) until `fancy` link files will
+  redirect to the actual URL. Set to a negative value to disable the automatic
+  redirect. (Default: `-1`)
+- `videos`: Whether to download videos. (Default: `no`)
+- `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
+  `20.0`)
 
 ## Authenticator types
 
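A hypothetical `kit-ilias-web` section using the options from the rewritten text above (the target, output directory and auth section name are placeholders; `tasks = 1` follows the advice about concurrent requests):

```
[crawl:ilias]
type = kit-ilias-web
target = desktop
output_dir = ILIAS
auth = auth:ilias
links = fancy
link_redirect_delay = 10
videos = no
http_timeout = 20.0
tasks = 1
```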
@@ -161,21 +180,24 @@ via the terminal.
 - `username`: The username. (Optional)
 - `password`: The password. (Optional)
 
+### The `keyring` authenticator
+
+This authenticator uses the system keyring to store passwords. The username can
+be set directly in the config file. If the username is not specified, the user
+is prompted via the terminal. If the keyring contains no entry or the entry is
+incorrect, the user is prompted for a password via the terminal and the password
+is stored in the keyring.
+
+- `username`: The username. (Optional)
+- `keyring_name`: The service name PFERD uses for storing credentials. (Default:
+  `PFERD`)
+
 ### The `tfa` authenticator
 
 This authenticator prompts the user on the console for a two-factor
 authentication token. The token is provided as password and it is not cached.
 This authenticator does not support usernames.
 
-### The `keyring` authenticator
-
-This authenticator uses the system keyring to store passwords. It expects a
-username in the config and will prompt *once* for the password. After that it
-receives the password from the system keyring.
-
-- `username`: The username. (Required)
-- `keyring_name`: The service name PFERD uses for storing credentials. (Optional)
-
 ## Transformation rules
 
 Transformation rules are rules for renaming and excluding files and directories.
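For the new `keyring` authenticator described above, a minimal sketch (section name and username are made up; `keyring_name` is shown only to illustrate the default):

```
[auth:ilias]
type = keyring
username = jdoe
# Optional, defaults to PFERD
keyring_name = PFERD
```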
|
@ -11,14 +11,14 @@ SUBPARSER = SUBPARSERS.add_parser(
|
|||||||
)
|
)
|
||||||
|
|
||||||
GROUP = SUBPARSER.add_argument_group(
|
GROUP = SUBPARSER.add_argument_group(
|
||||||
title="KIT ILIAS web-crawler arguments",
|
title="kit-ilias-web crawler arguments",
|
||||||
description="arguments for the 'kit-ilias-web' crawler",
|
description="arguments for the 'kit-ilias-web' crawler",
|
||||||
)
|
)
|
||||||
GROUP.add_argument(
|
GROUP.add_argument(
|
||||||
"target",
|
"target",
|
||||||
type=str,
|
type=str,
|
||||||
metavar="TARGET",
|
metavar="TARGET",
|
||||||
help="course id, 'desktop', or ILIAS https-URL to crawl"
|
help="course id, 'desktop', or ILIAS URL to crawl"
|
||||||
)
|
)
|
||||||
GROUP.add_argument(
|
GROUP.add_argument(
|
||||||
"output",
|
"output",
|
||||||
@@ -27,14 +27,9 @@ GROUP.add_argument(
     help="output directory"
 )
 GROUP.add_argument(
-    "--videos",
-    action=BooleanOptionalAction,
-    help="crawl and download videos"
-)
-GROUP.add_argument(
-    "--username",
+    "--username", "-u",
     type=str,
-    metavar="USER_NAME",
+    metavar="USERNAME",
     help="user name for authentication"
 )
 GROUP.add_argument(
@@ -46,19 +41,24 @@ GROUP.add_argument(
     "--links",
     type=show_value_error(Links.from_string),
     metavar="OPTION",
-    help="how to treat external links"
+    help="how to represent external links"
 )
 GROUP.add_argument(
-    "--link-file-redirect-delay",
+    "--link-redirect-delay",
     type=int,
     metavar="SECONDS",
-    help="delay before external link files redirect you to their target (-1 to disable)"
+    help="time before 'fancy' links redirect to their target (-1 to disable)"
 )
 GROUP.add_argument(
-    "--http-timeout",
+    "--videos",
+    action=BooleanOptionalAction,
+    help="crawl and download videos"
+)
+GROUP.add_argument(
+    "--http-timeout", "-t",
     type=float,
     metavar="SECONDS",
-    help="the timeout to use for HTTP requests"
+    help="timeout for all HTTP requests"
 )
 
 
@@ -66,33 +66,30 @@ def load(
         args: argparse.Namespace,
         parser: configparser.ConfigParser,
 ) -> None:
-    parser["crawl:kit-ilias-web"] = {}
-    section = parser["crawl:kit-ilias-web"]
+    parser["crawl:ilias"] = {}
+    section = parser["crawl:ilias"]
     load_crawler(args, section)
 
     section["type"] = "kit-ilias-web"
     section["target"] = str(args.target)
     section["output_dir"] = str(args.output)
-    section["auth"] = "auth:kit-ilias-web"
-    if args.link_file_redirect_delay is not None:
-        section["link_file_redirect_delay"] = str(args.link_file_redirect_delay)
+    section["auth"] = "auth:ilias"
     if args.links is not None:
         section["links"] = str(args.links.value)
+    if args.link_redirect_delay is not None:
+        section["link_redirect_delay"] = str(args.link_redirect_delay)
     if args.videos is not None:
-        section["videos"] = str(False)
+        section["videos"] = "yes" if args.videos else "no"
     if args.http_timeout is not None:
         section["http_timeout"] = str(args.http_timeout)
 
-    parser["auth:kit-ilias-web"] = {}
-    auth_section = parser["auth:kit-ilias-web"]
+    parser["auth:ilias"] = {}
+    auth_section = parser["auth:ilias"]
+    auth_section["type"] = "simple"
+    if args.username is not None:
+        auth_section["username"] = args.username
     if args.keyring:
         auth_section["type"] = "keyring"
-    else:
-        auth_section["type"] = "simple"
-
-    if args.username is not None:
-        auth_section["username"] = str(args.username)
 
 
 SUBPARSER.set_defaults(command=load)
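Reading the rewritten `load()` above: the command-line mode builds an in-memory config roughly equivalent to the following sketch, assuming a course id target, `--username`, and no `--keyring` (target, output directory and username are placeholders):

```
[crawl:ilias]
type = kit-ilias-web
target = 1234567
output_dir = Foo
auth = auth:ilias

[auth:ilias]
type = simple
username = jdoe
```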
@@ -77,10 +77,10 @@ CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group(
     description="arguments common to all crawlers",
 )
 CRAWLER_PARSER_GROUP.add_argument(
-    "--redownload",
+    "--redownload", "-r",
     type=show_value_error(Redownload.from_string),
     metavar="OPTION",
-    help="when to redownload a file that's already present locally"
+    help="when to download a file that's already present locally"
 )
 CRAWLER_PARSER_GROUP.add_argument(
     "--on-conflict",
@@ -89,30 +89,35 @@ CRAWLER_PARSER_GROUP.add_argument(
     help="what to do when local and remote files or directories differ"
 )
 CRAWLER_PARSER_GROUP.add_argument(
-    "--transform", "-t",
+    "--transform", "-T",
     action="append",
     type=str,
     metavar="RULE",
     help="add a single transformation rule. Can be specified multiple times"
 )
 CRAWLER_PARSER_GROUP.add_argument(
-    "--max-concurrent-tasks",
+    "--tasks", "-n",
     type=int,
     metavar="N",
     help="maximum number of concurrent tasks (crawling, downloading)"
 )
 CRAWLER_PARSER_GROUP.add_argument(
-    "--max-concurrent-downloads",
+    "--downloads", "-N",
     type=int,
     metavar="N",
     help="maximum number of tasks that may download data at the same time"
 )
 CRAWLER_PARSER_GROUP.add_argument(
-    "--delay-between-tasks",
+    "--task-delay", "-d",
     type=float,
     metavar="SECONDS",
     help="time the crawler should wait between subsequent tasks"
 )
+CRAWLER_PARSER_GROUP.add_argument(
+    "--windows-paths",
+    action=BooleanOptionalAction,
+    help="whether to repair invalid paths on windows"
+)
 
 
 def load_crawler(
@@ -125,12 +130,14 @@ def load_crawler(
         section["on_conflict"] = args.on_conflict.value
     if args.transform is not None:
         section["transform"] = "\n" + "\n".join(args.transform)
-    if args.max_concurrent_tasks is not None:
-        section["max_concurrent_tasks"] = str(args.max_concurrent_tasks)
-    if args.max_concurrent_downloads is not None:
-        section["max_concurrent_downloads"] = str(args.max_concurrent_downloads)
-    if args.delay_between_tasks is not None:
-        section["delay_between_tasks"] = str(args.delay_between_tasks)
+    if args.tasks is not None:
+        section["tasks"] = str(args.tasks)
+    if args.downloads is not None:
+        section["downloads"] = str(args.downloads)
+    if args.task_delay is not None:
+        section["task_delay"] = str(args.task_delay)
+    if args.windows_paths is not None:
+        section["windows_paths"] = "yes" if args.windows_paths else "no"
 
 
 PARSER = argparse.ArgumentParser()
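The `load_crawler()` changes map the renamed CLI flags one-to-one onto the renamed config keys. For example, a hypothetical invocation with `--tasks 4 --downloads 2 --task-delay 1.5 --windows-paths` would end up writing roughly these keys into the crawler's section:

```
tasks = 4
downloads = 2
task_delay = 1.5
windows_paths = yes
```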
@@ -200,6 +207,10 @@ def load_default_section(
         section["working_dir"] = str(args.working_dir)
     if args.explain is not None:
         section["explain"] = "yes" if args.explain else "no"
+    if args.status is not None:
+        section["status"] = "yes" if args.status else "no"
+    if args.report is not None:
+        section["report"] = "yes" if args.report else "no"
     if args.share_cookies is not None:
         section["share_cookies"] = "yes" if args.share_cookies else "no"
 
@@ -169,33 +169,33 @@ class CrawlerSection(Section):
     def transform(self) -> str:
         return self.s.get("transform", "")
 
-    def max_concurrent_tasks(self) -> int:
-        value = self.s.getint("max_concurrent_tasks", fallback=1)
+    def tasks(self) -> int:
+        value = self.s.getint("tasks", fallback=1)
         if value <= 0:
-            self.invalid_value("max_concurrent_tasks", value,
-                               "Must be greater than 0")
+            self.invalid_value("tasks", value, "Must be greater than 0")
         return value
 
-    def max_concurrent_downloads(self) -> int:
-        tasks = self.max_concurrent_tasks()
-        value = self.s.getint("max_concurrent_downloads", fallback=None)
+    def downloads(self) -> int:
+        tasks = self.tasks()
+        value = self.s.getint("downloads", fallback=None)
         if value is None:
             return tasks
         if value <= 0:
-            self.invalid_value("max_concurrent_downloads", value,
-                               "Must be greater than 0")
+            self.invalid_value("downloads", value, "Must be greater than 0")
         if value > tasks:
-            self.invalid_value("max_concurrent_downloads", value,
-                               "Must not be greater than max_concurrent_tasks")
+            self.invalid_value("downloads", value, "Must not be greater than tasks")
         return value
 
-    def delay_between_tasks(self) -> float:
-        value = self.s.getfloat("delay_between_tasks", fallback=0.0)
+    def task_delay(self) -> float:
+        value = self.s.getfloat("task_delay", fallback=0.0)
         if value < 0:
-            self.invalid_value("delay_between_tasks", value,
-                               "Must not be negative")
+            self.invalid_value("task_delay", value, "Must not be negative")
         return value
 
+    def windows_paths(self) -> bool:
+        on_windows = os.name == "nt"
+        return self.s.getboolean("windows_paths", fallback=on_windows)
+
     def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
         value = self.s.get("auth")
         if value is None:
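The renamed `CrawlerSection` getters keep the old validation: `tasks` and `downloads` must be greater than 0, `downloads` must not exceed `tasks`, and `task_delay` must not be negative. For instance, this hypothetical section would be rejected with "Must not be greater than tasks":

```
[crawl:example]
tasks = 2
downloads = 5
```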
@@ -205,10 +205,6 @@ class CrawlerSection(Section):
             self.invalid_value("auth", value, "No such auth section exists")
         return auth
 
-    def windows_paths(self) -> bool:
-        on_windows = os.name == "nt"
-        return self.s.getboolean("windows_paths", fallback=on_windows)
-
 
 class Crawler(ABC):
     def __init__(
@@ -230,9 +226,9 @@ class Crawler(ABC):
         self.error_free = True
 
         self._limiter = Limiter(
-            task_limit=section.max_concurrent_tasks(),
-            download_limit=section.max_concurrent_downloads(),
-            task_delay=section.delay_between_tasks(),
+            task_limit=section.tasks(),
+            download_limit=section.downloads(),
+            task_delay=section.task_delay(),
         )
 
         self._deduplicator = Deduplicator(section.windows_paths())
@@ -40,18 +40,14 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
         self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")
 
     def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
-        value = self.s.get("tfa_auth")
-        if not value:
+        value: Optional[str] = self.s.get("tfa_auth")
+        if value is None:
             return None
-
-        auth = authenticators.get(f"auth:{value}")
+        auth = authenticators.get(value)
         if auth is None:
-            self.invalid_value("auth", value, "No such auth section exists")
+            self.invalid_value("tfa_auth", value, "No such auth section exists")
         return auth
 
-    def link_file_redirect_delay(self) -> int:
-        return self.s.getint("link_file_redirect_delay", fallback=-1)
-
     def links(self) -> Links:
         type_str: Optional[str] = self.s.get("links")
 
@@ -63,6 +59,9 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
         except ValueError as e:
             self.invalid_value("links", type_str, str(e).capitalize())
 
+    def link_redirect_delay(self) -> int:
+        return self.s.getint("link_redirect_delay", fallback=-1)
+
     def videos(self) -> bool:
         return self.s.getboolean("videos", fallback=False)
 
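Note the changed lookup in `tfa_auth()`: the configured value is now passed to `authenticators.get()` as-is instead of being prefixed with `auth:`, so it appears to name the full auth section, matching the `auth` option. A hedged sketch under that assumption (all section names invented):

```
[crawl:ilias]
type = kit-ilias-web
target = desktop
auth = auth:ilias
tfa_auth = auth:ilias-tfa
link_redirect_delay = 10
videos = no

[auth:ilias-tfa]
type = tfa
```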
@@ -173,7 +172,7 @@ class KitIliasWebCrawler(HttpCrawler):
         self._base_url = "https://ilias.studium.kit.edu"
 
         self._target = section.target()
-        self._link_file_redirect_delay = section.link_file_redirect_delay()
+        self._link_file_redirect_delay = section.link_redirect_delay()
         self._links = section.links()
         self._videos = section.videos()
 