From 61430c8739b5789596a7b8da085eca3b37b3ec83 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 14:12:19 +0200 Subject: [PATCH] Overhaul config and CLI option names --- CONFIG.md | 138 ++++++++++++--------- PFERD/cli/command_kit_ilias_web.py | 53 ++++---- PFERD/cli/parser.py | 35 ++++-- PFERD/crawl/crawler.py | 40 +++--- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 17 ++- 5 files changed, 154 insertions(+), 129 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 982f4fc..2f2dbbe 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -1,10 +1,11 @@ # Config file format A config file consists of sections. A section begins with a `[section]` header, -which is followed by a list of `key = value` or `key: value` pairs. Comments -must be on their own line and start with `#` or `;`. Multiline values must be -indented beyond their key. For more details and some examples on the format, see -the [configparser documentation][1] ([basic interpolation][2] is enabled). +which is followed by a list of `key = value` pairs. Comments must be on their +own line and start with `#`. Multiline values must be indented beyond their key. +Boolean values can be `yes` or `no`. For more details and some examples on the +format, see the [configparser documentation][1] ([basic interpolation][2] is +enabled). [1]: "Supported INI File Structure" [2]: "BasicInterpolation" @@ -15,21 +16,19 @@ This section contains global configuration values. It can also be used to set default values for the other sections. - `working_dir`: The directory PFERD operates in. Set to an absolute path to - make PFERD operate the same regardless of where it is executed. All other + make PFERD operate the same regardless of where it is executed from. All other paths in the config file are interpreted relative to this path. If this path is relative, it is interpreted relative to the script's working dir. `~` is expanded to the current user's home directory. (Default: `.`) - `explain`: Whether PFERD should log and explain its actions and decisions in detail. (Default: `no`) -- `status`: Whether PFERD should print status updates while crawling. (Default: - `yes`) +- `status`: Whether PFERD should print status updates (like `Crawled ...`, + `Added ...`) while running a crawler. (Default: `yes`) - `report`: Whether PFERD should print a report of added, changed and deleted local files for all crawlers before exiting. (Default: `yes`) -- `share_cookies`: Whether crawlers should share cookies where applicable. By - default, crawlers are isolated and don't interact with each other. This - includes their cookies. However, in situations where multiple crawlers crawl - the same website using the same account, sharing cookies between crawlers can - make sense. (Default: `yes`) +- `share_cookies`: Whether crawlers should share cookies where applicable. For + example, some crawlers share cookies if they crawl the same website using the + same account. (Default: `yes`) ## The `crawl:*` sections @@ -42,17 +41,17 @@ courses or lecture websites. Each crawl section represents an instance of a specific type of crawler. The `type` option is used to specify the crawler type. The crawler's name is usually -used as the name for the output directory. New crawlers can be created simply by -adding a new crawl section to the config file. +used as the output directory. New crawlers can be created simply by adding a new +crawl section to the config file. Depending on a crawler's type, it may have different options. For more details, -see the type's documentation below. 
The following options are common to all -crawlers: +see the type's [documentation](#crawler-types) below. The following options are +common to all crawlers: -- `type`: The types are specified in [this section](#crawler-types). +- `type`: The available types are specified in [this section](#crawler-types). - `output_dir`: The directory the crawler synchronizes files to. A crawler will never place any files outside of this directory. (Default: the crawler's name) -- `redownload`: When to download again a file that is already present locally. +- `redownload`: When to download a file that is already present locally. (Default: `never-smart`) - `never`: If a file is present locally, it is not downloaded again. - `never-smart`: Like `never`, but PFERD tries to detect if an already @@ -62,8 +61,8 @@ crawlers: - `always-smart`: Like `always`, but PFERD tries to avoid unnecessary downloads via some (unreliable) heuristics. - `on_conflict`: What to do when the local and remote versions of a file or - directory differ. Includes the cases where a file is replaced by a directory - or a directory by a file. (Default: `prompt`) + directory differ, including when a file is replaced by a directory or a + directory by a file. (Default: `prompt`) - `prompt`: Always ask the user before overwriting or deleting local files and directories. - `local-first`: Always keep the local file or directory. Equivalent to @@ -75,14 +74,13 @@ crawlers: remote file is different. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) -- `max_concurrent_tasks`: The maximum number of concurrent tasks (such as - crawling or downloading). (Default: 1) -- `max_concurrent_downloads`: How many of those tasks can be download tasks at - the same time. Must not be greater than `max_concurrent_tasks`. When not set, - this is the same as `max_concurrent_tasks`. (Optional) -- `delay_between_tasks`: Time (in seconds) that the crawler should wait between +- `tasks`: The maximum number of concurrent tasks (such as crawling or + downloading). (Default: `1`) +- `downloads`: How many of those tasks can be download tasks at the same time. + Must not be greater than `tasks`. (Default: Same as `tasks`) +- `task_delay`: Time (in seconds) that the crawler should wait between subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary - load for the crawl target. (Default: 0.0) + load for the crawl target. (Default: `0.0`) - `windows_paths`: Whether PFERD should find alternative names for paths that are invalid on Windows. (Default: `yes` on Windows, `no` otherwise) @@ -101,6 +99,8 @@ password = bar [crawl:something] type = some-complex-crawler auth = auth:example +on_conflict = no-delete +tasks = 3 ``` ## The `auth:*` sections @@ -109,12 +109,12 @@ Sections whose names start with `auth:` are used to configure authenticators. An authenticator provides a username and a password to one or more crawlers. Authenticators work similar to crawlers: A section represents an authenticator -instance, whose name is the rest of the section name. The type is specified by +instance whose name is the rest of the section name. The type is specified by the `type` option. Depending on an authenticator's type, it may have different options. For more -details, see the type's documentation below. The only option common to all -authenticators is `type`: +details, see the type's [documentation](#authenticator-types) below. 
The only +option common to all authenticators is `type`: - `type`: The types are specified in [this section](#authenticator-types). @@ -127,28 +127,47 @@ testing different setups. The various delay options are meant to make the crawler simulate a slower, network-based crawler. - `target`: Path to the local directory to crawl. (Required) -- `crawl_delay`: Maximum artificial delay (in seconds) to simulate for crawl - requests. (Default: 0.0) -- `download_delay`: Maximum artificial delay (in seconds) to simulate for - download requests. (Default: 0.0) +- `crawl_delay`: Artificial delay (in seconds) to simulate for crawl requests. + (Default: `0.0`) +- `download_delay`: Artificial delay (in seconds) to simulate for download + requests. (Default: `0.0`) - `download_speed`: Download speed (in bytes per second) to simulate. (Optional) -### The `kit-ilias` crawler +### The `kit-ilias-web` crawler -This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor SCC-Server, so you should be nice and use reasonable delays and concurrent requests. -- `target`: The ILIAS element to crawl. Can be: - - `desktop` if you want to crawl your personal desktop - - `` if you want to crawl the course with the given id - - `` if you want to crawl a given element by URL (preferably the permanent URL linked at the bottom of an ILIAS page) -- `tfa_auth`: Like `auth` but only used for two-factor authentication -- `link_file_redirect_delay`: PFERD will create local HTML for external links. - If this property is set to a non-negative value it configures the amount of seconds after which the local HTML - file will redirect you to the link target. -- `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link - target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional - HTML page instead. -- `videos`: If this is set to false, PFERD will not crawl or download any videos. -- `http_timeout`: The timeout for http requests +This crawler crawls the KIT ILIAS instance. + +ILIAS is not great at handling too many concurrent requests. To avoid +unnecessary load, please limit `tasks` to `1`. + +There is a spike in ILIAS usage at the beginning of lectures, so please don't +run PFERD during those times. + +If you're automatically running PFERD periodically (e. g. via cron or a systemd +timer), please randomize the start time or at least don't use the full hour. For +systemd timers, this can be accomplished using the `RandomizedDelaySec` option. +Also, please schedule the script to run in periods of low activity. Running the +script once per day should be fine. + +- `target`: The ILIAS element to crawl. (Required) + - `desktop`: Crawl your personal desktop + - ``: Crawl the course with the given id + - ``: Crawl a given element by URL (preferably the permanent URL linked + at the bottom of its ILIAS page) +- `auth`: Name of auth section to use for login. (Required) +- `tfa_auth`: Name of auth section to use for two-factor authentication. Only + uses the auth section's password. (Default: Anonymous `tfa` authenticator) +- `links`: How to represent external links. (Default: `fancy`) + - `ignore`: Don't download links. + - `plaintext`: A text file containing only the URL. + - `fancy`: A HTML file looking like the ILIAS link element. + - `internet-shortcut`: An internet shortcut file (`.url` file). +- `link_redirect_delay`: Time (in seconds) until `fancy` link files will + redirect to the actual URL. 
Set to a negative value to disable the automatic + redirect. (Default: `-1`) +- `videos`: Whether to download videos. (Default: `no`) +- `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: + `20.0`) ## Authenticator types @@ -161,21 +180,24 @@ via the terminal. - `username`: The username. (Optional) - `password`: The password. (Optional) +### The `keyring` authenticator + +This authenticator uses the system keyring to store passwords. The username can +be set directly in the config file. If the username is not specified, the user +is prompted via the terminal. If the keyring contains no entry or the entry is +incorrect, the user is prompted for a password via the terminal and the password +is stored in the keyring. + +- `username`: The username. (Optional) +- `keyring_name`: The service name PFERD uses for storing credentials. (Default: + `PFERD`) + ### The `tfa` authenticator This authenticator prompts the user on the console for a two-factor authentication token. The token is provided as password and it is not cached. This authenticator does not support usernames. -### The `keyring` authenticator - -This authenticator uses the system keyring to store passwords. It expects a -username in the config and will prompt *once* for the password. After that it -receives the password from the system keyring. - -- `username`: The username. (Required) -- `keyring_name`: The service name PFERD uses for storing credentials. (Optional) - ## Transformation rules Transformation rules are rules for renaming and excluding files and directories. diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index 8323c5c..ccb7134 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -11,14 +11,14 @@ SUBPARSER = SUBPARSERS.add_parser( ) GROUP = SUBPARSER.add_argument_group( - title="KIT ILIAS web-crawler arguments", + title="kit-ilias-web crawler arguments", description="arguments for the 'kit-ilias-web' crawler", ) GROUP.add_argument( "target", type=str, metavar="TARGET", - help="course id, 'desktop', or ILIAS https-URL to crawl" + help="course id, 'desktop', or ILIAS URL to crawl" ) GROUP.add_argument( "output", @@ -27,14 +27,9 @@ GROUP.add_argument( help="output directory" ) GROUP.add_argument( - "--videos", - action=BooleanOptionalAction, - help="crawl and download videos" -) -GROUP.add_argument( - "--username", + "--username", "-u", type=str, - metavar="USER_NAME", + metavar="USERNAME", help="user name for authentication" ) GROUP.add_argument( @@ -46,19 +41,24 @@ GROUP.add_argument( "--links", type=show_value_error(Links.from_string), metavar="OPTION", - help="how to treat external links" + help="how to represent external links" ) GROUP.add_argument( - "--link-file-redirect-delay", + "--link-redirect-delay", type=int, metavar="SECONDS", - help="delay before external link files redirect you to their target (-1 to disable)" + help="time before 'fancy' links redirect to to their target (-1 to disable)" ) GROUP.add_argument( - "--http-timeout", + "--videos", + action=BooleanOptionalAction, + help="crawl and download videos" +) +GROUP.add_argument( + "--http-timeout", "-t", type=float, metavar="SECONDS", - help="the timeout to use for HTTP requests" + help="timeout for all HTTP requests" ) @@ -66,33 +66,30 @@ def load( args: argparse.Namespace, parser: configparser.ConfigParser, ) -> None: - parser["crawl:kit-ilias-web"] = {} - section = parser["crawl:kit-ilias-web"] + parser["crawl:ilias"] = {} + section = 
parser["crawl:ilias"] load_crawler(args, section) section["type"] = "kit-ilias-web" section["target"] = str(args.target) section["output_dir"] = str(args.output) - section["auth"] = "auth:kit-ilias-web" - if args.link_file_redirect_delay is not None: - section["link_file_redirect_delay"] = str(args.link_file_redirect_delay) + section["auth"] = "auth:ilias" if args.links is not None: section["links"] = str(args.links.value) + if args.link_redirect_delay is not None: + section["link_redirect_delay"] = str(args.link_redirect_delay) if args.videos is not None: - section["videos"] = str(False) + section["videos"] = "yes" if args.videos else "no" if args.http_timeout is not None: section["http_timeout"] = str(args.http_timeout) - parser["auth:kit-ilias-web"] = {} - auth_section = parser["auth:kit-ilias-web"] - + parser["auth:ilias"] = {} + auth_section = parser["auth:ilias"] + auth_section["type"] = "simple" + if args.username is not None: + auth_section["username"] = args.username if args.keyring: auth_section["type"] = "keyring" - else: - auth_section["type"] = "simple" - - if args.username is not None: - auth_section["username"] = str(args.username) SUBPARSER.set_defaults(command=load) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index f26390c..4e3b425 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -77,10 +77,10 @@ CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( description="arguments common to all crawlers", ) CRAWLER_PARSER_GROUP.add_argument( - "--redownload", + "--redownload", "-r", type=show_value_error(Redownload.from_string), metavar="OPTION", - help="when to redownload a file that's already present locally" + help="when to download a file that's already present locally" ) CRAWLER_PARSER_GROUP.add_argument( "--on-conflict", @@ -89,30 +89,35 @@ CRAWLER_PARSER_GROUP.add_argument( help="what to do when local and remote files or directories differ" ) CRAWLER_PARSER_GROUP.add_argument( - "--transform", "-t", + "--transform", "-T", action="append", type=str, metavar="RULE", help="add a single transformation rule. 
Can be specified multiple times" ) CRAWLER_PARSER_GROUP.add_argument( - "--max-concurrent-tasks", + "--tasks", "-n", type=int, metavar="N", help="maximum number of concurrent tasks (crawling, downloading)" ) CRAWLER_PARSER_GROUP.add_argument( - "--max-concurrent-downloads", + "--downloads", "-N", type=int, metavar="N", help="maximum number of tasks that may download data at the same time" ) CRAWLER_PARSER_GROUP.add_argument( - "--delay-between-tasks", + "--task-delay", "-d", type=float, metavar="SECONDS", help="time the crawler should wait between subsequent tasks" ) +CRAWLER_PARSER_GROUP.add_argument( + "--windows-paths", + action=BooleanOptionalAction, + help="whether to repair invalid paths on windows" +) def load_crawler( @@ -125,12 +130,14 @@ def load_crawler( section["on_conflict"] = args.on_conflict.value if args.transform is not None: section["transform"] = "\n" + "\n".join(args.transform) - if args.max_concurrent_tasks is not None: - section["max_concurrent_tasks"] = str(args.max_concurrent_tasks) - if args.max_concurrent_downloads is not None: - section["max_concurrent_downloads"] = str(args.max_concurrent_downloads) - if args.delay_between_tasks is not None: - section["delay_between_tasks"] = str(args.delay_between_tasks) + if args.tasks is not None: + section["tasks"] = str(args.tasks) + if args.downloads is not None: + section["downloads"] = str(args.downloads) + if args.task_delay is not None: + section["task_delay"] = str(args.task_delay) + if args.windows_paths is not None: + section["windows_paths"] = "yes" if args.windows_paths else "no" PARSER = argparse.ArgumentParser() @@ -200,6 +207,10 @@ def load_default_section( section["working_dir"] = str(args.working_dir) if args.explain is not None: section["explain"] = "yes" if args.explain else "no" + if args.status is not None: + section["status"] = "yes" if args.status else "no" + if args.report is not None: + section["report"] = "yes" if args.report else "no" if args.share_cookies is not None: section["share_cookies"] = "yes" if args.share_cookies else "no" diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index 420d088..321daa2 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -169,33 +169,33 @@ class CrawlerSection(Section): def transform(self) -> str: return self.s.get("transform", "") - def max_concurrent_tasks(self) -> int: - value = self.s.getint("max_concurrent_tasks", fallback=1) + def tasks(self) -> int: + value = self.s.getint("tasks", fallback=1) if value <= 0: - self.invalid_value("max_concurrent_tasks", value, - "Must be greater than 0") + self.invalid_value("tasks", value, "Must be greater than 0") return value - def max_concurrent_downloads(self) -> int: - tasks = self.max_concurrent_tasks() - value = self.s.getint("max_concurrent_downloads", fallback=None) + def downloads(self) -> int: + tasks = self.tasks() + value = self.s.getint("downloads", fallback=None) if value is None: return tasks if value <= 0: - self.invalid_value("max_concurrent_downloads", value, - "Must be greater than 0") + self.invalid_value("downloads", value, "Must be greater than 0") if value > tasks: - self.invalid_value("max_concurrent_downloads", value, - "Must not be greater than max_concurrent_tasks") + self.invalid_value("downloads", value, "Must not be greater than tasks") return value - def delay_between_tasks(self) -> float: - value = self.s.getfloat("delay_between_tasks", fallback=0.0) + def task_delay(self) -> float: + value = self.s.getfloat("task_delay", fallback=0.0) if value < 0: - 
self.invalid_value("delay_between_tasks", value, - "Must not be negative") + self.invalid_value("task_delay", value, "Must not be negative") return value + def windows_paths(self) -> bool: + on_windows = os.name == "nt" + return self.s.getboolean("windows_paths", fallback=on_windows) + def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator: value = self.s.get("auth") if value is None: @@ -205,10 +205,6 @@ class CrawlerSection(Section): self.invalid_value("auth", value, "No such auth section exists") return auth - def windows_paths(self) -> bool: - on_windows = os.name == "nt" - return self.s.getboolean("windows_paths", fallback=on_windows) - class Crawler(ABC): def __init__( @@ -230,9 +226,9 @@ class Crawler(ABC): self.error_free = True self._limiter = Limiter( - task_limit=section.max_concurrent_tasks(), - download_limit=section.max_concurrent_downloads(), - task_delay=section.delay_between_tasks(), + task_limit=section.tasks(), + download_limit=section.downloads(), + task_delay=section.task_delay(), ) self._deduplicator = Deduplicator(section.windows_paths()) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index daafc12..40db52c 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -40,18 +40,14 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): self.invalid_value("target", target, "Should be ") def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]: - value = self.s.get("tfa_auth") - if not value: + value: Optional[str] = self.s.get("tfa_auth") + if value is None: return None - - auth = authenticators.get(f"auth:{value}") + auth = authenticators.get(value) if auth is None: - self.invalid_value("auth", value, "No such auth section exists") + self.invalid_value("tfa_auth", value, "No such auth section exists") return auth - def link_file_redirect_delay(self) -> int: - return self.s.getint("link_file_redirect_delay", fallback=-1) - def links(self) -> Links: type_str: Optional[str] = self.s.get("links") @@ -63,6 +59,9 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): except ValueError as e: self.invalid_value("links", type_str, str(e).capitalize()) + def link_redirect_delay(self) -> int: + return self.s.getint("link_redirect_delay", fallback=-1) + def videos(self) -> bool: return self.s.getboolean("videos", fallback=False) @@ -173,7 +172,7 @@ class KitIliasWebCrawler(HttpCrawler): self._base_url = "https://ilias.studium.kit.edu" self._target = section.target() - self._link_file_redirect_delay = section.link_file_redirect_delay() + self._link_file_redirect_delay = section.link_redirect_delay() self._links = section.links() self._videos = section.videos()