2021-05-23 11:57:59 +02:00
|
|
|
import argparse
|
|
|
|
import configparser
|
|
|
|
from pathlib import Path
|
2021-05-23 22:26:41 +02:00
|
|
|
from typing import Any, List, Optional, Sequence, Union
|
2021-05-23 11:57:59 +02:00
|
|
|
|
|
|
|
from ..output_dir import OnConflict, Redownload
|
2021-05-23 21:40:48 +02:00
|
|
|
from ..version import NAME, VERSION
|
2021-05-23 11:57:59 +02:00
|
|
|
|
2021-05-23 22:26:41 +02:00
|
|
|
|
|
|
|
# TODO Replace with argparse version when updating to 3.9?
|
|
|
|
class BooleanOptionalAction(argparse.Action):
    """Argparse action providing a ``--flag`` / ``--no-flag`` switch pair.

    The action is registered with exactly one option string of the form
    ``--flag``; the negated ``--no-flag`` spelling is derived from it.
    Parsing ``--flag`` stores ``True`` in the destination, parsing
    ``--no-flag`` stores ``False``.
    """

    def __init__(
        self,
        option_strings: List[str],
        dest: Any,
        default: Any = None,
        type: Any = None,
        choices: Any = None,
        required: Any = False,
        help: Any = None,
        metavar: Any = None,
    ):
        """Validate the single option string and register both spellings.

        Raises:
            ValueError: if not exactly one option string is given, if it
                does not start with ``--``, or if it already starts with
                ``--no-``.
        """
        if len(option_strings) != 1:
            raise ValueError("There must be exactly one option string")

        # Keep the positive spelling around; format_usage() needs it later.
        (self.name,) = option_strings

        if not self.name.startswith("--"):
            raise ValueError(f"{self.name!r} doesn't start with '--'")
        if self.name.startswith("--no-"):
            raise ValueError(f"{self.name!r} starts with '--no-'")

        # Positive spelling first, then the derived negated spelling.
        negated = "--no-" + self.name[2:]

        super().__init__(
            option_strings=[self.name, negated],
            dest=dest,
            nargs=0,  # the flag itself never consumes a value
            default=default,
            type=type,
            choices=choices,
            required=required,
            help=help,
            metavar=metavar,
        )

    def __call__(
        self,
        parser: argparse.ArgumentParser,
        namespace: argparse.Namespace,
        values: Union[str, Sequence[Any], None],
        option_string: Optional[str] = None,
    ) -> None:
        """Store ``True`` or ``False`` on *namespace* depending on the spelling used."""
        # Ignore calls without an option string or with one we did not register.
        if not option_string or option_string not in self.option_strings:
            return

        setattr(namespace, self.dest, not option_string.startswith("--no-"))

    def format_usage(self) -> str:
        """Return the combined ``--[no-]flag`` spelling for usage output."""
        return f"--[no-]{self.name[2:]}"
|
|
|
|
|
|
|
|
|
2021-05-23 11:57:59 +02:00
|
|
|
# Parser holding the options shared by every crawler. It is created without
# its own -h/--help option (add_help=False).
# NOTE(review): presumably meant to be passed as a `parents=` parser to the
# individual crawler subcommands; confirm against the callers.
CRAWLER_PARSER = argparse.ArgumentParser(add_help=False)
CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group(
    title="general crawler arguments",
    description="arguments common to all crawlers",
)

# None of the options below declares a default, so an option that is not
# given on the command line is left as None in the parsed namespace.

# How existing local files are treated.
CRAWLER_PARSER_GROUP.add_argument(
    "--redownload",
    metavar="OPTION",
    type=Redownload.from_string,
    help="when to redownload a file that's already present locally",
)
CRAWLER_PARSER_GROUP.add_argument(
    "--on-conflict",
    metavar="OPTION",
    type=OnConflict.from_string,
    help="what to do when local and remote files or directories differ",
)

# Transformation rules; every occurrence appends one rule to a list.
CRAWLER_PARSER_GROUP.add_argument(
    "--transform", "-t",
    metavar="RULE",
    type=str,
    action="append",
    help="add a single transformation rule. Can be specified multiple times",
)

# Concurrency limits and pacing.
CRAWLER_PARSER_GROUP.add_argument(
    "--max-concurrent-tasks",
    metavar="N",
    type=int,
    help="maximum number of concurrent tasks (crawling, downloading)",
)
CRAWLER_PARSER_GROUP.add_argument(
    "--max-concurrent-downloads",
    metavar="N",
    type=int,
    help="maximum number of tasks that may download data at the same time",
)
CRAWLER_PARSER_GROUP.add_argument(
    "--delay-between-tasks",
    metavar="SECONDS",
    type=float,
    help="time the crawler should wait between subsequent tasks",
)
|
|
|
|
|
|
|
|
|
|
|
|
def load_crawler(
    args: argparse.Namespace,
    section: configparser.SectionProxy,
) -> None:
    """Copy the general crawler options from *args* into a config section.

    Only options whose parsed value is not ``None`` are written, so existing
    entries in *section* are kept for everything else. The section is
    modified in place.
    """
    # These two values expose the string to store through their `.value`.
    for key in ("redownload", "on_conflict"):
        choice = getattr(args, key)
        if choice is not None:
            section[key] = choice.value

    # Rules are stored one per line, with a leading newline before the first.
    rules = args.transform
    if rules is not None:
        section["transform"] = "\n" + "\n".join(rules)

    # Plain numeric settings are stored via str().
    for key in (
        "max_concurrent_tasks",
        "max_concurrent_downloads",
        "delay_between_tasks",
    ):
        number = getattr(args, key)
        if number is not None:
            section[key] = str(number)
|
|
|
|
|
|
|
|
|
|
|
|
# Top-level command line parser. `command` defaults to None in the parsed
# namespace.
PARSER = argparse.ArgumentParser()
PARSER.set_defaults(command=None)

PARSER.add_argument(
    "--version",
    action="version",
    version=f"{NAME} {VERSION}",
)

# Configuration file handling.
PARSER.add_argument(
    "--config", "-c",
    metavar="PATH",
    type=Path,
    help="custom config file",
)
PARSER.add_argument(
    "--dump-config",
    metavar="PATH",
    nargs="?",
    const=True,  # value used when the flag is given without a PATH
    help=(
        "dump current configuration to a file and exit. "
        "Uses default config file path if no path is specified"
    ),
)

# Selecting which crawlers run and where.
PARSER.add_argument(
    "--crawler", "-C",
    metavar="NAME",
    type=str,
    action="append",
    help=(
        "only execute a single crawler. "
        "Can be specified multiple times to execute multiple crawlers"
    ),
)
PARSER.add_argument(
    "--working-dir",
    metavar="PATH",
    type=Path,
    help="custom working directory",
)

# Output switches; each one also accepts a negated --no-<name> spelling.
PARSER.add_argument(
    "--explain",
    action=BooleanOptionalAction,
    help="log and explain in detail what PFERD is doing",
)
PARSER.add_argument(
    "--status",
    action=BooleanOptionalAction,
    help="print status updates while PFERD is crawling",
)
PARSER.add_argument(
    "--report",
    action=BooleanOptionalAction,
    help="print a report of all local changes before exiting",
)
|
2021-05-23 11:57:59 +02:00
|
|
|
|
|
|
|
|
|
|
|
def load_default_section(
    args: argparse.Namespace,
    parser: configparser.ConfigParser,
) -> None:
    """Write command line overrides into the config's default section.

    ``working_dir`` is stored as a string and ``explain`` as ``"true"`` or
    ``"false"``. An option whose parsed value is ``None`` is not written.
    The config parser is modified in place.

    NOTE(review): ``args.status`` and ``args.report`` are not written here;
    presumably they are consumed elsewhere. Confirm.
    """
    defaults = parser[parser.default_section]

    working_dir = args.working_dir
    if working_dir is not None:
        defaults["working_dir"] = str(working_dir)

    explain = args.explain
    if explain is not None:
        # Config values are strings, so spell the boolean out.
        if explain:
            defaults["explain"] = "true"
        else:
            defaults["explain"] = "false"
|
|
|
|
|
|
|
|
|
|
|
|
# Subparser group of the main parser, shown under the title "crawlers".
# NOTE(review): presumably the individual crawler subcommands are attached to
# this object elsewhere; confirm.
SUBPARSERS = PARSER.add_subparsers(title="crawlers")
|