import argparse
import asyncio
import configparser
from pathlib import Path

from .config import Config, ConfigDumpException, ConfigLoadException
from .logging import log
from .output_dir import OnConflict, Redownload
from .pferd import Pferd
from .version import NAME, VERSION

# Arguments that are available no matter which crawler (if any) is selected
GENERAL_PARSER = argparse.ArgumentParser(add_help=False)

GENERAL_PARSER.add_argument(
    "--version",
    action="store_true",
    help="print version and exit"
)
GENERAL_PARSER.add_argument(
    "--config", "-c",
    type=Path,
    metavar="PATH",
    help="custom config file"
)
GENERAL_PARSER.add_argument(
    "--dump-config",
    nargs="?",
    const=True,
    metavar="PATH",
    help="dump current configuration to a file and exit."
    " Uses default config file path if no path is specified"
)
GENERAL_PARSER.add_argument(
    "--crawler",
    action="append",
    type=str,
    metavar="NAME",
    help="only execute a single crawler."
    " Can be specified multiple times to execute multiple crawlers"
)
GENERAL_PARSER.add_argument(
    "--working-dir",
    type=Path,
    metavar="PATH",
    help="custom working directory"
)
GENERAL_PARSER.add_argument(
    "--explain", "-e",
    # TODO Use argparse.BooleanOptionalAction after updating to 3.9
    action="store_const",
    const=True,
    help="log and explain in detail what PFERD is doing"
)


def load_general(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    # Copy the general command line arguments into the config's default section
    section = parser[parser.default_section]

    if args.working_dir is not None:
        section["working_dir"] = str(args.working_dir)
    if args.explain is not None:
        section["explain"] = "true" if args.explain else "false"


# Arguments shared by all crawler subcommands
CRAWLER_PARSER = argparse.ArgumentParser(add_help=False)
CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group(
    title="general crawler arguments",
    description="arguments common to all crawlers",
)
CRAWLER_PARSER_GROUP.add_argument(
    "--redownload",
    type=Redownload.from_string,
    metavar="OPTION",
    help="when to redownload a file that's already present locally"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--on-conflict",
    type=OnConflict.from_string,
    metavar="OPTION",
    help="what to do when local and remote files or directories differ"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--transform", "-t",
    action="append",
    type=str,
    metavar="RULE",
    help="add a single transformation rule."
    " Can be specified multiple times"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--max-concurrent-tasks",
    type=int,
    metavar="N",
    help="maximum number of concurrent tasks (crawling, downloading)"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--max-concurrent-downloads",
    type=int,
    metavar="N",
    help="maximum number of tasks that may download data at the same time"
)
CRAWLER_PARSER_GROUP.add_argument(
    "--delay-between-tasks",
    type=float,
    metavar="SECONDS",
    help="time the crawler should wait between subsequent tasks"
)


def load_crawler(
        args: argparse.Namespace,
        section: configparser.SectionProxy,
) -> None:
    # Copy the common crawler command line arguments into a crawler's config section
    if args.redownload is not None:
        section["redownload"] = args.redownload.value
    if args.on_conflict is not None:
        section["on_conflict"] = args.on_conflict.value
    if args.transform is not None:
        section["transform"] = "\n" + "\n".join(args.transform)
    if args.max_concurrent_tasks is not None:
        section["max_concurrent_tasks"] = str(args.max_concurrent_tasks)
    if args.max_concurrent_downloads is not None:
        section["max_concurrent_downloads"] = str(args.max_concurrent_downloads)
    if args.delay_between_tasks is not None:
        section["delay_between_tasks"] = str(args.delay_between_tasks)


# Main parser; each crawler type gets its own subcommand
PARSER = argparse.ArgumentParser(parents=[GENERAL_PARSER])
PARSER.set_defaults(command=None)
SUBPARSERS = PARSER.add_subparsers(title="crawlers")

LOCAL_CRAWLER = SUBPARSERS.add_parser(
    "local",
    parents=[CRAWLER_PARSER],
)
LOCAL_CRAWLER.set_defaults(command="local")
LOCAL_CRAWLER_GROUP = LOCAL_CRAWLER.add_argument_group(
    title="local crawler arguments",
    description="arguments for the 'local' crawler",
)
LOCAL_CRAWLER_GROUP.add_argument(
    "target",
    type=Path,
    metavar="TARGET",
    help="directory to crawl"
)
LOCAL_CRAWLER_GROUP.add_argument(
    "output",
    type=Path,
    metavar="OUTPUT",
    help="output directory"
)
LOCAL_CRAWLER_GROUP.add_argument(
    "--crawl-delay",
    type=float,
    metavar="SECONDS",
    help="artificial delay to simulate for crawl requests"
)
LOCAL_CRAWLER_GROUP.add_argument(
    "--download-delay",
    type=float,
    metavar="SECONDS",
    help="artificial delay to simulate for download requests"
)
LOCAL_CRAWLER_GROUP.add_argument(
    "--download-speed",
    type=int,
    metavar="BYTES_PER_SECOND",
    help="download speed to simulate"
)


def load_local_crawler(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    # Build the config section for the 'local' crawler from command line arguments
    parser["crawl:local"] = {}
    section = parser["crawl:local"]
    load_crawler(args, section)

    section["type"] = "local"
    section["target"] = str(args.target)
    section["output_dir"] = str(args.output)
    if args.crawl_delay is not None:
        section["crawl_delay"] = str(args.crawl_delay)
    if args.download_delay is not None:
        section["download_delay"] = str(args.download_delay)
    if args.download_speed is not None:
        section["download_speed"] = str(args.download_speed)


def load_parser(
        args: argparse.Namespace,
) -> configparser.ConfigParser:
    log.explain_topic("Loading config")
    parser = configparser.ConfigParser()

    if args.command is None:
        log.explain("No CLI command specified, loading config from file")
        Config.load_parser(parser, path=args.config)
    else:
        log.explain(f"CLI command specified, creating config for {args.command!r}")
        if args.command == "local":
            load_local_crawler(args, parser)

    load_general(args, parser)
    prune_crawlers(args, parser)
    return parser


def prune_crawlers(
        args: argparse.Namespace,
        parser: configparser.ConfigParser,
) -> None:
    # Remove all crawler sections that were not selected via --crawler
    if not args.crawler:
        return

    for section in parser.sections():
        if section.startswith("crawl:"):
            # TODO Use removeprefix() when switching to 3.9
            name = section[len("crawl:"):]
            if name not in args.crawler:
                parser.remove_section(section)

    # TODO Check if crawlers actually exist


def main() -> None:
    args = PARSER.parse_args()

    # Configure log levels set by command line arguments
    if args.explain is not None:
        log.output_explain = args.explain
    if args.dump_config:
        log.output_explain = False

    if args.version:
        print(f"{NAME} {VERSION}")
        exit()

    try:
        config = Config(load_parser(args))
    except ConfigLoadException as e:
        log.error(f"Failed to load config file at path {str(e.path)!r}")
        log.error_contd(f"Reason: {e.reason}")
        exit(1)

    # Configure log levels set in the config file
    # TODO Catch config section exceptions
    if args.explain is None:
        log.output_explain = config.default_section.explain()

    if args.dump_config is not None:
        try:
            if args.dump_config is True:
                config.dump()
            elif args.dump_config == "-":
                config.dump_to_stdout()
            else:
                config.dump(Path(args.dump_config))
        except ConfigDumpException:
            exit(1)
        exit()

    pferd = Pferd(config)
    try:
        asyncio.run(pferd.run())
    except KeyboardInterrupt:
        # TODO Clean up tmp files
        pass
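# Example invocations, derived from the argument definitions above. The entry
# point name "pferd" is an assumption about how the package is installed
# (running the package via "python -m" behaves the same way); the config file
# name "pferd.cfg" and the crawler name "foo" are placeholders. Note that the
# general options must appear before the subcommand, since they belong to the
# main parser rather than to the crawler subparsers:
#
#   pferd --version                          # print name and version, then exit
#   pferd --dump-config                      # dump the effective config to the default path and exit
#   pferd --config pferd.cfg --crawler foo   # run only the crawler configured in [crawl:foo]
#   pferd --explain local TARGET OUTPUT      # crawl a local directory with detailed explanations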