diff --git a/PFERD/__main__.py b/PFERD/__main__.py
index a16b19b..5815f40 100644
--- a/PFERD/__main__.py
+++ b/PFERD/__main__.py
@@ -1,40 +1,229 @@
 import argparse
 import asyncio
+import configparser
 from pathlib import Path
 
 from .config import Config, ConfigDumpException, ConfigLoadException
+from .output_dir import OnConflict, Redownload
 from .pferd import Pferd
 
+GENERAL_PARSER = argparse.ArgumentParser(add_help=False)
+GENERAL_PARSER.add_argument(
+    "--config", "-c",
+    type=Path,
+    metavar="PATH",
+    help="custom config file"
+)
+GENERAL_PARSER.add_argument(
+    "--dump-config",
+    nargs="?",
+    const=True,
+    metavar="PATH",
+    help="dump current configuration to a file and exit."
+    " Uses default config file path if no path is specified"
+)
+GENERAL_PARSER.add_argument(
+    "--crawler",
+    action="append",
+    type=str,
+    metavar="NAME",
+    help="only execute a single crawler."
+    " Can be specified multiple times to execute multiple crawlers"
+)
+GENERAL_PARSER.add_argument(
+    "--working-dir",
+    type=Path,
+    metavar="PATH",
+    help="custom working directory"
+)
+
+
+def load_general(
+    args: argparse.Namespace,
+    parser: configparser.ConfigParser,
+) -> None:
+    section = parser[parser.default_section]
+
+    if args.working_dir is not None:
+        section["working_dir"] = str(args.working_dir)
+
+
+CRAWLER_PARSER = argparse.ArgumentParser(add_help=False)
+CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group(
+    title="general crawler arguments",
+    description="arguments common to all crawlers",
+)
+CRAWLER_PARSER_GROUP.add_argument(
+    "--redownload",
+    type=Redownload.from_string,
+    metavar="OPTION",
+    help="when to redownload a file that's already present locally"
+)
+CRAWLER_PARSER_GROUP.add_argument(
+    "--on-conflict",
+    type=OnConflict.from_string,
+    metavar="OPTION",
+    help="what to do when local and remote files or directories differ"
+)
+CRAWLER_PARSER_GROUP.add_argument(
+    "--transform", "-t",
+    action="append",
+    type=str,
+    metavar="RULE",
+    help="add a single transformation rule."
+    " Can be specified multiple times"
+)
+CRAWLER_PARSER_GROUP.add_argument(
+    "--max-concurrent-tasks",
+    type=int,
+    metavar="N",
+    help="maximum number of concurrent tasks (crawling, downloading)"
+)
+CRAWLER_PARSER_GROUP.add_argument(
+    "--max-concurrent-downloads",
+    type=int,
+    metavar="N",
+    help="maximum number of tasks that may download data at the same time"
+)
+CRAWLER_PARSER_GROUP.add_argument(
+    "--delay-between-tasks",
+    type=float,
+    metavar="SECONDS",
+    help="time the crawler should wait between subsequent tasks"
+)
+
+
+def load_crawler(
+    args: argparse.Namespace,
+    section: configparser.SectionProxy,
+) -> None:
+    if args.redownload is not None:
+        section["redownload"] = args.redownload.value
+    if args.on_conflict is not None:
+        section["on_conflict"] = args.on_conflict.value
+    if args.transform is not None:
+        section["transform"] = "\n" + "\n".join(args.transform)
+    if args.max_concurrent_tasks is not None:
+        section["max_concurrent_tasks"] = str(args.max_concurrent_tasks)
+    if args.max_concurrent_downloads is not None:
+        section["max_concurrent_downloads"] = str(args.max_concurrent_downloads)
+    if args.delay_between_tasks is not None:
+        section["delay_between_tasks"] = str(args.delay_between_tasks)
+
+
+PARSER = argparse.ArgumentParser(parents=[GENERAL_PARSER])
+PARSER.set_defaults(command=None)
+SUBPARSERS = PARSER.add_subparsers(title="crawlers")
+
+
+LOCAL_CRAWLER = SUBPARSERS.add_parser(
+    "local",
+    parents=[GENERAL_PARSER, CRAWLER_PARSER],
+)
+LOCAL_CRAWLER.set_defaults(command="local")
+LOCAL_CRAWLER_GROUP = LOCAL_CRAWLER.add_argument_group(
+    title="local crawler arguments",
+    description="arguments for the 'local' crawler",
+)
+LOCAL_CRAWLER_GROUP.add_argument(
+    "target",
+    type=Path,
+    metavar="TARGET",
+    help="directory to crawl"
+)
+LOCAL_CRAWLER_GROUP.add_argument(
+    "output",
+    type=Path,
+    metavar="OUTPUT",
+    help="output directory"
+)
+LOCAL_CRAWLER_GROUP.add_argument(
+    "--crawl-delay",
+    type=float,
+    metavar="SECONDS",
+    help="artificial delay to simulate for crawl requests"
+)
+LOCAL_CRAWLER_GROUP.add_argument(
+    "--download-delay",
+    type=float,
+    metavar="SECONDS",
+    help="artificial delay to simulate for download requests"
+)
+LOCAL_CRAWLER_GROUP.add_argument(
+    "--download-speed",
+    type=int,
+    metavar="BYTES_PER_SECOND",
+    help="download speed to simulate"
+)
+
+
+def load_local_crawler(
+    args: argparse.Namespace,
+    parser: configparser.ConfigParser,
+) -> None:
+    parser["crawl:local"] = {}
+    section = parser["crawl:local"]
+    load_crawler(args, section)
+
+    section["type"] = "local"
+    section["target"] = str(args.target)
+    section["output_dir"] = str(args.output)
+    if args.crawl_delay is not None:
+        section["crawl_delay"] = str(args.crawl_delay)
+    if args.download_delay is not None:
+        section["download_delay"] = str(args.download_delay)
+    if args.download_speed is not None:
+        section["download_speed"] = str(args.download_speed)
+
+
+def load_parser(
+    args: argparse.Namespace,
+) -> configparser.ConfigParser:
+    parser = configparser.ConfigParser()
+
+    if args.command is None:
+        Config.load_parser(parser, path=args.config)
+    elif args.command == "local":
+        load_local_crawler(args, parser)
+
+    load_general(args, parser)
+    prune_crawlers(args, parser)
+
+    return parser
+
+
+def prune_crawlers(
+    args: argparse.Namespace,
+    parser: configparser.ConfigParser,
+) -> None:
+    if not args.crawler:
+        return
+
+    for section in parser.sections():
+        if section.startswith("crawl:"):
+            # TODO Use removeprefix() when switching to 3.9
+            name = section[len("crawl:"):]
+            if name not in args.crawler:
+                parser.remove_section(section)
+
+    # TODO Check if crawlers actually exist
+
 
 def main() -> None:
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--config", "-c",
-        type=Path,
-        metavar="PATH",
-        help="specify custom config file path",
-    )
-    parser.add_argument(
-        "--dump-config",
-        nargs="?",
-        const=True,
-        type=Path,
-        metavar="PATH",
-        help="dump current configuration to a file and exit."
-        " Uses default config file path if no path is specified",
-    )
-    args = parser.parse_args()
+    args = PARSER.parse_args()
 
     try:
-        config_parser = Config.load_parser(args.config)
-        config = Config(config_parser)
+        config = Config(load_parser(args))
     except ConfigLoadException:
         exit(1)
 
-    if args.dump_config:
-        path = None if args.dump_config is True else args.dump_config
+    if args.dump_config is not None:
         try:
-            config.dump(path)
+            if args.dump_config is True:
+                config.dump()
+            elif args.dump_config == "-":
+                config.dump_to_stdout()
+            else:
+                config.dump(Path(args.dump_config))
         except ConfigDumpException:
             exit(1)
         exit()
diff --git a/PFERD/config.py b/PFERD/config.py
index 7a7e832..7fe5d9e 100644
--- a/PFERD/config.py
+++ b/PFERD/config.py
@@ -1,4 +1,6 @@
+import asyncio
 import os
+import sys
 from configparser import ConfigParser, SectionProxy
 from dataclasses import dataclass
 from pathlib import Path
@@ -68,7 +70,7 @@ class Config:
         raise ConfigLoadException()
 
     @staticmethod
-    def load_parser(path: Optional[Path] = None) -> ConfigParser:
+    def load_parser(parser: ConfigParser, path: Optional[Path] = None) -> None:
         """
         May throw a ConfigLoadException.
         """
@@ -76,8 +78,6 @@ class Config:
         if not path:
             path = Config._default_path()
 
-        parser = ConfigParser()
-
         # Using config.read_file instead of config.read because config.read
         # would just ignore a missing file and carry on.
         try:
@@ -90,8 +90,6 @@ class Config:
         except PermissionError:
             Config._fail_load(path, "Insufficient permissions")
 
-        return parser
-
     @staticmethod
     def _fail_dump(path: Path, reason: str) -> None:
         print(f"Failed to dump config file to {path}")
@@ -123,7 +121,7 @@ class Config:
                 self._parser.write(f)
         except FileExistsError:
             print("That file already exists.")
-            if prompt_yes_no("Overwrite it?", default=False):
+            if asyncio.run(prompt_yes_no("Overwrite it?", default=False)):
                 with open(path, "w") as f:
                     self._parser.write(f)
             else:
@@ -133,6 +131,9 @@ class Config:
         except PermissionError:
             self._fail_dump(path, "Insufficient permissions")
 
+    def dump_to_stdout(self) -> None:
+        self._parser.write(sys.stdout)
+
     @property
     def default_section(self) -> SectionProxy:
         return self._parser[self._parser.default_section]
diff --git a/PFERD/crawler.py b/PFERD/crawler.py
index 4148614..140ae20 100644
--- a/PFERD/crawler.py
+++ b/PFERD/crawler.py
@@ -117,37 +117,25 @@ class CrawlerSection(Section):
 
     def redownload(self) -> Redownload:
         value = self.s.get("redownload", "never-smart")
-        if value == "never":
-            return Redownload.NEVER
-        elif value == "never-smart":
-            return Redownload.NEVER_SMART
-        elif value == "always":
-            return Redownload.ALWAYS
-        elif value == "always-smart":
-            return Redownload.ALWAYS_SMART
-
-        self.invalid_value(
-            "redownload",
-            value,
-            "Must be 'never', 'never-smart', 'always' or 'always-smart'"
-        )
+        try:
+            return Redownload.from_string(value)
+        except ValueError as e:
+            self.invalid_value(
+                "redownload",
+                value,
+                str(e).capitalize(),
+            )
 
     def on_conflict(self) -> OnConflict:
         value = self.s.get("on_conflict", "prompt")
-        if value == "prompt":
-            return OnConflict.PROMPT
-        elif value == "local-first":
-            return OnConflict.LOCAL_FIRST
-        elif value == "remote-first":
-            return OnConflict.REMOTE_FIRST
-        elif value == "no-delete":
-            return OnConflict.NO_DELETE
-
-        self.invalid_value(
-            "on_conflict",
-            value,
-            "Must be 'prompt', 'local-first', 'remote-first' or 'no-delete'",
-        )
+        try:
+            return OnConflict.from_string(value)
+        except ValueError as e:
+            self.invalid_value(
+                "on_conflict",
+                value,
+                str(e).capitalize(),
+            )
 
     def transform(self) -> str:
         return self.s.get("transform", "")
diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py
index 89c5839..4f5f708 100644
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -32,6 +32,14 @@ class Redownload(Enum):
     ALWAYS = "always"
     ALWAYS_SMART = "always-smart"
 
+    @staticmethod
+    def from_string(string: str) -> "Redownload":
+        try:
+            return Redownload(string)
+        except ValueError:
+            raise ValueError("must be one of 'never', 'never-smart',"
+                             " 'always', 'always-smart'")
+
 
 class OnConflict(Enum):
     PROMPT = "prompt"
@@ -39,6 +47,14 @@ class OnConflict(Enum):
     REMOTE_FIRST = "remote-first"
     NO_DELETE = "no-delete"
 
+    @staticmethod
+    def from_string(string: str) -> "OnConflict":
+        try:
+            return OnConflict(string)
+        except ValueError:
+            raise ValueError("must be one of 'prompt', 'local-first',"
+                             " 'remote-first', 'no-delete'")
+
 
 @dataclass
 class Heuristics:
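
Usage sketch (not part of the patch; the paths and option values below are made up for illustration). Since the package ships a PFERD/__main__.py, the new CLI can be exercised via `python -m PFERD`. The `local` subcommand builds a `[crawl:local]` config section entirely from command-line arguments, so no config file is needed:

    # Crawl ./lecture-notes into ./downloads, using the smart redownload heuristic
    python -m PFERD local ./lecture-notes ./downloads --redownload always-smart

    # Same invocation, but dump the generated config to stdout instead of crawling;
    # the literal "-" routes through the new Config.dump_to_stdout() added in config.py
    python -m PFERD local ./lecture-notes ./downloads --dump-config -

The second call is a convenient way to bootstrap a config file from CLI flags: `--dump-config PATH` writes the generated configuration to PATH, while the bare flag falls back to the default config file path.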