2021-05-13 23:52:46 +02:00
|
|
|
import argparse
|
|
|
|
import asyncio
|
2021-05-15 21:33:51 +02:00
|
|
|
import configparser
|
2021-05-31 11:41:20 +02:00
|
|
|
import os
|
2021-05-25 17:33:05 +02:00
|
|
|
import sys
|
2021-05-13 23:52:46 +02:00
|
|
|
from pathlib import Path
|
|
|
|
|
2021-06-01 09:10:58 +00:00
|
|
|
from .auth import AuthLoadError
|
2021-05-31 18:19:05 +02:00
|
|
|
from .cli import PARSER, ParserLoadError, load_default_section
|
2021-05-22 16:47:24 +02:00
|
|
|
from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
|
2021-05-19 17:48:51 +02:00
|
|
|
from .logging import log
|
2021-05-23 18:16:25 +02:00
|
|
|
from .pferd import Pferd, PferdLoadError
|
2021-05-22 20:38:56 +00:00
|
|
|
from .transformer import RuleParseError
|
2021-05-13 23:52:46 +02:00
|
|
|
|
2021-05-15 21:33:51 +02:00
|
|
|
|
2021-05-22 16:47:24 +02:00
|
|
|
def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser:
    """Create and populate the ConfigParser used to build the Config.

    If ``args.command`` is None, the parser is filled via
    ``Config.load_parser`` using ``args.config`` as path. Otherwise the
    command callable stored in ``args.command`` fills the parser from the
    parsed arguments. In both cases ``load_default_section`` is applied
    afterwards.

    Args:
        args: The parsed command line arguments.

    Returns:
        The populated parser (created with interpolation disabled).
    """
    log.explain_topic("Loading config")
    parser = configparser.ConfigParser(interpolation=None)

    command = args.command
    if command is not None:
        log.explain("CLI command specified, loading config from its arguments")
        # Only a truthy command is invoked; anything else is silently skipped.
        if command:
            command(args, parser)
    else:
        log.explain("No CLI command specified, loading config from file")
        Config.load_parser(parser, path=args.config)

    # Applied regardless of where the rest of the config came from.
    load_default_section(args, parser)

    return parser
|
|
|
|
|
|
|
|
|
2021-05-22 16:47:24 +02:00
|
|
|
def load_config(args: argparse.Namespace) -> Config:
    """Build the Config from the parsed arguments, exiting on load errors.

    Args:
        args: The parsed command line arguments.

    Returns:
        The Config wrapping the parser from ``load_config_parser``.

    Exits the process with status 1 (after logging the error) if a
    ConfigLoadError or ParserLoadError is raised while loading.
    """
    try:
        parser = load_config_parser(args)
        return Config(parser)
    except (ConfigLoadError, ParserLoadError) as err:
        log.error(str(err))
        # Only config load errors carry a reason that is logged as well.
        if isinstance(err, ConfigLoadError):
            log.error_contd(err.reason)
        sys.exit(1)
|
2021-05-22 16:47:24 +02:00
|
|
|
|
2021-05-13 23:52:46 +02:00
|
|
|
|
2021-05-22 16:47:24 +02:00
|
|
|
def configure_logging_from_args(args: argparse.Namespace) -> None:
    """Apply the logging switches given on the command line to ``log``.

    For each of ``explain``, ``status`` and ``report``, the matching
    ``log.output_*`` attribute is set when the argument is not None.

    Args:
        args: The parsed command line arguments.
    """
    channels = ("explain", "status", "report")

    for channel in channels:
        requested = getattr(args, channel)
        if requested is not None:
            setattr(log, f"output_{channel}", requested)

    # We want to prevent any unnecessary output if we're printing the config to
    # stdout, otherwise it would not be a valid config file.
    if args.dump_config_to == "-":
        for channel in channels:
            setattr(log, f"output_{channel}", False)
|
2021-05-19 17:48:51 +02:00
|
|
|
|
2021-05-22 16:47:24 +02:00
|
|
|
|
|
|
|
def configure_logging_from_config(args: argparse.Namespace, config: Config) -> None:
    """Fill in logging switches from the config where no CLI arg was given.

    For each of ``explain``, ``status`` and ``report`` whose argument is
    None, the value is taken from the same-named method of
    ``config.default_section``.

    Args:
        args: The parsed command line arguments.
        config: The loaded configuration.

    Exits the process with status 1 (after logging the error) if reading
    an option raises ConfigOptionError.
    """
    # In configure_logging_from_args(), all normal logging is already disabled
    # whenever we dump the config. We don't want to override that decision with
    # values from the config file.
    dumping_to_stdout = args.dump_config_to == "-"
    if dumping_to_stdout:
        return

    try:
        for channel in ("explain", "status", "report"):
            # CLI arguments take precedence, so only unset ones are filled in.
            if getattr(args, channel) is None:
                read_option = getattr(config.default_section, channel)
                setattr(log, f"output_{channel}", read_option())
    except ConfigOptionError as err:
        log.error(str(err))
        sys.exit(1)
|
2021-05-22 16:47:24 +02:00
|
|
|
|
|
|
|
|
|
|
|
def dump_config(args: argparse.Namespace, config: Config) -> None:
    """Write out the config as requested by the dump-related arguments.

    ``args.dump_config`` triggers ``config.dump()`` without a path, a
    ``args.dump_config_to`` of ``"-"`` triggers ``config.dump_to_stdout()``
    and any other value is passed to ``config.dump`` as a ``Path``.

    Args:
        args: The parsed command line arguments.
        config: The configuration to dump.

    Exits the process with status 1 (after logging the error) if both dump
    options are given at once or if dumping raises ConfigDumpError.
    """
    log.explain_topic("Dumping config")

    use_default_target = args.dump_config
    if use_default_target and args.dump_config_to is not None:
        log.error("--dump-config and --dump-config-to can't be specified at the same time")
        sys.exit(1)

    try:
        if not args.dump_config:
            target = args.dump_config_to
            if target == "-":
                config.dump_to_stdout()
            else:
                destination = Path(target)
                config.dump(destination)
        else:
            config.dump()
    except ConfigDumpError as err:
        log.error(str(err))
        log.error_contd(err.reason)
        sys.exit(1)
|
2021-05-22 16:47:24 +02:00
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
    """Program entry point: parse arguments, load the config and run PFERD.

    The steps are, in order: parse the CLI arguments, configure logging
    from them, load the config, configure the remaining logging options
    from the config, optionally dump the config and exit, then construct
    ``Pferd`` and run it inside an asyncio event loop.

    Known error types are logged and cause the process to exit with
    status 1. After a run (successful, interrupted or failed with an
    unexpected exception), ``pferd.print_report()`` is called.
    """
    args = PARSER.parse_args()

    # Configuring logging happens in two stages because CLI args have
    # precedence over config file options and loading the config already
    # produces some kinds of log messages (usually only explain()-s).
    configure_logging_from_args(args)

    config = load_config(args)

    # Now, after loading the config file, we can apply its logging settings in
    # all places that were not already covered by CLI args.
    configure_logging_from_config(args, config)

    if args.dump_config or args.dump_config_to is not None:
        dump_config(args, config)
        sys.exit()

    try:
        pferd = Pferd(config, args.crawler, args.skip)
    except PferdLoadError as e:
        log.unlock()
        log.error(str(e))
        sys.exit(1)

    try:
        if os.name == "nt":
            # A "workaround" for the windows event loop somehow crashing after
            # asyncio.run() completes. See:
            # https://bugs.python.org/issue39232
            # https://github.com/encode/httpx/issues/914#issuecomment-780023632
            # TODO Fix this properly
            #
            # The loop is created explicitly instead of relying on
            # asyncio.get_event_loop() to create one implicitly: that implicit
            # creation is deprecated and raises a RuntimeError on newer Python
            # versions when no current event loop exists.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(pferd.run(args.debug_transforms))
            loop.run_until_complete(asyncio.sleep(1))
            loop.close()
        else:
            asyncio.run(pferd.run(args.debug_transforms))
    except (ConfigOptionError, AuthLoadError) as e:
        log.unlock()
        log.error(str(e))
        sys.exit(1)
    except RuleParseError as e:
        log.unlock()
        e.pretty_print()
        sys.exit(1)
    except KeyboardInterrupt:
        log.unlock()
        log.explain_topic("Interrupted, exiting immediately")
        log.explain("Open files and connections are left for the OS to clean up")
        pferd.print_report()
        # TODO Clean up tmp files
        # And when those files *do* actually get cleaned up properly,
        # reconsider if this should really exit with 1
        sys.exit(1)
    except Exception:
        # Top-level boundary: report anything unexpected instead of letting a
        # raw traceback escape, but still print what was done so far.
        log.unlock()
        log.unexpected_exception()
        pferd.print_report()
        sys.exit(1)
    else:
        pferd.print_report()
|