Mirror of https://github.com/Garmelon/PFERD.git, synced 2023-12-21 10:23:01 +01:00
Overhaul config and CLI option names
@@ -11,14 +11,14 @@ SUBPARSER = SUBPARSERS.add_parser(
 )
 GROUP = SUBPARSER.add_argument_group(
-    title="KIT ILIAS web-crawler arguments",
+    title="kit-ilias-web crawler arguments",
     description="arguments for the 'kit-ilias-web' crawler",
 )
 GROUP.add_argument(
     "target",
     type=str,
     metavar="TARGET",
-    help="course id, 'desktop', or ILIAS https-URL to crawl"
+    help="course id, 'desktop', or ILIAS URL to crawl"
 )
 GROUP.add_argument(
     "output",
@@ -27,14 +27,9 @@ GROUP.add_argument(
     help="output directory"
 )
-GROUP.add_argument(
-    "--videos",
-    action=BooleanOptionalAction,
-    help="crawl and download videos"
-)
 GROUP.add_argument(
-    "--username",
+    "--username", "-u",
     type=str,
-    metavar="USER_NAME",
+    metavar="USERNAME",
     help="user name for authentication"
 )
 GROUP.add_argument(
@@ -46,19 +41,24 @@ GROUP.add_argument(
     "--links",
     type=show_value_error(Links.from_string),
     metavar="OPTION",
-    help="how to treat external links"
+    help="how to represent external links"
 )
 GROUP.add_argument(
-    "--link-file-redirect-delay",
+    "--link-redirect-delay",
    type=int,
     metavar="SECONDS",
-    help="delay before external link files redirect you to their target (-1 to disable)"
+    help="time before 'fancy' links redirect to their target (-1 to disable)"
 )
 GROUP.add_argument(
-    "--http-timeout",
+    "--videos",
+    action=BooleanOptionalAction,
+    help="crawl and download videos"
+)
+GROUP.add_argument(
+    "--http-timeout", "-t",
     type=float,
     metavar="SECONDS",
-    help="the timeout to use for HTTP requests"
+    help="timeout for all HTTP requests"
 )
 
 
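The hunks above rename and regroup the kit-ilias-web CLI flags. Below is a minimal, self-contained argparse sketch (not PFERD's actual parser; the program name and sample values are invented) showing how the renamed options behave, including the automatic --no-videos counterpart that BooleanOptionalAction generates and the underscore attribute names that load() reads later.

# Minimal sketch, not PFERD's parser. Requires Python 3.9+ for BooleanOptionalAction.
import argparse
from argparse import BooleanOptionalAction

parser = argparse.ArgumentParser(prog="kit-ilias-web-demo")
parser.add_argument("target", type=str, metavar="TARGET")
parser.add_argument("output", type=str, metavar="OUTPUT")
parser.add_argument("--username", "-u", type=str, metavar="USERNAME")
parser.add_argument("--link-redirect-delay", type=int, metavar="SECONDS")
parser.add_argument("--videos", action=BooleanOptionalAction)  # also creates --no-videos
parser.add_argument("--http-timeout", "-t", type=float, metavar="SECONDS")

# Hyphens become underscores on the namespace, which is why the loader reads
# args.link_redirect_delay and args.http_timeout.
args = parser.parse_args(["1234567", "out", "--no-videos", "--link-redirect-delay", "5", "-t", "30"])
print(args.link_redirect_delay, args.videos, args.http_timeout)  # 5 False 30.0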
@@ -66,33 +66,30 @@ def load(
     args: argparse.Namespace,
     parser: configparser.ConfigParser,
 ) -> None:
-    parser["crawl:kit-ilias-web"] = {}
-    section = parser["crawl:kit-ilias-web"]
+    parser["crawl:ilias"] = {}
+    section = parser["crawl:ilias"]
     load_crawler(args, section)
 
     section["type"] = "kit-ilias-web"
     section["target"] = str(args.target)
     section["output_dir"] = str(args.output)
-    section["auth"] = "auth:kit-ilias-web"
-    if args.link_file_redirect_delay is not None:
-        section["link_file_redirect_delay"] = str(args.link_file_redirect_delay)
+    section["auth"] = "auth:ilias"
     if args.links is not None:
         section["links"] = str(args.links.value)
+    if args.link_redirect_delay is not None:
+        section["link_redirect_delay"] = str(args.link_redirect_delay)
     if args.videos is not None:
-        section["videos"] = str(False)
+        section["videos"] = "yes" if args.videos else "no"
     if args.http_timeout is not None:
         section["http_timeout"] = str(args.http_timeout)
 
-    parser["auth:kit-ilias-web"] = {}
-    auth_section = parser["auth:kit-ilias-web"]
-    auth_section["type"] = "simple"
-    if args.username is not None:
-        auth_section["username"] = args.username
+    parser["auth:ilias"] = {}
+    auth_section = parser["auth:ilias"]
     if args.keyring:
         auth_section["type"] = "keyring"
+    else:
+        auth_section["type"] = "simple"
+
+    if args.username is not None:
+        auth_section["username"] = str(args.username)
 
 
 SUBPARSER.set_defaults(command=load)
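load() now writes its settings into 'crawl:ilias' and 'auth:ilias' sections. As an illustration only (the section and key names follow the diff above, the values are invented), an equivalent config could be produced and inspected with plain configparser like this:

# Illustration only: a config equivalent to what load() writes after the rename.
import configparser
import io

cfg = configparser.ConfigParser()
cfg["crawl:ilias"] = {
    "type": "kit-ilias-web",
    "target": "1234567",
    "output_dir": "Foo",
    "auth": "auth:ilias",
    "link_redirect_delay": "5",
    "videos": "no",
    "http_timeout": "30.0",
}
cfg["auth:ilias"] = {
    "type": "keyring",
    "username": "uxxxx",
}

out = io.StringIO()
cfg.write(out)
print(out.getvalue())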
@@ -77,10 +77,10 @@ CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group(
     description="arguments common to all crawlers",
 )
 CRAWLER_PARSER_GROUP.add_argument(
-    "--redownload",
+    "--redownload", "-r",
     type=show_value_error(Redownload.from_string),
     metavar="OPTION",
-    help="when to redownload a file that's already present locally"
+    help="when to download a file that's already present locally"
 )
 CRAWLER_PARSER_GROUP.add_argument(
     "--on-conflict",
@@ -89,30 +89,35 @@ CRAWLER_PARSER_GROUP.add_argument(
     help="what to do when local and remote files or directories differ"
 )
 CRAWLER_PARSER_GROUP.add_argument(
-    "--transform", "-t",
+    "--transform", "-T",
     action="append",
     type=str,
     metavar="RULE",
     help="add a single transformation rule. Can be specified multiple times"
 )
 CRAWLER_PARSER_GROUP.add_argument(
-    "--max-concurrent-tasks",
+    "--tasks", "-n",
     type=int,
     metavar="N",
     help="maximum number of concurrent tasks (crawling, downloading)"
 )
 CRAWLER_PARSER_GROUP.add_argument(
-    "--max-concurrent-downloads",
+    "--downloads", "-N",
     type=int,
     metavar="N",
     help="maximum number of tasks that may download data at the same time"
 )
 CRAWLER_PARSER_GROUP.add_argument(
-    "--delay-between-tasks",
+    "--task-delay", "-d",
     type=float,
     metavar="SECONDS",
     help="time the crawler should wait between subsequent tasks"
 )
+CRAWLER_PARSER_GROUP.add_argument(
+    "--windows-paths",
+    action=BooleanOptionalAction,
+    help="whether to repair invalid paths on windows"
+)
 
 
 def load_crawler(
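Summarized from the hunk above, the common crawler options were renamed and given short flags; the mapping below is only a reading aid assembled from the diff, not code that exists in PFERD.

# Reading aid only: old option name -> (new name, short flag), taken from the hunk above.
RENAMED_CRAWLER_OPTIONS = {
    "--redownload":               ("--redownload", "-r"),
    "--transform":                ("--transform", "-T"),  # short flag was -t before
    "--max-concurrent-tasks":     ("--tasks", "-n"),
    "--max-concurrent-downloads": ("--downloads", "-N"),
    "--delay-between-tasks":      ("--task-delay", "-d"),
}
# --windows-paths / --no-windows-paths is newly added and has no old counterpart.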
@@ -125,12 +130,14 @@ def load_crawler(
         section["on_conflict"] = args.on_conflict.value
     if args.transform is not None:
         section["transform"] = "\n" + "\n".join(args.transform)
-    if args.max_concurrent_tasks is not None:
-        section["max_concurrent_tasks"] = str(args.max_concurrent_tasks)
-    if args.max_concurrent_downloads is not None:
-        section["max_concurrent_downloads"] = str(args.max_concurrent_downloads)
-    if args.delay_between_tasks is not None:
-        section["delay_between_tasks"] = str(args.delay_between_tasks)
+    if args.tasks is not None:
+        section["tasks"] = str(args.tasks)
+    if args.downloads is not None:
+        section["downloads"] = str(args.downloads)
+    if args.task_delay is not None:
+        section["task_delay"] = str(args.task_delay)
+    if args.windows_paths is not None:
+        section["windows_paths"] = "yes" if args.windows_paths else "no"
 
 
 PARSER = argparse.ArgumentParser()
@@ -200,6 +207,10 @@ def load_default_section(
         section["working_dir"] = str(args.working_dir)
     if args.explain is not None:
         section["explain"] = "yes" if args.explain else "no"
+    if args.status is not None:
+        section["status"] = "yes" if args.status else "no"
+    if args.report is not None:
+        section["report"] = "yes" if args.report else "no"
     if args.share_cookies is not None:
         section["share_cookies"] = "yes" if args.share_cookies else "no"
 
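The "yes"/"no" strings written here (and for windows_paths above) line up with what configparser's getboolean() accepts when the values are read back. A standalone sketch, unrelated to PFERD's own Section wrappers:

# configparser's getboolean() understands yes/no (as well as true/false, on/off, 1/0).
import configparser

cp = configparser.ConfigParser()
cp.read_string("[demo]\nexplain = yes\nshare_cookies = no\n")
print(cp["demo"].getboolean("explain"))        # True
print(cp["demo"].getboolean("share_cookies"))  # False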
@@ -169,33 +169,33 @@ class CrawlerSection(Section):
     def transform(self) -> str:
         return self.s.get("transform", "")
 
-    def max_concurrent_tasks(self) -> int:
-        value = self.s.getint("max_concurrent_tasks", fallback=1)
+    def tasks(self) -> int:
+        value = self.s.getint("tasks", fallback=1)
         if value <= 0:
-            self.invalid_value("max_concurrent_tasks", value,
-                               "Must be greater than 0")
+            self.invalid_value("tasks", value, "Must be greater than 0")
         return value
 
-    def max_concurrent_downloads(self) -> int:
-        tasks = self.max_concurrent_tasks()
-        value = self.s.getint("max_concurrent_downloads", fallback=None)
+    def downloads(self) -> int:
+        tasks = self.tasks()
+        value = self.s.getint("downloads", fallback=None)
         if value is None:
             return tasks
         if value <= 0:
-            self.invalid_value("max_concurrent_downloads", value,
-                               "Must be greater than 0")
+            self.invalid_value("downloads", value, "Must be greater than 0")
         if value > tasks:
-            self.invalid_value("max_concurrent_downloads", value,
-                               "Must not be greater than max_concurrent_tasks")
+            self.invalid_value("downloads", value, "Must not be greater than tasks")
         return value
 
-    def delay_between_tasks(self) -> float:
-        value = self.s.getfloat("delay_between_tasks", fallback=0.0)
+    def task_delay(self) -> float:
+        value = self.s.getfloat("task_delay", fallback=0.0)
         if value < 0:
-            self.invalid_value("delay_between_tasks", value,
-                               "Must not be negative")
+            self.invalid_value("task_delay", value, "Must not be negative")
         return value
 
+    def windows_paths(self) -> bool:
+        on_windows = os.name == "nt"
+        return self.s.getboolean("windows_paths", fallback=on_windows)
+
     def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
         value = self.s.get("auth")
         if value is None:
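The renamed getters keep their fallbacks: tasks defaults to 1, downloads defaults to the tasks value, task_delay defaults to 0.0, and downloads must satisfy 0 < downloads <= tasks. A standalone sketch of that read path (hypothetical section name and values; not PFERD's Section class):

# Fallbacks match the hunk above.
import configparser

cp = configparser.ConfigParser()
cp.read_string("[crawl:demo]\ntasks = 4\ntask_delay = 0.5\n")
s = cp["crawl:demo"]

tasks = s.getint("tasks", fallback=1)              # 4 (defaults to 1 if missing)
downloads = s.getint("downloads", fallback=None)   # not set -> None
if downloads is None:
    downloads = tasks                              # downloads defaults to tasks
task_delay = s.getfloat("task_delay", fallback=0.0)

# The section additionally enforces 0 < downloads <= tasks and task_delay >= 0.
print(tasks, downloads, task_delay)  # 4 4 0.5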
@@ -205,10 +205,6 @@ class CrawlerSection(Section):
             self.invalid_value("auth", value, "No such auth section exists")
         return auth
 
-    def windows_paths(self) -> bool:
-        on_windows = os.name == "nt"
-        return self.s.getboolean("windows_paths", fallback=on_windows)
-
 
 class Crawler(ABC):
     def __init__(
@@ -230,9 +226,9 @@ class Crawler(ABC):
         self.error_free = True
 
         self._limiter = Limiter(
-            task_limit=section.max_concurrent_tasks(),
-            download_limit=section.max_concurrent_downloads(),
-            task_delay=section.delay_between_tasks(),
+            task_limit=section.tasks(),
+            download_limit=section.downloads(),
+            task_delay=section.task_delay(),
         )
 
         self._deduplicator = Deduplicator(section.windows_paths())
@@ -40,18 +40,14 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
         self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")
 
     def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
-        value = self.s.get("tfa_auth")
-        if not value:
+        value: Optional[str] = self.s.get("tfa_auth")
+        if value is None:
             return None
-
-        auth = authenticators.get(f"auth:{value}")
+        auth = authenticators.get(value)
         if auth is None:
-            self.invalid_value("auth", value, "No such auth section exists")
+            self.invalid_value("tfa_auth", value, "No such auth section exists")
         return auth
 
-    def link_file_redirect_delay(self) -> int:
-        return self.s.getint("link_file_redirect_delay", fallback=-1)
-
     def links(self) -> Links:
         type_str: Optional[str] = self.s.get("links")
 
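Because the f"auth:{value}" prefixing was dropped in the hunk above, a tfa_auth config value now has to name the authenticator section in full. A tiny sketch with hypothetical section names:

# Authenticators are keyed by their full section name, so the config value must
# include the "auth:" prefix; previously it was prepended automatically.
authenticators = {"auth:ilias": object(), "auth:my-tfa": object()}

value = "auth:my-tfa"             # what a tfa_auth config entry would hold now
auth = authenticators.get(value)  # looked up verbatim after this change
print(auth is not None)           # True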
@@ -63,6 +59,9 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
         except ValueError as e:
             self.invalid_value("links", type_str, str(e).capitalize())
 
+    def link_redirect_delay(self) -> int:
+        return self.s.getint("link_redirect_delay", fallback=-1)
+
     def videos(self) -> bool:
         return self.s.getboolean("videos", fallback=False)
 
@@ -173,7 +172,7 @@ class KitIliasWebCrawler(HttpCrawler):
         self._base_url = "https://ilias.studium.kit.edu"
 
         self._target = section.target()
-        self._link_file_redirect_delay = section.link_file_redirect_delay()
+        self._link_file_redirect_delay = section.link_redirect_delay()
         self._links = section.links()
         self._videos = section.videos()
 