Compare commits

...

4 Commits

Author SHA1 Message Date
Julius Rüberg fd2c4f1e9c
Merge 77c1f1516c into e9f8901520 2023-12-04 21:11:37 -08:00
I-Al-Istannen e9f8901520 Fix typos in ilias crawler and use set literals 2023-11-30 20:57:57 +01:00
Julius Rüberg 77c1f1516c Used proper plural 2021-11-02 12:41:40 +01:00
Julius Rüberg 9e12e96d90 Added alias functionality 2021-11-02 03:42:08 +01:00
3 changed files with 31 additions and 11 deletions

View File

@ -92,6 +92,9 @@ common to all crawlers:
load for the crawl target. (Default: `0.0`)
- `windows_paths`: Whether PFERD should find alternative names for paths that
are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
- `aliases`: List of strings that are considered as an alias when invoking with
the `--crawler` or `-C` flag. If there is more than one crawl section with
the same aliases all are selected. Thereby, you can group different crawlers.
Some crawlers may also require credentials for authentication. To configure how
the crawler obtains its credentials, the `auth` option is used. It is set to the
@ -106,6 +109,7 @@ username = foo
password = bar
[crawl:something]
aliases = [sth, some]
type = some-complex-crawler
auth = auth:example
on_conflict = no-delete

View File

@ -81,7 +81,7 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
return self.s.getboolean("forums", fallback=False)
_DIRECTORY_PAGES: Set[IliasElementType] = set([
_DIRECTORY_PAGES: Set[IliasElementType] = {
IliasElementType.EXERCISE,
IliasElementType.EXERCISE_FILES,
IliasElementType.FOLDER,
@ -90,16 +90,16 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([
IliasElementType.MEDIACAST_VIDEO_FOLDER,
IliasElementType.OPENCAST_VIDEO_FOLDER,
IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED,
])
}
_VIDEO_ELEMENTS: Set[IliasElementType] = set([
_VIDEO_ELEMENTS: Set[IliasElementType] = {
IliasElementType.MEDIACAST_VIDEO_FOLDER,
IliasElementType.MEDIACAST_VIDEO,
IliasElementType.OPENCAST_VIDEO,
IliasElementType.OPENCAST_VIDEO_PLAYER,
IliasElementType.OPENCAST_VIDEO_FOLDER,
IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED,
])
}
def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]:
@ -561,8 +561,8 @@ instance's greatest bottleneck.
# If we do not want to crawl it (user filter) or we have every file
# from the cached mapping already, we can ignore this and bail
if not maybe_dl or self._all_opencast_videos_locally_present(element_path):
# Mark all existing cideos as known so they do not get deleted
# during dleanup. We "downloaded" them, just without actually making
# Mark all existing videos as known so they do not get deleted
# during cleanup. We "downloaded" them, just without actually making
# a network request as we assumed they did not change.
for video in self._previous_contained_opencast_videos(element_path):
await self.download(video)

View File

@ -1,5 +1,5 @@
from pathlib import Path
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Set
from rich.markup import escape
@ -43,16 +43,24 @@ class Pferd:
crawl_sections = [name for name, _ in config.crawl_sections()]
crawlers_to_run = [] # With crawl: prefix
crawlers_to_run = set() # With crawl: prefix
unknown_names = [] # Without crawl: prefix
for name in cli_crawlers:
section_name = f"crawl:{name}"
if section_name in crawl_sections:
log.explain(f"Crawler section named {section_name!r} exists")
crawlers_to_run.append(section_name)
else:
log.explain(f"There's no crawler section named {section_name!r}")
crawlers_to_run.add(section_name)
# interprete name as alias of a crawler
alias_names = self._find_crawlers_by_alias(name, config)
if alias_names:
crawlers_to_run.update(alias_names)
log.explain_topic(f"Crawler alias {name!r} found corresponding crawler sections:")
for alias_name in alias_names:
log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
if not section_name in crawl_sections and not alias_names:
log.explain(f"There's neither a crawler section named {section_name!r} nor does a crawler with alias {name!r} exist.")
unknown_names.append(name)
if unknown_names:
@ -65,6 +73,14 @@ class Pferd:
return crawlers_to_run
def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
alias_names = set()
for (section_name, section) in config.crawl_sections():
section_aliases = section.get("aliases", [])
if alias in section_aliases:
alias_names.add(section_name)
return alias_names
def _find_crawlers_to_run(
self,
config: Config,