From ecdedfa1cfa2bfac01a4b1f96046eaec146eb9fe Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 12:36:09 +0200 Subject: [PATCH] Add no-videos flag to ILIAS crawler --- CONFIG.md | 1 + PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/CONFIG.md b/CONFIG.md index 29fc7e2..e92858f 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -136,6 +136,7 @@ This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor S - `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional HTML page instead. +- `no-videos`: If this is set to true, PFERD will not crawl or download any videos. If this is false or not specified, PFERD will crawl and download videos as usual. ## Authenticator types ### The `simple` authenticator diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 2f27683..f69d769 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -57,6 +57,9 @@ class KitIliasWebCrawlerSection(CrawlerSection): def link_file_use_plaintext(self) -> bool: return self.s.getboolean("link_file_plain_text", fallback=False) + def no_videos(self) -> bool: + return self.s.getboolean("no-videos", fallback=False) + _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.EXERCISE, @@ -66,6 +69,13 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, ]) +_VIDEO_ELEMENTS: Set[IliasElementType] = set([ + IliasElementType.VIDEO, + IliasElementType.VIDEO_PLAYER, + IliasElementType.VIDEO_FOLDER, + IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, +]) + AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) @@ -153,6 +163,7 @@ class KitIliasWebCrawler(HttpCrawler): self._target = section.target() self._link_file_redirect_delay = 
section.link_file_redirect_delay() self._link_file_use_plaintext = section.link_file_use_plaintext() + self._no_videos = section.no_videos() async def _run(self) -> None: if isinstance(self._target, int): @@ -240,6 +251,16 @@ class KitIliasWebCrawler(HttpCrawler): async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: element_path = PurePath(parent_path, element.name) + if element.type in _VIDEO_ELEMENTS: + log.explain_topic(f"Decision: Crawl video element {fmt_path(element_path)}") + if self._no_videos: + log.explain("Video crawling is disabled") + log.explain("Answer: no") + return + else: + log.explain("Video crawling is enabled") + log.explain("Answer: yes") + if element.type == IliasElementType.FILE: await self._download_file(element, element_path) elif element.type == IliasElementType.FORUM: