Add Links option to ilias crawler

This allows you to configure what type the link files should have and whether to create them at all.
2025-07-19 01:12:38 +02:00 · 2021-05-25 11:33:45 +02:00
parent c33de233dc
commit 69cb2a7734
2 changed files with 61 additions and 8 deletions
--- a/PFERD/crawl/ilias/file_templates.py
+++ b/PFERD/crawl/ilias/file_templates.py
@@ -1,5 +1,8 @@
-link_template_plain = "{{link}}"
-link_template_rich = """
+from enum import Enum
+from typing import Optional
+
+_link_template_plain = "{{link}}"
+_link_template_fancy = """
 <!DOCTYPE html>
 <html lang="en">
    <head>
@@ -84,4 +87,35 @@ link_template_rich = """
        </div>
    </body>
 </html>
-"""  # noqa: E501 line too long
+""".strip()  # noqa: E501 line too long
+
+_link_template_internet_shortcut = """
+[InternetShortcut]
+URL={{link}}
+""".strip()  # INI-like shortcut body; the crawler substitutes "{{link}}" with the resolved URL
+
+
+class Links(Enum):  # kinds of link file to write; values are the accepted "links" config strings
+    IGNORE = "ignore"
+    PLAIN = "plain"
+    FANCY = "fancy"
+    INTERNET_SHORTCUT = "internet-shortcut"
+
+    def template(self) -> Optional[str]:  # template text for this kind; None for IGNORE
+        if self == self.FANCY:
+            return _link_template_fancy
+        elif self == self.PLAIN:
+            return _link_template_plain
+        elif self == self.INTERNET_SHORTCUT:
+            return _link_template_internet_shortcut
+        elif self == self.IGNORE:
+            return None
+        raise ValueError("Missing switch case")
+
+    @staticmethod
+    def from_string(string: str) -> "Links":
+        """Parse a config value; the ValueError message lists every accepted value."""
+        try:
+            return Links(string)
+        except ValueError:
+            raise ValueError("must be one of 'ignore', 'plain', 'fancy', 'internet-shortcut'")
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -14,7 +14,7 @@ from ...output_dir import FileSink, Redownload
 from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlWarning, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
-from .file_templates import link_template_plain, link_template_rich
+from .file_templates import Links
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names

 TargetType = Union[str, int]
@@ -52,8 +52,16 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
    def link_file_redirect_delay(self) -> int:
        return self.s.getint("link_file_redirect_delay", fallback=-1)

-    def link_file_use_plaintext(self) -> bool:
-        return self.s.getboolean("link_file_plaintext", fallback=False)
+    def links(self) -> Links:
+        """Return the configured link kind: FANCY when "links" is unset, else the parsed value."""
+        raw: Optional[str] = self.s.get("links")
+        if raw is None:
+            return Links.FANCY
+        try:
+            return Links.from_string(raw)
+        except ValueError as err:
+            reason = str(err).capitalize()
+            self.invalid_value("links", raw, reason)

    def videos(self) -> bool:
        return self.s.getboolean("videos", fallback=False)
@@ -166,7 +174,7 @@ class KitIliasWebCrawler(HttpCrawler):

        self._target = section.target()
        self._link_file_redirect_delay = section.link_file_redirect_delay()
-        self._link_file_use_plaintext = section.link_file_use_plaintext()
+        self._links = section.links()
        self._videos = section.videos()

    async def _run(self) -> None:
@@ -292,6 +300,17 @@ class KitIliasWebCrawler(HttpCrawler):
            raise CrawlWarning(f"Unknown element type: {element.type!r}")

    async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
+        log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
+        log.explain(f"Links type is {self._links}")
+
+        link_template_maybe = self._links.template()
+        if not link_template_maybe:
+            log.explain("Answer: No")
+            return
+        else:
+            log.explain("Answer: Yes")
+        link_template = link_template_maybe
+
        maybe_dl = await self.download(element_path, mtime=element.mtime)
        if not maybe_dl:
            return
@@ -303,7 +322,7 @@ class KitIliasWebCrawler(HttpCrawler):
                export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
                real_url = await self._resolve_link_target(export_url)

-                content = link_template_plain if self._link_file_use_plaintext else link_template_rich
+                content = link_template
                content = content.replace("{{link}}", real_url)
                content = content.replace("{{name}}", element.name)
                content = content.replace("{{description}}", str(element.description))