Add Links option to ilias crawler

This allows you to configure what format the link files should have and
whether to create them at all.
Author: I-Al-Istannen
Date:   2021-05-25 11:33:45 +02:00
Parent: c33de233dc
Commit: 69cb2a7734

2 changed files with 61 additions and 8 deletions
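
For context: after this commit the link-file format is chosen with a new
"links" config value (default "fancy"), replacing the old boolean
"link_file_plaintext" flag. A minimal sketch of a config excerpt, assuming
PFERD's INI-style config; the section name and the "type" key are
assumptions, only the "links" key and its values come from this commit:

# Hypothetical PFERD config excerpt (section name and "type" key are
# assumptions; the "links" key and its values come from this commit).
[crawl:my-ilias]
type = kit-ilias-web
# One of: ignore, plain, fancy, internet-shortcut. Omitting the key
# falls back to "fancy"; the old "link_file_plaintext" flag is gone.
links = internet-shortcut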

File: file_templates.py

@@ -1,5 +1,8 @@
-link_template_plain = "{{link}}"
-link_template_rich = """
+from enum import Enum
+from typing import Optional
+
+_link_template_plain = "{{link}}"
+_link_template_fancy = """
 <!DOCTYPE html>
 <html lang="en">
 <head>
@@ -84,4 +87,35 @@ link_template_rich = """
 </div>
 </body>
 </html>
-"""  # noqa: E501 line too long
+""".strip()  # noqa: E501 line too long
+
+_link_template_internet_shortcut = """
+[InternetShortcut]
+URL={{link}}
+""".strip()
+
+
+class Links(Enum):
+    IGNORE = "ignore"
+    PLAIN = "plain"
+    FANCY = "fancy"
+    INTERNET_SHORTCUT = "internet-shortcut"
+
+    def template(self) -> Optional[str]:
+        if self == self.FANCY:
+            return _link_template_fancy
+        elif self == self.PLAIN:
+            return _link_template_plain
+        elif self == self.INTERNET_SHORTCUT:
+            return _link_template_internet_shortcut
+        elif self == self.IGNORE:
+            return None
+        raise ValueError("Missing switch case")
+
+    @staticmethod
+    def from_string(string: str) -> "Links":
+        try:
+            return Links(string)
+        except ValueError:
+            raise ValueError("must be one of 'ignore', 'plain',"
+                             " 'fancy', 'internet-shortcut'")

File: kit_ilias_web_crawler.py

@@ -14,7 +14,7 @@ from ...output_dir import FileSink, Redownload
 from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlWarning, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
-from .file_templates import link_template_plain, link_template_rich
+from .file_templates import Links
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names
 
 TargetType = Union[str, int]
@@ -52,8 +52,16 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
     def link_file_redirect_delay(self) -> int:
         return self.s.getint("link_file_redirect_delay", fallback=-1)
 
-    def link_file_use_plaintext(self) -> bool:
-        return self.s.getboolean("link_file_plaintext", fallback=False)
+    def links(self) -> Links:
+        type_str: Optional[str] = self.s.get("links")
+
+        if type_str is None:
+            return Links.FANCY
+
+        try:
+            return Links.from_string(type_str)
+        except ValueError as e:
+            self.invalid_value("links", type_str, str(e).capitalize())
 
     def videos(self) -> bool:
         return self.s.getboolean("videos", fallback=False)
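
A standalone sketch of the lookup behaviour above: a missing "links" key
falls back to Links.FANCY, while an invalid value lets from_string's
ValueError message reach invalid_value. Here configparser stands in for
PFERD's section wrapper, and the import path is assumed:

from configparser import ConfigParser

from PFERD.crawlers.ilias.file_templates import Links  # path assumed

parser = ConfigParser()
parser.read_string("[crawl:ilias]\n")  # note: no "links" key set

section = parser["crawl:ilias"]
type_str = section.get("links")  # None when the key is missing
links = Links.FANCY if type_str is None else Links.from_string(type_str)
assert links is Links.FANCY  # the documented default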
@@ -166,7 +174,7 @@ class KitIliasWebCrawler(HttpCrawler):
         self._target = section.target()
         self._link_file_redirect_delay = section.link_file_redirect_delay()
-        self._link_file_use_plaintext = section.link_file_use_plaintext()
+        self._links = section.links()
         self._videos = section.videos()
 
     async def _run(self) -> None:
@@ -292,6 +300,17 @@
             raise CrawlWarning(f"Unknown element type: {element.type!r}")
 
     async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
+        log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
+        log.explain(f"Links type is {self._links}")
+
+        link_template_maybe = self._links.template()
+        if not link_template_maybe:
+            log.explain("Answer: No")
+            return
+        else:
+            log.explain("Answer: Yes")
+        link_template = link_template_maybe
+
         maybe_dl = await self.download(element_path, mtime=element.mtime)
         if not maybe_dl:
             return
@@ -303,7 +322,7 @@
             export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
             real_url = await self._resolve_link_target(export_url)
 
-            content = link_template_plain if self._link_file_use_plaintext else link_template_rich
+            content = link_template
             content = content.replace("{{link}}", real_url)
             content = content.replace("{{name}}", element.name)
             content = content.replace("{{description}}", str(element.description))