mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Add Links option to ilias crawler
This allows you to configure what type the link files should have and whether to create them at all.
This commit is contained in:
parent
c33de233dc
commit
69cb2a7734
@ -1,5 +1,8 @@
|
||||
link_template_plain = "{{link}}"
|
||||
link_template_rich = """
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
_link_template_plain = "{{link}}"
|
||||
_link_template_fancy = """
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
@ -84,4 +87,35 @@ link_template_rich = """
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
""" # noqa: E501 line too long
|
||||
""".strip() # noqa: E501 line too long
|
||||
|
||||
_link_template_internet_shortcut = """
|
||||
[InternetShortcut]
|
||||
URL={{link}}
|
||||
""".strip()
|
||||
|
||||
|
||||
class Links(Enum):
    """Kinds of link file that can be generated for a link element.

    Each member's value is the string that ``from_string`` accepts for it.
    """

    IGNORE = "ignore"
    PLAIN = "plain"
    FANCY = "fancy"
    INTERNET_SHORTCUT = "internet-shortcut"

    def template(self) -> Optional[str]:
        """Return the template text for this link type.

        Returns:
            The matching module-level template string, or ``None`` for
            ``IGNORE`` (which has no template).

        Raises:
            ValueError: If a member exists that this method does not handle.
        """
        # Compare against the class attributes with ``is``: enum members are
        # singletons, and this avoids looking members up through an instance.
        if self is Links.FANCY:
            return _link_template_fancy
        if self is Links.PLAIN:
            return _link_template_plain
        if self is Links.INTERNET_SHORTCUT:
            return _link_template_internet_shortcut
        if self is Links.IGNORE:
            return None
        raise ValueError("Missing switch case")

    @staticmethod
    def from_string(string: str) -> "Links":
        """Parse *string* into the ``Links`` member with that value.

        Raises:
            ValueError: If *string* is not the value of any member. The
                message lists every accepted value.
        """
        try:
            return Links(string)
        except ValueError as err:
            # Build the list from the enum itself so the message cannot drift
            # out of sync with the members (it used to advertise 'html', which
            # is not a valid value, instead of 'fancy').
            options = ", ".join(f"'{member.value}'" for member in Links)
            raise ValueError(f"must be one of {options}") from err
|
||||
|
@ -14,7 +14,7 @@ from ...output_dir import FileSink, Redownload
|
||||
from ...utils import fmt_path, soupify, url_set_query_param
|
||||
from ..crawler import CrawlError, CrawlWarning, anoncritical
|
||||
from ..http_crawler import HttpCrawler, HttpCrawlerSection
|
||||
from .file_templates import link_template_plain, link_template_rich
|
||||
from .file_templates import Links
|
||||
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names
|
||||
|
||||
TargetType = Union[str, int]
|
||||
@ -52,8 +52,16 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
|
||||
def link_file_redirect_delay(self) -> int:
    """Read the ``link_file_redirect_delay`` option as an int.

    ``-1`` is passed as the fallback to the section lookup.
    """
    delay = self.s.getint("link_file_redirect_delay", fallback=-1)
    return delay
|
||||
|
||||
def link_file_use_plaintext(self) -> bool:
    """Read the ``link_file_plaintext`` option as a bool.

    ``False`` is passed as the fallback to the section lookup.
    """
    use_plaintext = self.s.getboolean("link_file_plaintext", fallback=False)
    return use_plaintext
|
||||
def links(self) -> Links:
    """Read the ``links`` option and convert it to a ``Links`` member.

    When the option is absent, ``Links.FANCY`` is returned. A value that
    ``Links.from_string`` rejects is reported through ``self.invalid_value``
    together with the capitalised parser message.
    """
    raw_value: Optional[str] = self.s.get("links")
    if raw_value is None:
        return Links.FANCY

    try:
        parsed = Links.from_string(raw_value)
    except ValueError as err:
        # presumably invalid_value raises and never returns - verify against
        # its definition in the section base class.
        self.invalid_value("links", raw_value, str(err).capitalize())
    else:
        return parsed
|
||||
|
||||
def videos(self) -> bool:
    """Read the ``videos`` option as a bool.

    ``False`` is passed as the fallback to the section lookup.
    """
    want_videos = self.s.getboolean("videos", fallback=False)
    return want_videos
|
||||
@ -166,7 +174,7 @@ class KitIliasWebCrawler(HttpCrawler):
|
||||
|
||||
self._target = section.target()
|
||||
self._link_file_redirect_delay = section.link_file_redirect_delay()
|
||||
self._link_file_use_plaintext = section.link_file_use_plaintext()
|
||||
self._links = section.links()
|
||||
self._videos = section.videos()
|
||||
|
||||
async def _run(self) -> None:
|
||||
@ -292,6 +300,17 @@ class KitIliasWebCrawler(HttpCrawler):
|
||||
raise CrawlWarning(f"Unknown element type: {element.type!r}")
|
||||
|
||||
async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
|
||||
log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
|
||||
log.explain(f"Links type is {self._links}")
|
||||
|
||||
link_template_maybe = self._links.template()
|
||||
if not link_template_maybe:
|
||||
log.explain("Answer: No")
|
||||
return
|
||||
else:
|
||||
log.explain("Answer: Yes")
|
||||
link_template = link_template_maybe
|
||||
|
||||
maybe_dl = await self.download(element_path, mtime=element.mtime)
|
||||
if not maybe_dl:
|
||||
return
|
||||
@ -303,7 +322,7 @@ class KitIliasWebCrawler(HttpCrawler):
|
||||
export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
|
||||
real_url = await self._resolve_link_target(export_url)
|
||||
|
||||
content = link_template_plain if self._link_file_use_plaintext else link_template_rich
|
||||
content = link_template
|
||||
content = content.replace("{{link}}", real_url)
|
||||
content = content.replace("{{name}}", element.name)
|
||||
content = content.replace("{{description}}", str(element.description))
|
||||
|
Loading…
Reference in New Issue
Block a user