diff --git a/CHANGELOG.md b/CHANGELOG.md
index ee55659..6e3925c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,7 @@ ambiguous situations.
### Added
- `no-delete-prompt-override` conflict resolution strategy
+- support for ILIAS learning modules
## 3.4.3 - 2022-11-29
diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py
index 59123a2..b206461 100644
--- a/PFERD/crawl/ilias/file_templates.py
+++ b/PFERD/crawl/ilias/file_templates.py
@@ -1,6 +1,10 @@
from enum import Enum
from typing import Optional
+import bs4
+
+from PFERD.utils import soupify
+
_link_template_plain = "{{link}}"
_link_template_fancy = """
@@ -94,6 +98,71 @@ _link_template_internet_shortcut = """
URL={{link}}
""".strip()
+_learning_module_template = """
+
+
+
+
+ {{name}}
+
+
+
+
+{{body}}
+
+
+"""
+
+
+def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str:
+ # Seems to be comments, ignore those.
+ for elem in body.select(".il-copg-mob-fullscreen-modal"):
+ elem.decompose()
+
+ nav_template = """
+
+ {{left}}
+ {{right}}
+
+ """
+ if prev and body.select_one(".ilc_page_lnav_LeftNavigation"):
+ text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip()
+ left = f'<a href="{prev}">{text}</a>'
+ else:
+ left = ""
+
+ if next and body.select_one(".ilc_page_rnav_RightNavigation"):
+ text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip()
+ right = f'<a href="{next}">{text}</a>'
+ else:
+ right = ""
+
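+ # Swap ILIAS' own top/bottom navigation bars for simple prev/next links that work in the offline export.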
+ if top_nav := body.select_one(".ilc_page_tnav_TopNavigation"):
+ top_nav.replace_with(
+ soupify(nav_template.replace("{{left}}", left).replace("{{right}}", right).encode())
+ )
+
+ if bot_nav := body.select_one(".ilc_page_bnav_BottomNavigation"):
+ bot_nav.replace_with(soupify(nav_template.replace(
+ "{{left}}", left).replace("{{right}}", right).encode())
+ )
+
+ body = body.prettify()
+ return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name)
+
class Links(Enum):
IGNORE = "ignore"
diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py
index 5952309..5495304 100644
--- a/PFERD/crawl/ilias/ilias_html_cleaner.py
+++ b/PFERD/crawl/ilias/ilias_html_cleaner.py
@@ -82,7 +82,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup:
dummy.decompose()
if len(children) > 1:
continue
- if type(children[0]) == Comment:
+ if isinstance(children[0], Comment):
dummy.decompose()
for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index aed2069..46a8073 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -22,6 +22,7 @@ class IliasElementType(Enum):
FOLDER = "folder"
FORUM = "forum"
LINK = "link"
+ LEARNING_MODULE = "learning_module"
BOOKING = "booking"
MEETING = "meeting"
SURVEY = "survey"
@@ -71,6 +72,14 @@ class IliasForumThread:
mtime: Optional[datetime]
+@dataclass
+class IliasLearningModulePage:
+ title: str
+ content: Tag
+ next_url: Optional[str]
+ previous_url: Optional[str]
+
+
class IliasPage:
def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]):
@@ -136,6 +145,34 @@ class IliasPage:
return BeautifulSoup(raw_html, "html.parser")
+ def get_learning_module_data(self) -> Optional[IliasLearningModulePage]:
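+ # Returns the main content pane plus the previous/next page URLs, or None if this is not a learning module page.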
+ if not self._is_learning_module_page():
+ return None
+ content = self._soup.select_one("#ilLMPageContent")
+ title = self._soup.select_one(".ilc_page_title_PageTitle").getText().strip()
+ return IliasLearningModulePage(
+ title=title,
+ content=content,
+ next_url=self._find_learning_module_next(),
+ previous_url=self._find_learning_module_prev()
+ )
+
+ def _find_learning_module_next(self) -> Optional[str]:
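+ # Only follow links that lead back into the learning module presentation (baseClass=ilLMPresentationGUI).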
+ for link in self._soup.select("a.ilc_page_rnavlink_RightNavigationLink"):
+ url = self._abs_url_from_link(link)
+ if "baseClass=ilLMPresentationGUI" not in url:
+ continue
+ return url
+ return None
+
+ def _find_learning_module_prev(self) -> Optional[str]:
+ for link in self._soup.select("a.ilc_page_lnavlink_LeftNavigationLink"):
+ url = self._abs_url_from_link(link)
+ if "baseClass=ilLMPresentationGUI" not in url:
+ continue
+ return url
+ return None
+
def get_download_forum_data(self) -> Optional[IliasDownloadForumData]:
form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x})
if not form:
@@ -222,6 +259,12 @@ class IliasPage:
return False
return "target=copa_" in link.get("value")
+ def _is_learning_module_page(self) -> bool:
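+ # Learning module pages use a permanent link of the form "...&target=pg_<id>".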
+ link = self._soup.find(id="current_perma_link")
+ if not link:
+ return False
+ return "target=pg_" in link.get("value")
+
def _contains_collapsed_future_meetings(self) -> bool:
return self._uncollapse_future_meetings_url() is not None
@@ -812,6 +855,9 @@ class IliasPage:
if "cmdClass=ilobjtestgui" in parsed_url.query:
return IliasElementType.TEST
+ if "baseClass=ilLMPresentationGUI" in parsed_url.query:
+ return IliasElementType.LEARNING_MODULE
+
# Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
# try to guess it from the image.
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index ae49edc..f82d684 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -1,8 +1,11 @@
import asyncio
+import base64
+import os
import re
from collections.abc import Awaitable, Coroutine
from pathlib import PurePath
-from typing import Any, Callable, Dict, List, Optional, Set, Union, cast
+from typing import Any, Callable, Dict, List, Literal, Optional, Set, Union, cast
+from urllib.parse import urljoin
import aiohttp
import yarl
@@ -16,10 +19,10 @@ from ...output_dir import FileSink, Redownload
from ...utils import fmt_path, soupify, url_set_query_param
from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection
-from .file_templates import Links
+from .file_templates import Links, learning_module_template
from .ilias_html_cleaner import clean, insert_base_markup
-from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement,
- _sanitize_path_name, parse_ilias_forum_export)
+from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
+ IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)
TargetType = Union[str, int]
@@ -394,6 +397,8 @@ instance's greatest bottleneck.
"[bright_black](surveys contain no relevant data)"
)
return None
+ elif element.type == IliasElementType.LEARNING_MODULE:
+ return await self._handle_learning_module(element, element_path)
elif element.type == IliasElementType.LINK:
return await self._handle_link(element, element_path)
elif element.type == IliasElementType.BOOKING:
@@ -739,6 +744,135 @@ instance's greatest bottleneck.
sink.file.write(content.encode("utf-8"))
sink.done()
+ async def _handle_learning_module(
+ self,
+ element: IliasPageElement,
+ element_path: PurePath,
+ ) -> Optional[Coroutine[Any, Any, None]]:
+ maybe_cl = await self.crawl(element_path)
+ if not maybe_cl:
+ return None
+ return self._crawl_learning_module(element, maybe_cl)
+
+ @_iorepeat(3, "crawling learning module")
+ @anoncritical
+ async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None:
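+ # A learning module can be entered on any page, so walk the "previous" links to the left and the
+ # "next" links to the right of the entry page to collect every page.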
+ elements: List[IliasLearningModulePage] = []
+
+ async with cl:
+ log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
+ log.explain(f"URL: {element.url}")
+ soup = await self._get_page(element.url)
+ page = IliasPage(soup, element.url, None)
+ if next := page.get_learning_module_data():
+ elements.extend(await self._crawl_learning_module_direction(
+ cl.path, next.previous_url, "left"
+ ))
+ elements.append(next)
+ elements.extend(await self._crawl_learning_module_direction(
+ cl.path, next.next_url, "right"
+ ))
+
+ # Reflect their natural ordering in the file names
+ for index, lm_element in enumerate(elements):
+ lm_element.title = f"{index:02}_{lm_element.title}"
+
+ tasks: List[Awaitable[None]] = []
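+ # Note: prev_url/next_url hold the *titles* of the adjacent pages; _download_learning_module_page
+ # turns them into relative file links.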
+ for index, elem in enumerate(elements):
+ prev_url = elements[index - 1].title if index > 0 else None
+ next_url = elements[index + 1].title if index < len(elements) - 1 else None
+ tasks.append(asyncio.create_task(
+ self._download_learning_module_page(cl.path, elem, prev_url, next_url)
+ ))
+
+ # And execute them
+ await self.gather(tasks)
+
+ async def _crawl_learning_module_direction(
+ self,
+ path: PurePath,
+ start_url: Optional[str],
+ dir: Union[Literal["left"], Literal["right"]]
+ ) -> List[IliasLearningModulePage]:
+ elements: List[IliasLearningModulePage] = []
+
+ if not start_url:
+ return elements
+
+ next_element_url: Optional[str] = start_url
+ counter = 0
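+ # The counter is only used to label the log output.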
+ while next_element_url:
+ log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
+ log.explain(f"URL: {next_element_url}")
+ soup = await self._get_page(next_element_url)
+ page = IliasPage(soup, next_element_url, None)
+ if next := page.get_learning_module_data():
+ elements.append(next)
+ if dir == "left":
+ next_element_url = next.previous_url
+ else:
+ next_element_url = next.next_url
+ counter += 1
+
+ return elements
+
+ @anoncritical
+ @_iorepeat(3, "saving learning module page")
+ async def _download_learning_module_page(
+ self,
+ parent_path: PurePath,
+ element: IliasLearningModulePage,
+ prev: Optional[str],
+ next: Optional[str]
+ ) -> None:
+ path = parent_path / (_sanitize_path_name(element.title) + ".html")
+ maybe_dl = await self.download(path)
+ if not maybe_dl:
+ return
+ my_path = self._transformer.transform(maybe_dl.path)
+ if not my_path:
+ return
+
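+ # Run the neighbouring page names through the transformer so the generated prev/next links
+ # point at the files as they will actually be named on disk.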
+ if prev:
+ prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
+ if prev_p:
+ prev = os.path.relpath(prev_p, my_path.parent)
+ else:
+ prev = None
+ if next:
+ next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
+ if next_p:
+ next = os.path.relpath(next_p, my_path.parent)
+ else:
+ next = None
+
+ async with maybe_dl as (bar, sink):
+ content = element.content
+ content = await self.internalize_images(content)
+ sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8"))
+ sink.done()
+
+ async def internalize_images(self, tag: Tag) -> Tag:
+ """
+ Tries to fetch ILIAS images and embed them as base64 data.
+ """
+ log.explain_topic("Internalizing images")
+ for elem in tag.find_all(recursive=True):
+ if not isinstance(elem, Tag):
+ continue
+ if elem.name == "img":
+ if src := elem.attrs.get("src", None):
+ url = urljoin(_ILIAS_URL, src)
+ if not url.startswith(_ILIAS_URL):
+ continue
+ log.explain(f"Internalizing {url!r}")
+ img = await self._get_authenticated(url)
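+ # Embed the image as a base64 data URI; the media type is omitted, browsers generally sniff the actual image type.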
+ elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
+ if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"):
+ # For unknown reasons the protocol seems to be stripped.
+ elem.attrs["src"] = "https:" + elem.attrs["src"]
+ return tag
+
async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
auth_id = await self._current_auth_id()
async with self.session.get(url) as request:
@@ -772,7 +906,7 @@ instance's greatest bottleneck.
self,
url: str,
data: dict[str, Union[str, List[str]]]
- ) -> BeautifulSoup:
+ ) -> bytes:
auth_id = await self._current_auth_id()
form_data = aiohttp.FormData()
@@ -792,6 +926,22 @@ instance's greatest bottleneck.
return await request.read()
raise CrawlError("post_authenticated failed even after authenticating")
+ async def _get_authenticated(self, url: str) -> bytes:
+ auth_id = await self._current_auth_id()
+
+ async with self.session.get(url, allow_redirects=False) as request:
+ if request.status == 200:
+ return await request.read()
+
+ # We weren't authenticated, so try to do that
+ await self.authenticate(auth_id)
+
+ # Retry once after authenticating. If this fails, we will die.
+ async with self.session.get(url, allow_redirects=False) as request:
+ if request.status == 200:
+ return await request.read()
+ raise CrawlError("get_authenticated failed even after authenticating")
+
# We repeat this as the login method in shibboleth doesn't handle I/O errors.
# Shibboleth is quite reliable as well, the repeat is likely not critical here.
@_iorepeat(3, "Login", failure_is_error=True)