Add support for ILIAS learning modules
commit 68c398f1fe
parent 123a57beec
CHANGELOG.md

@@ -30,6 +30,7 @@ ambiguous situations.
 
 ### Added
 - `no-delete-prompt-override` conflict resolution strategy
+- support for ILIAS learning modules
 
 ## 3.4.3 - 2022-11-29
 
PFERD/crawl/ilias/file_templates.py

@@ -1,6 +1,10 @@
 from enum import Enum
 from typing import Optional
 
+import bs4
+
+from PFERD.utils import soupify
+
 _link_template_plain = "{{link}}"
 _link_template_fancy = """
 <!DOCTYPE html>
@@ -94,6 +98,71 @@ _link_template_internet_shortcut = """
 URL={{link}}
 """.strip()
 
+_learning_module_template = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>{{name}}</title>
+</head>
+
+<style>
+    * {
+        box-sizing: border-box;
+    }
+    .center-flex {
+        display: flex;
+        align-items: center;
+        justify-content: center;
+    }
+    .nav {
+        display: flex;
+        justify-content: space-between;
+    }
+</style>
+<body class="center-flex">
+{{body}}
+</body>
+</html>
+"""
+
+
+def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str:
+    # Seems to be comments, ignore those.
+    for elem in body.select(".il-copg-mob-fullscreen-modal"):
+        elem.decompose()
+
+    nav_template = """
+        <div class="nav">
+            {{left}}
+            {{right}}
+        </div>
+    """
+    if prev and body.select_one(".ilc_page_lnav_LeftNavigation"):
+        text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip()
+        left = f'<a href="{prev}">{text}</a>'
+    else:
+        left = "<span></span>"
+
+    if next and body.select_one(".ilc_page_rnav_RightNavigation"):
+        text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip()
+        right = f'<a href="{next}">{text}</a>'
+    else:
+        right = "<span></span>"
+
+    if top_nav := body.select_one(".ilc_page_tnav_TopNavigation"):
+        top_nav.replace_with(
+            soupify(nav_template.replace("{{left}}", left).replace("{{right}}", right).encode())
+        )
+
+    if bot_nav := body.select_one(".ilc_page_bnav_BottomNavigation"):
+        bot_nav.replace_with(soupify(nav_template.replace(
+            "{{left}}", left).replace("{{right}}", right).encode())
+        )
+
+    body = body.prettify()
+    return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name)
+
 
 class Links(Enum):
     IGNORE = "ignore"
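Note: a minimal sketch of how the new `learning_module_template` helper can be driven in isolation. The HTML fragment and file names below are fabricated, and the import path assumes PFERD's module layout; only `learning_module_template` and `soupify` come from this commit and `PFERD.utils`:

    import bs4

    from PFERD.crawl.ilias.file_templates import learning_module_template
    from PFERD.utils import soupify

    # Fabricated ILIAS page body carrying the navigation markers the helper looks for.
    body = soupify(b"""
    <div>
      <div class="ilc_page_tnav_TopNavigation"></div>
      <div class="ilc_page_lnav_LeftNavigation">Back</div>
      <div class="ilc_page_rnav_RightNavigation">Next</div>
      <p>Some page content</p>
      <div class="ilc_page_bnav_BottomNavigation"></div>
    </div>
    """).div

    # Produces a standalone HTML page whose nav links point at sibling files.
    print(learning_module_template(body, "01_basics.html", "00_intro.html", "02_advanced.html"))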
PFERD/crawl/ilias/ilias_html_cleaner.py

@@ -82,7 +82,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup:
             dummy.decompose()
         if len(children) > 1:
             continue
-        if type(children[0]) == Comment:
+        if isinstance(children[0], Comment):
             dummy.decompose()
 
     for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
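Note: `bs4` exposes HTML comments as `Comment` nodes (a `NavigableString` subclass), so a paragraph whose only child is a comment can be detected as sketched below, independent of the crawler:

    from bs4 import BeautifulSoup, Comment

    soup = BeautifulSoup("<p class='ilc_text_block_Standard'><!-- marker --></p>", "html.parser")
    children = list(soup.p.children)
    # A comment-only paragraph renders empty, so clean() drops it.
    print(isinstance(children[0], Comment))  # True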
PFERD/crawl/ilias/kit_ilias_html.py

@@ -22,6 +22,7 @@ class IliasElementType(Enum):
     FOLDER = "folder"
     FORUM = "forum"
     LINK = "link"
+    LEARNING_MODULE = "learning_module"
     BOOKING = "booking"
     MEETING = "meeting"
     SURVEY = "survey"
@@ -71,6 +72,14 @@ class IliasForumThread:
     mtime: Optional[datetime]
 
 
+@dataclass
+class IliasLearningModulePage:
+    title: str
+    content: Tag
+    next_url: Optional[str]
+    previous_url: Optional[str]
+
+
 class IliasPage:
 
     def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]):
@@ -136,6 +145,34 @@ class IliasPage:
 
         return BeautifulSoup(raw_html, "html.parser")
 
+    def get_learning_module_data(self) -> Optional[IliasLearningModulePage]:
+        if not self._is_learning_module_page():
+            return None
+        content = self._soup.select_one("#ilLMPageContent")
+        title = self._soup.select_one(".ilc_page_title_PageTitle").getText().strip()
+        return IliasLearningModulePage(
+            title=title,
+            content=content,
+            next_url=self._find_learning_module_next(),
+            previous_url=self._find_learning_module_prev()
+        )
+
+    def _find_learning_module_next(self) -> Optional[str]:
+        for link in self._soup.select("a.ilc_page_rnavlink_RightNavigationLink"):
+            url = self._abs_url_from_link(link)
+            if "baseClass=ilLMPresentationGUI" not in url:
+                continue
+            return url
+        return None
+
+    def _find_learning_module_prev(self) -> Optional[str]:
+        for link in self._soup.select("a.ilc_page_lnavlink_LeftNavigationLink"):
+            url = self._abs_url_from_link(link)
+            if "baseClass=ilLMPresentationGUI" not in url:
+                continue
+            return url
+        return None
+
     def get_download_forum_data(self) -> Optional[IliasDownloadForumData]:
         form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x})
         if not form:
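Note: a minimal sketch of how these accessors are meant to be driven; `raw_html` and `url` stand in for an already fetched ILIAS page:

    from bs4 import BeautifulSoup

    def show_learning_module_page(raw_html: str, url: str) -> None:
        page = IliasPage(BeautifulSoup(raw_html, "html.parser"), url, None)
        # Returns None for ordinary pages; otherwise follow next_url until it runs out.
        if lm := page.get_learning_module_data():
            print(lm.title, "->", lm.next_url)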
@@ -222,6 +259,12 @@ class IliasPage:
             return False
         return "target=copa_" in link.get("value")
 
+    def _is_learning_module_page(self) -> bool:
+        link = self._soup.find(id="current_perma_link")
+        if not link:
+            return False
+        return "target=pg_" in link.get("value")
+
     def _contains_collapsed_future_meetings(self) -> bool:
         return self._uncollapse_future_meetings_url() is not None
 
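Note: the check keys off the hidden permalink field ILIAS embeds in each page; the exact value shape below is an assumption for illustration, but the relevant part is the `target=pg_` marker:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<input id="current_perma_link" value="https://ilias.example.org/goto.php?target=pg_42_1234">',
        "html.parser",
    )
    link = soup.find(id="current_perma_link")
    print("target=pg_" in link.get("value"))  # True -> treated as a learning module page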
@@ -812,6 +855,9 @@ class IliasPage:
         if "cmdClass=ilobjtestgui" in parsed_url.query:
             return IliasElementType.TEST
 
+        if "baseClass=ilLMPresentationGUI" in parsed_url.query:
+            return IliasElementType.LEARNING_MODULE
+
         # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
         # try to guess it from the image.
 
PFERD/crawl/ilias/kit_ilias_web_crawler.py

@@ -1,8 +1,11 @@
 import asyncio
+import base64
+import os
 import re
 from collections.abc import Awaitable, Coroutine
 from pathlib import PurePath
-from typing import Any, Callable, Dict, List, Optional, Set, Union, cast
+from typing import Any, Callable, Dict, List, Literal, Optional, Set, Union, cast
+from urllib.parse import urljoin
 
 import aiohttp
 import yarl
@@ -16,10 +19,10 @@ from ...output_dir import FileSink, Redownload
 from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
-from .file_templates import Links
+from .file_templates import Links, learning_module_template
 from .ilias_html_cleaner import clean, insert_base_markup
-from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement,
-                             _sanitize_path_name, parse_ilias_forum_export)
+from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
+                             IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)
 
 TargetType = Union[str, int]
 
@@ -394,6 +397,8 @@ instance's greatest bottleneck.
                 "[bright_black](surveys contain no relevant data)"
             )
             return None
+        elif element.type == IliasElementType.LEARNING_MODULE:
+            return await self._handle_learning_module(element, element_path)
         elif element.type == IliasElementType.LINK:
             return await self._handle_link(element, element_path)
         elif element.type == IliasElementType.BOOKING:
@@ -739,6 +744,135 @@ instance's greatest bottleneck.
             sink.file.write(content.encode("utf-8"))
             sink.done()
 
+    async def _handle_learning_module(
+        self,
+        element: IliasPageElement,
+        element_path: PurePath,
+    ) -> Optional[Coroutine[Any, Any, None]]:
+        maybe_cl = await self.crawl(element_path)
+        if not maybe_cl:
+            return None
+        return self._crawl_learning_module(element, maybe_cl)
+
+    @_iorepeat(3, "crawling learning module")
+    @anoncritical
+    async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None:
+        elements: List[IliasLearningModulePage] = []
+
+        async with cl:
+            log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
+            log.explain(f"URL: {element.url}")
+            soup = await self._get_page(element.url)
+            page = IliasPage(soup, element.url, None)
+            if next := page.get_learning_module_data():
+                elements.extend(await self._crawl_learning_module_direction(
+                    cl.path, next.previous_url, "left"
+                ))
+                elements.append(next)
+                elements.extend(await self._crawl_learning_module_direction(
+                    cl.path, next.next_url, "right"
+                ))
+
+        # Reflect their natural ordering in the file names
+        for index, lm_element in enumerate(elements):
+            lm_element.title = f"{index:02}_{lm_element.title}"
+
+        tasks: List[Awaitable[None]] = []
+        for index, elem in enumerate(elements):
+            prev_url = elements[index - 1].title if index > 0 else None
+            next_url = elements[index + 1].title if index < len(elements) - 1 else None
+            tasks.append(asyncio.create_task(
+                self._download_learning_module_page(cl.path, elem, prev_url, next_url)
+            ))
+
+        # And execute them
+        await self.gather(tasks)
+
+    async def _crawl_learning_module_direction(
+        self,
+        path: PurePath,
+        start_url: Optional[str],
+        dir: Union[Literal["left"], Literal["right"]]
+    ) -> List[IliasLearningModulePage]:
+        elements: List[IliasLearningModulePage] = []
+
+        if not start_url:
+            return elements
+
+        next_element_url: Optional[str] = start_url
+        counter = 0
+        while next_element_url:
+            log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
+            log.explain(f"URL: {next_element_url}")
+            soup = await self._get_page(next_element_url)
+            page = IliasPage(soup, next_element_url, None)
+            if next := page.get_learning_module_data():
+                elements.append(next)
+                if dir == "left":
+                    next_element_url = next.previous_url
+                else:
+                    next_element_url = next.next_url
+            counter += 1
+
+        return elements
+
+    @anoncritical
+    @_iorepeat(3, "saving learning module page")
+    async def _download_learning_module_page(
+        self,
+        parent_path: PurePath,
+        element: IliasLearningModulePage,
+        prev: Optional[str],
+        next: Optional[str]
+    ) -> None:
+        path = parent_path / (_sanitize_path_name(element.title) + ".html")
+        maybe_dl = await self.download(path)
+        if not maybe_dl:
+            return
+        my_path = self._transformer.transform(maybe_dl.path)
+        if not my_path:
+            return
+
+        if prev:
+            prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
+            if prev_p:
+                prev = os.path.relpath(prev_p, my_path.parent)
+            else:
+                prev = None
+        if next:
+            next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
+            if next_p:
+                next = os.path.relpath(next_p, my_path.parent)
+            else:
+                next = None
+
+        async with maybe_dl as (bar, sink):
+            content = element.content
+            content = await self.internalize_images(content)
+            sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8"))
+            sink.done()
+
+    async def internalize_images(self, tag: Tag) -> Tag:
+        """
+        Tries to fetch ILIAS images and embed them as base64 data.
+        """
+        log.explain_topic("Internalizing images")
+        for elem in tag.find_all(recursive=True):
+            if not isinstance(elem, Tag):
+                continue
+            if elem.name == "img":
+                if src := elem.attrs.get("src", None):
+                    url = urljoin(_ILIAS_URL, src)
+                    if not url.startswith(_ILIAS_URL):
+                        continue
+                    log.explain(f"Internalizing {url!r}")
+                    img = await self._get_authenticated(url)
+                    elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
+            if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"):
+                # For unknown reasons the protocol seems to be stripped.
+                elem.attrs["src"] = "https:" + elem.attrs["src"]
+        return tag
+
     async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
         auth_id = await self._current_auth_id()
         async with self.session.get(url) as request:
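Note: the image inlining in `internalize_images` boils down to a data URI; a toy example of the encoding step, where the bytes stand in for whatever `_get_authenticated` returns:

    import base64

    img = b"\x89PNG\r\n\x1a\n..."  # toy image bytes
    src = "data:;base64," + base64.b64encode(img).decode()
    # yields a src like "data:;base64,<encoded bytes>", so the
    # <img> renders without any further network request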
@@ -772,7 +906,7 @@ instance's greatest bottleneck.
         self,
         url: str,
         data: dict[str, Union[str, List[str]]]
-    ) -> BeautifulSoup:
+    ) -> bytes:
         auth_id = await self._current_auth_id()
 
         form_data = aiohttp.FormData()
@@ -792,6 +926,22 @@ instance's greatest bottleneck.
                 return await request.read()
         raise CrawlError("post_authenticated failed even after authenticating")
 
+    async def _get_authenticated(self, url: str) -> bytes:
+        auth_id = await self._current_auth_id()
+
+        async with self.session.get(url, allow_redirects=False) as request:
+            if request.status == 200:
+                return await request.read()
+
+        # We weren't authenticated, so try to do that
+        await self.authenticate(auth_id)
+
+        # Retry once after authenticating. If this fails, we will die.
+        async with self.session.get(url, allow_redirects=False) as request:
+            if request.status == 200:
+                return await request.read()
+        raise CrawlError("get_authenticated failed even after authenticating")
+
     # We repeat this as the login method in shibboleth doesn't handle I/O errors.
     # Shibboleth is quite reliable as well, the repeat is likely not critical here.
     @_iorepeat(3, "Login", failure_is_error=True)