Support ILIAS 9

This commit is contained in:
I-Al-Istannen
2025-04-12 14:54:58 +02:00
parent 63f25277b0
commit f6bdeb6b9d
3 changed files with 511 additions and 284 deletions

View File

@ -3,20 +3,100 @@ import re
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from enum import Enum
from typing import Dict, Optional, Union, cast
from typing import Callable, Dict, Optional, Union, cast
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup, Tag
from PFERD.crawl import CrawlError
from PFERD.crawl.crawler import CrawlWarning
from PFERD.logging import log
from PFERD.utils import url_set_query_params
TargetType = Union[str, int]
class TypeMatcher:
class UrlPath:
path: str
def __init__(self, path: str):
self.path = path
class UrlParameter:
query: str
def __init__(self, query: str):
self.query = query
class ImgSrc:
src: str
def __init__(self, src: str):
self.src = src
class ImgAlt:
alt: str
def __init__(self, alt: str):
self.alt = alt
class All:
matchers: list['IliasElementMatcher']
def __init__(self, matchers: list['IliasElementMatcher']):
self.matchers = matchers
class Any:
matchers: list['IliasElementMatcher']
def __init__(self, matchers: list['IliasElementMatcher']):
self.matchers = matchers
@staticmethod
def path(path: str) -> UrlPath:
return TypeMatcher.UrlPath(path)
@staticmethod
def query(query: str) -> UrlParameter:
return TypeMatcher.UrlParameter(query)
@staticmethod
def img_src(src: str) -> ImgSrc:
return TypeMatcher.ImgSrc(src)
@staticmethod
def img_alt(alt: str) -> ImgAlt:
return TypeMatcher.ImgAlt(alt)
@staticmethod
def all(*matchers: 'IliasElementMatcher') -> All:
return TypeMatcher.All(list(matchers))
@staticmethod
def any(*matchers: 'IliasElementMatcher') -> Any:
return TypeMatcher.Any(list(matchers))
@staticmethod
def never() -> Any:
return TypeMatcher.Any([])
IliasElementMatcher = (
TypeMatcher.UrlPath
| TypeMatcher.UrlParameter
| TypeMatcher.ImgSrc
| TypeMatcher.ImgAlt
| TypeMatcher.All
| TypeMatcher.Any
)
class IliasElementType(Enum):
BLOG = "blog"
BOOKING = "booking"
COURSE = "course"
DCL_RECORD_LIST = "dcl_record_list"
EXERCISE = "exercise"
EXERCISE_FILES = "exercise_files" # own submitted files
FILE = "file"
@ -25,7 +105,10 @@ class IliasElementType(Enum):
FORUM_THREAD = "forum_thread"
INFO_TAB = "info_tab"
LEARNING_MODULE = "learning_module"
LEARNING_MODULE_HTML = "learning_module_html"
LITERATURE_LIST = "literature_list"
LINK = "link"
MEDIA_POOL = "media_pool"
MEDIACAST_VIDEO = "mediacast_video"
MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
MEETING = "meeting"
@ -38,6 +121,131 @@ class IliasElementType(Enum):
SURVEY = "survey"
TEST = "test" # an online test. Will be ignored currently.
def matcher(self) -> IliasElementMatcher:
match self:
case IliasElementType.BLOG:
return TypeMatcher.any(
TypeMatcher.img_src("_blog.svg")
)
case IliasElementType.BOOKING:
return TypeMatcher.any(
TypeMatcher.path("/book/"),
TypeMatcher.img_src("_book.svg")
)
case IliasElementType.COURSE:
return TypeMatcher.any(TypeMatcher.path("/crs/"), TypeMatcher.img_src("_crsr.svg"))
case IliasElementType.DCL_RECORD_LIST:
return TypeMatcher.any(
TypeMatcher.img_src("_dcl.svg"),
TypeMatcher.query("cmdclass=ildclrecordlistgui")
)
case IliasElementType.EXERCISE:
return TypeMatcher.any(
TypeMatcher.path("/exc/"),
TypeMatcher.path("_exc_"),
TypeMatcher.img_src("_exc.svg"),
)
case IliasElementType.EXERCISE_FILES:
return TypeMatcher.never()
case IliasElementType.FILE:
return TypeMatcher.any(
TypeMatcher.query("cmd=sendfile"),
TypeMatcher.path("_file_"),
TypeMatcher.img_src("/filedelivery/"),
)
case IliasElementType.FOLDER:
return TypeMatcher.any(
TypeMatcher.path("/fold/"),
TypeMatcher.img_src("_fold.svg"),
TypeMatcher.path("/grp/"),
TypeMatcher.img_src("_grp.svg"),
TypeMatcher.path("/copa/"),
TypeMatcher.path("_copa_"),
TypeMatcher.img_src("_copa.svg"),
# Not supported right now but warn users
# TypeMatcher.query("baseclass=ilmediapoolpresentationgui"),
# TypeMatcher.img_alt("medienpool"),
# TypeMatcher.img_src("_mep.svg"),
)
case IliasElementType.FORUM:
return TypeMatcher.any(
TypeMatcher.path("/frm/"),
TypeMatcher.path("_frm_"),
TypeMatcher.img_src("_frm.svg"),
)
case IliasElementType.FORUM_THREAD:
return TypeMatcher.never()
case IliasElementType.INFO_TAB:
return TypeMatcher.never()
case IliasElementType.LITERATURE_LIST:
return TypeMatcher.img_src("_bibl.svg")
case IliasElementType.LEARNING_MODULE:
return TypeMatcher.any(
TypeMatcher.path("/lm/"),
TypeMatcher.img_src("_lm.svg")
)
case IliasElementType.LEARNING_MODULE_HTML:
return TypeMatcher.any(
TypeMatcher.query("baseclass=ilhtlmpresentationgui"),
TypeMatcher.img_src("_htlm.svg")
)
case IliasElementType.LINK:
return TypeMatcher.any(
TypeMatcher.all(
TypeMatcher.query("baseclass=illinkresourcehandlergui"),
TypeMatcher.query("calldirectlink"),
),
TypeMatcher.img_src("_webr.svg")
)
case IliasElementType.MEDIA_POOL:
return TypeMatcher.any(
TypeMatcher.query("baseclass=ilmediapoolpresentationgui"),
TypeMatcher.img_src("_mep.svg")
)
case IliasElementType.MEDIACAST_VIDEO:
return TypeMatcher.never()
case IliasElementType.MEDIACAST_VIDEO_FOLDER:
return TypeMatcher.any(
TypeMatcher.path("/mcst/"),
TypeMatcher.query("baseclass=ilmediacasthandlergui"),
TypeMatcher.img_src("_mcst.svg")
)
case IliasElementType.MEETING:
return TypeMatcher.any(
TypeMatcher.img_src("_sess.svg")
)
case IliasElementType.MOB_VIDEO:
return TypeMatcher.never()
case IliasElementType.OPENCAST_VIDEO:
return TypeMatcher.never()
case IliasElementType.OPENCAST_VIDEO_FOLDER:
return TypeMatcher.never()
case IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED:
return TypeMatcher.img_alt("opencast")
case IliasElementType.OPENCAST_VIDEO_PLAYER:
return TypeMatcher.never()
case IliasElementType.SCORM_LEARNING_MODULE:
return TypeMatcher.any(
TypeMatcher.query("baseclass=ilsahspresentationgui"),
TypeMatcher.img_src("_sahs.svg")
)
case IliasElementType.SURVEY:
return TypeMatcher.any(
TypeMatcher.path("/svy/"),
TypeMatcher.img_src("svy.svg")
)
case IliasElementType.TEST:
return TypeMatcher.any(
TypeMatcher.query("cmdclass=ilobjtestgui"),
TypeMatcher.query("cmdclass=iltestscreengui"),
TypeMatcher.img_src("_tst.svg")
)
raise CrawlWarning(f"Unknown matcher {self}")
@dataclass
class IliasPageElement:
@ -50,11 +258,20 @@ class IliasPageElement:
def id(self) -> str:
regexes = [
r"eid=(?P<id>[0-9a-z\-]+)",
r"file_(?P<id>\d+)",
r"copa_(?P<id>\d+)",
r"fold_(?P<id>\d+)",
r"frm_(?P<id>\d+)",
r"exc_(?P<id>\d+)",
r"book/(?P<id>\d+)", # booking
r"cat/(?P<id>\d+)",
r"copa/(?P<id>\d+)", # content page
r"crs/(?P<id>\d+)", # course
r"exc/(?P<id>\d+)", # exercise
r"file/(?P<id>\d+)", # file
r"fold/(?P<id>\d+)", # folder
r"frm/(?P<id>\d+)", # forum
r"grp/(?P<id>\d+)", # group
r"lm/(?P<id>\d+)", # learning module
r"mcst/(?P<id>\d+)", # mediacast
r"pg/(?P<id>(\d|_)+)", # page?
r"svy/(?P<id>\d+)", # survey
r"webr/(?P<id>\d+)", # web referene (link)
r"thr_pk=(?P<id>\d+)", # forums
r"ref_id=(?P<id>\d+)",
r"target=[a-z]+_(?P<id>\d+)",
@ -139,18 +356,28 @@ class IliasLearningModulePage:
previous_url: Optional[str]
class IliasSoup:
soup: BeautifulSoup
page_url: str
def __init__(self, soup: BeautifulSoup, page_url: str):
self.soup = soup
self.page_url = page_url
class IliasPage:
def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]):
self._soup = soup
self._page_url = _page_url
def __init__(self, ilias_soup: IliasSoup, source_element: Optional[IliasPageElement]):
self._ilias_soup = ilias_soup
self._soup = ilias_soup.soup
self._page_url = ilias_soup.page_url
self._page_type = source_element.type if source_element else None
self._source_name = source_element.name if source_element else ""
@staticmethod
def is_root_page(soup: BeautifulSoup) -> bool:
def is_root_page(soup: IliasSoup) -> bool:
if permalink := IliasPage.get_soup_permalink(soup):
return "goto.php?target=root_" in permalink
return "goto.php/root/" in permalink
return False
def get_child_elements(self) -> list[IliasPageElement]:
@ -193,7 +420,10 @@ class IliasPage:
def get_description(self) -> Optional[BeautifulSoup]:
def is_interesting_class(name: str) -> bool:
return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
return name in [
"ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap",
"ilc_va_ihcap_AccordIHeadCap", "ilc_media_cont_MediaContainer"
]
paragraphs: list[Tag] = cast(list[Tag], self._soup.find_all(class_=is_interesting_class))
if not paragraphs:
@ -206,6 +436,21 @@ class IliasPage:
for p in paragraphs:
if p.find_parent(class_=is_interesting_class):
continue
if "ilc_media_cont_MediaContainer" in p["class"]:
# We have an embedded video which should be downloaded by _find_mob_videos
if video := p.select_one("video"):
url, title = self._find_mob_video_url_title(video, p)
raw_html += '<div style="min-width: 100px; min-height: 100px; border: 1px solid black;'
raw_html += 'display: flex; justify-content: center; align-items: center;'
raw_html += ' margin: 0.5rem;">'
if url is not None and urlparse(url).hostname != urlparse(self._page_url).hostname:
if url.startswith("//"):
url = "https:" + url
raw_html += f'<a href="{url}" target="_blank">External Video: {title}</a>'
else:
raw_html += f"Video elided. Filename: '{title}'."
raw_html += "</div>\n"
continue
# Ignore special listings (like folder groupings)
if "ilc_section_Special" in p["class"]:
@ -336,7 +581,7 @@ class IliasPage:
def _is_forum_page(self) -> bool:
if perma_link := self.get_permalink():
return "target=frm_" in perma_link
return "/frm/" in perma_link
return False
def _is_video_player(self) -> bool:
@ -378,7 +623,7 @@ class IliasPage:
def _is_content_page(self) -> bool:
if link := self.get_permalink():
return "target=copa_" in link
return "/copa/" in link
return False
def _is_learning_module_page(self) -> bool:
@ -513,19 +758,17 @@ class IliasPage:
# Configure button/link does not have anything interesting
continue
type = self._find_type_from_link(name, link, url)
if not type:
typ = IliasPage._find_type_for_element(
name, url, lambda: IliasPage._find_icon_for_folder_entry(link)
)
if not typ:
_unexpected_html_warning()
log.warn_contd(f"Could not extract type for {link}")
continue
log.explain(f"Found {name!r}")
log.explain(f"Found {name!r} of type {typ}")
if type == IliasElementType.FILE and "_download" not in url:
url = re.sub(r"(target=file_\d+)", r"\1_download", url)
log.explain("Rewired file URL to include download part")
items.append(IliasPageElement.create_new(type, url, name))
items.append(IliasPageElement.create_new(typ, url, name))
return items
@ -786,15 +1029,17 @@ class IliasPage:
for link in links:
abs_url = self._abs_url_from_link(link)
# Make sure parents are sanitized. We do not want accidental parents
parents = [_sanitize_path_name(x) for x in self._find_upwards_folder_hierarchy(link)]
parents = [_sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
if parents:
element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text())
else:
element_name = _sanitize_path_name(link.get_text())
element_type = self._find_type_from_link(element_name, link, abs_url)
description = self._find_link_description(link)
element_type = IliasPage._find_type_for_element(
element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(link)
)
description = IliasPage._find_link_description(link)
# The last meeting on every page is expanded by default.
# Its content is then shown inline *and* in the meeting page itself.
@ -805,10 +1050,10 @@ class IliasPage:
if not element_type:
continue
elif element_type == IliasElementType.FILE:
result.append(self._file_to_element(element_name, abs_url, link))
result.append(IliasPage._file_to_element(element_name, abs_url, link))
continue
log.explain(f"Found {element_name!r}")
log.explain(f"Found {element_name!r} of type {element_type}")
result.append(IliasPageElement.create_new(
element_type,
abs_url,
@ -826,50 +1071,60 @@ class IliasPage:
def _find_mediacast_videos(self) -> list[IliasPageElement]:
videos: list[IliasPageElement] = []
for elem in cast(list[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")):
element_name = _sanitize_path_name(
cast(Tag, elem.select_one(".ilPlayerPreviewDescription")).get_text().strip()
)
if not element_name.endswith(".mp4"):
# just to make sure it has some kinda-alrightish ending
element_name = element_name + ".mp4"
video_element = cast(Optional[Tag], elem.find(name="video"))
if not video_element:
_unexpected_html_warning()
log.warn_contd(f"No <video> element found for mediacast video '{element_name}'")
continue
regex = re.compile(r"il\.VideoPlaylist\.init.+?\[(.+?)], ")
for script in cast(list[Tag], self._soup.find_all("script")):
for match in regex.finditer(script.text):
try:
playlist = json.loads("[" + match.group(1) + "]")
except json.JSONDecodeError:
log.warn("Could not decode playlist json")
log.warn_contd(f"Playlist json: [{match.group(1)}]")
continue
for elem in playlist:
title = elem.get("title", None)
description = elem.get("description", None)
url = elem.get("resource", None)
if title is None or description is None or url is None:
log.explain(f"Mediacast json: {match.group(1)}")
log.warn("Mediacast video json was not complete")
if title is None:
log.warn_contd("Missing title")
if description is None:
log.warn_contd("Missing description")
if url is None:
log.warn_contd("Missing URL")
videos.append(IliasPageElement.create_new(
typ=IliasElementType.MEDIACAST_VIDEO,
url=self._abs_url_from_relative(cast(str, video_element.get("src"))),
name=element_name,
mtime=self._find_mediacast_video_mtime(cast(Tag, elem.find_parent(name="td")))
))
if not title.endswith(".mp4") and not title.endswith(".webm"):
# just to make sure it has some kinda-alrightish ending
title = title + ".mp4"
videos.append(IliasPageElement.create_new(
typ=IliasElementType.MEDIACAST_VIDEO,
url=self._abs_url_from_relative(cast(str, url)),
name=_sanitize_path_name(title)
))
return videos
def _find_mob_videos(self) -> list[IliasPageElement]:
videos: list[IliasPageElement] = []
for figure in self._soup.select("figure.ilc_media_cont_MediaContainerHighlighted"):
title = cast(Tag, figure.select_one("figcaption")).get_text().strip() + ".mp4"
selector = "figure.ilc_media_cont_MediaContainerHighlighted,figure.ilc_media_cont_MediaContainer"
for figure in self._soup.select(selector):
video_element = figure.select_one("video")
if not video_element:
_unexpected_html_warning()
log.warn_contd(f"No <video> element found for mob video '{title}'")
continue
url = None
for source in video_element.select("source"):
if source.get("type", "") == "video/mp4":
url = cast(Optional[str], source.get("src"))
break
url, title = self._find_mob_video_url_title(video_element, figure)
if url is None:
_unexpected_html_warning()
log.warn_contd(f"No <source> element found for mob video '{title}'")
continue
if urlparse(url).hostname != urlparse(self._page_url).hostname:
log.explain(f"Found external video at {url}, ignoring")
continue
videos.append(IliasPageElement.create_new(
typ=IliasElementType.MOB_VIDEO,
url=self._abs_url_from_relative(url),
@ -879,18 +1134,26 @@ class IliasPage:
return videos
def _find_mediacast_video_mtime(self, enclosing_td: Tag) -> Optional[datetime]:
description_td = cast(Tag, enclosing_td.find_previous_sibling("td"))
if not description_td:
return None
def _find_mob_video_url_title(self, video_element: Tag, figure: Tag) -> tuple[Optional[str], str]:
url = None
for source in video_element.select("source"):
if source.get("type", "") == "video/mp4":
url = cast(Optional[str], source.get("src"))
break
meta_tag = cast(Optional[Tag], description_td.find_all("p")[-1])
if not meta_tag:
return None
if url is None and video_element.get("src"):
url = cast(Optional[str], video_element.get("src"))
updated_str = meta_tag.get_text().strip().replace("\n", " ")
updated_str = re.sub(".+?: ", "", updated_str)
return demangle_date(updated_str)
fig_caption = cast(Optional[Tag], figure.select_one("figcaption"))
if fig_caption:
title = cast(Tag, figure.select_one("figcaption")).get_text().strip() + ".mp4"
elif url is not None:
path = urlparse(self._abs_url_from_relative(url)).path
title = path.rsplit("/", 1)[-1]
else:
title = f"unknown video {figure}"
return url, title
def _is_in_expanded_meeting(self, tag: Tag) -> bool:
"""
@ -907,12 +1170,17 @@ class IliasPage:
# We should not crawl files under meetings
if "ilContainerListItemContentCB" in cast(str, parent.get("class")):
link: Tag = parent.parent.find("a") # type: ignore
type = IliasPage._find_type_from_folder_like(link, self._page_url)
return type == IliasElementType.MEETING
typ = IliasPage._find_type_for_element(
"meeting",
self._abs_url_from_link(link),
lambda: IliasPage._find_icon_for_folder_entry(link)
)
return typ == IliasElementType.MEETING
return False
def _find_upwards_folder_hierarchy(self, tag: Tag) -> list[str]:
@staticmethod
def _find_upwards_folder_hierarchy(tag: Tag) -> list[str]:
"""
Interprets accordions and expandable blocks as virtual folders and returns them
in order. This allows us to find a file named "Test" in an accordion "Acc" as "Acc/Test"
@ -953,13 +1221,16 @@ class IliasPage:
if outer_accordion_content:
accordion_tag = cast(Tag, outer_accordion_content.parent)
head_tag = cast(Tag, accordion_tag.find(attrs={
"class": lambda x: x is not None and "ilc_va_ihead_VAccordIHead" in x
"class": lambda x: x is not None and (
"ilc_va_ihead_VAccordIHead" in x or "ilc_va_ihead_AccordIHead" in x
)
}))
found_titles.append(head_tag.get_text().strip())
return [_sanitize_path_name(x) for x in reversed(found_titles)]
def _find_link_description(self, link: Tag) -> Optional[str]:
@staticmethod
def _find_link_description(link: Tag) -> Optional[str]:
tile = cast(
Tag,
link.find_parent("div", {"class": lambda x: x is not None and "il_ContainerListItem" in x})
@ -974,7 +1245,8 @@ class IliasPage:
return None
return description_element.get_text().strip()
def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement:
@staticmethod
def _file_to_element(name: str, url: str, link_element: Tag) -> IliasPageElement:
# Files have a list of properties (type, modification date, size, etc.)
# In a series of divs.
# Find the parent containing all those divs, so we can filter our what we need
@ -1007,27 +1279,38 @@ class IliasPage:
for title in card_titles:
url = self._abs_url_from_link(title)
name = _sanitize_path_name(title.get_text().strip())
type = self._find_type_from_card(title)
typ = IliasPage._find_type_for_element(
name, url, lambda: IliasPage._find_icon_from_card(title)
)
if not type:
if not typ:
_unexpected_html_warning()
log.warn_contd(f"Could not extract type for {title}")
continue
result.append(IliasPageElement.create_new(type, url, name))
result.append(IliasPageElement.create_new(typ, url, name))
card_button_tiles: list[Tag] = self._soup.select(".card-title button")
for button in card_button_tiles:
regex = re.compile(button["id"] + r".*window.open\(['\"](.+?)['\"]") # type: ignore
res = regex.search(str(self._soup))
if not res:
signal_regex = re.compile("#" + str(button["id"]) + r"[\s\S]*?\.trigger\('(.+?)'")
signal_match = signal_regex.search(str(self._soup))
if not signal_match:
_unexpected_html_warning()
log.warn_contd(f"Could not find click handler target for {button}")
log.warn_contd(f"Could not find click handler signal for {button}")
continue
url = self._abs_url_from_relative(res.group(1))
signal = signal_match.group(1)
open_regex = re.compile(r"\.on\('" + signal + r"[\s\S]*?window.open\(['\"](.+?)['\"]")
open_match = open_regex.search(str(self._soup))
if not open_match:
_unexpected_html_warning()
log.warn_contd(f"Could not find click handler target for signal {signal} for {button}")
continue
url = self._abs_url_from_relative(open_match.group(1))
name = _sanitize_path_name(button.get_text().strip())
type = self._find_type_from_card(button)
typ = IliasPage._find_type_for_element(
name, url, lambda: IliasPage._find_icon_from_card(button)
)
caption_parent = cast(Tag, button.find_parent(
"div",
attrs={"class": lambda x: x is not None and "caption" in x},
@ -1038,143 +1321,59 @@ class IliasPage:
else:
description = None
if not type:
if not typ:
_unexpected_html_warning()
log.warn_contd(f"Could not extract type for {button}")
continue
result.append(IliasPageElement.create_new(type, url, name, description=description))
result.append(IliasPageElement.create_new(typ, url, name, description=description))
return result
def _find_type_from_card(self, card_title: Tag) -> Optional[IliasElementType]:
def is_card_root(element: Tag) -> bool:
return "il-card" in element["class"] and "thumbnail" in element["class"]
card_root: Optional[Tag] = None
# We look for the card root
for parent in card_title.parents:
if is_card_root(parent):
card_root = parent
break
if card_root is None:
_unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}")
return None
icon = cast(Tag, card_root.select_one(".il-card-repository-head .icon"))
if "opencast" in icon["class"] or "xoct" in icon["class"]:
return IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED
if "exc" in icon["class"]:
return IliasElementType.EXERCISE
if "grp" in icon["class"]:
return IliasElementType.FOLDER
if "webr" in icon["class"]:
return IliasElementType.LINK
if "book" in icon["class"]:
return IliasElementType.BOOKING
if "crsr" in icon["class"]:
return IliasElementType.COURSE
if "frm" in icon["class"]:
return IliasElementType.FORUM
if "sess" in icon["class"]:
return IliasElementType.MEETING
if "tst" in icon["class"]:
return IliasElementType.TEST
if "fold" in icon["class"]:
return IliasElementType.FOLDER
if "copa" in icon["class"]:
return IliasElementType.FOLDER
if "svy" in icon["class"]:
return IliasElementType.SURVEY
if "file" in icon["class"]:
return IliasElementType.FILE
if "mcst" in icon["class"]:
return IliasElementType.MEDIACAST_VIDEO_FOLDER
_unexpected_html_warning()
log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")
return None
@staticmethod
def _find_type_from_link(
def _find_type_for_element(
element_name: str,
link_element: Tag,
url: str
url: str,
icon_for_element: Callable[[], Optional[Tag]],
) -> Optional[IliasElementType]:
"""
Decides which sub crawler to use for a given top level element.
"""
parsed_url = urlparse(url)
icon = icon_for_element()
# file URLs contain "target=file"
if "target=file_" in parsed_url.query:
return IliasElementType.FILE
def try_matcher(matcher: IliasElementMatcher) -> bool:
match matcher:
case TypeMatcher.All(matchers=ms):
return all(try_matcher(m) for m in ms)
case TypeMatcher.Any(matchers=ms):
return any(try_matcher(m) for m in ms)
case TypeMatcher.ImgAlt(alt=alt):
return icon is not None and alt in str(icon["alt"]).lower()
case TypeMatcher.ImgSrc(src=src):
return icon is not None and src in str(icon["src"]).lower()
case TypeMatcher.UrlPath(path=path):
return path in parsed_url.path.lower()
case TypeMatcher.UrlParameter(query=query):
return query in parsed_url.query.lower()
if "target=grp_" in parsed_url.query:
return IliasElementType.FOLDER
raise CrawlError(f"Unknown matcher {matcher}")
if "target=crs_" in parsed_url.query:
return IliasElementType.FOLDER
if "baseClass=ilExerciseHandlerGUI" in parsed_url.query:
return IliasElementType.EXERCISE
if "baseClass=ilLinkResourceHandlerGUI" in parsed_url.query and "calldirectlink" in parsed_url.query:
return IliasElementType.LINK
if "cmd=showThreads" in parsed_url.query or "target=frm_" in parsed_url.query:
return IliasElementType.FORUM
if "cmdClass=ilobjtestgui" in parsed_url.query:
return IliasElementType.TEST
if "baseClass=ilLMPresentationGUI" in parsed_url.query:
return IliasElementType.LEARNING_MODULE
if "baseClass=ilMediaCastHandlerGUI" in parsed_url.query:
return IliasElementType.MEDIACAST_VIDEO_FOLDER
if "baseClass=ilSAHSPresentationGUI" in parsed_url.query:
return IliasElementType.SCORM_LEARNING_MODULE
# other universities might have content type specified in URL path
if "_file_" in parsed_url.path:
return IliasElementType.FILE
if "_fold_" in parsed_url.path or "_copa_" in parsed_url.path:
return IliasElementType.FOLDER
if "_frm_" in parsed_url.path:
return IliasElementType.FORUM
if "_exc_" in parsed_url.path:
return IliasElementType.EXERCISE
# Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
# try to guess it from the image.
# Everything with a ref_id can *probably* be opened to reveal nested things
# video groups, directories, exercises, etc
if "ref_id=" in parsed_url.query or "goto.php" in parsed_url.path:
return IliasPage._find_type_from_folder_like(link_element, url)
for typ in IliasElementType:
if try_matcher(typ.matcher()):
return typ
_unexpected_html_warning()
log.warn_contd(
f"Tried to figure out element type, but failed for {element_name!r} / {link_element!r})"
)
log.warn_contd(f"Tried to figure out element type, but failed for {element_name!r} / {url!r})")
if "ref_id=" in parsed_url.query.lower() or "goto.php" in parsed_url.path.lower():
log.warn_contd("Defaulting to FOLDER as it contains a ref_id/goto")
return IliasElementType.FOLDER
return None
@staticmethod
def _find_type_from_folder_like(link_element: Tag, url: str) -> Optional[IliasElementType]:
"""
Try crawling something that looks like a folder.
"""
# pylint: disable=too-many-return-statements
def _find_icon_for_folder_entry(link_element: Tag) -> Optional[Tag]:
found_parent: Optional[Tag] = None
# We look for the outer div of our inner link, to find information around it
@ -1186,7 +1385,9 @@ class IliasPage:
if found_parent is None:
_unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url}")
log.warn_contd(
f"Tried to figure out element type, but did not find an icon for {link_element!r}"
)
return None
# Find the small descriptive icon to figure out the type
@ -1203,42 +1404,35 @@ class IliasPage:
log.explain("Found session expansion button, skipping it as it has no content")
return None
if img_tag is None:
_unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
return None
if img_tag is not None:
return img_tag
if "opencast" in str(img_tag["alt"]).lower():
return IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED
if str(img_tag["src"]).endswith("icon_exc.svg"):
return IliasElementType.EXERCISE
if str(img_tag["src"]).endswith("icon_webr.svg"):
return IliasElementType.LINK
if str(img_tag["src"]).endswith("icon_book.svg"):
return IliasElementType.BOOKING
if str(img_tag["src"]).endswith("frm.svg"):
return IliasElementType.FORUM
if str(img_tag["src"]).endswith("sess.svg"):
return IliasElementType.MEETING
if str(img_tag["src"]).endswith("icon_tst.svg"):
return IliasElementType.TEST
if str(img_tag["src"]).endswith("icon_mcst.svg"):
return IliasElementType.MEDIACAST_VIDEO_FOLDER
if str(img_tag["src"]).endswith("icon_sahs.svg"):
return IliasElementType.SCORM_LEARNING_MODULE
return IliasElementType.FOLDER
log.explain(f"Tried to figure out element type, but did not find an image for {link_element!r}")
return None
@staticmethod
def is_logged_in(soup: BeautifulSoup) -> bool:
def _find_icon_from_card(card_title: Tag) -> Optional[Tag]:
def is_card_root(element: Tag) -> bool:
return "il-card" in element["class"] and "thumbnail" in element["class"]
card_root: Optional[Tag] = None
# We look for the card root
for parent in card_title.parents:
if is_card_root(parent):
card_root = parent
break
if card_root is None:
_unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}")
return None
return cast(Tag, card_root.select_one(".il-card-repository-head .icon"))
@staticmethod
def is_logged_in(ilias_soup: IliasSoup) -> bool:
soup = ilias_soup.soup
# Normal ILIAS pages
mainbar = cast(Optional[Tag], soup.find(class_="il-maincontrols-metabar"))
if mainbar is not None:
@ -1285,7 +1479,7 @@ class IliasPage:
return None
def get_permalink(self) -> Optional[str]:
return IliasPage.get_soup_permalink(self._soup)
return IliasPage.get_soup_permalink(self._ilias_soup)
def _abs_url_from_link(self, link_tag: Tag) -> str:
"""
@ -1300,11 +1494,15 @@ class IliasPage:
return urljoin(self._page_url, relative_url)
@staticmethod
def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
perma_link_element = cast(Tag, soup.select_one(".il-footer-permanent-url > a"))
if not perma_link_element or not perma_link_element.get("href"):
return None
return cast(Optional[str], perma_link_element.get("href"))
def get_soup_permalink(ilias_soup: IliasSoup) -> Optional[str]:
scripts = cast(list[Tag], ilias_soup.soup.find_all("script"))
pattern = re.compile(r"il\.Footer\.permalink\.copyText\(\"(.+?)\"\)")
for script in scripts:
if match := pattern.search(script.text):
url = match.group(1)
url = url.replace(r"\/", "/")
return url
return None
def _unexpected_html_warning() -> None: