Compare commits

...

3 Commits

Author SHA1 Message Date
7291382430 Bump version to 3.8.0 2025-04-15 11:32:22 +02:00
1a430ad5d1 Update minimum Python version to 3.11 2025-04-15 11:31:39 +02:00
f6bdeb6b9d Support ILIAS 9 2025-04-15 11:19:53 +02:00
12 changed files with 529 additions and 311 deletions

View File

@ -14,7 +14,7 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
os: [ubuntu-latest, windows-latest, macos-13, macos-latest] os: [ubuntu-latest, windows-latest, macos-13, macos-latest]
python: ["3.9"] python: ["3.11"]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4

View File

@ -22,8 +22,14 @@ ambiguous situations.
## Unreleased ## Unreleased
## 3.8.0 - 2025-04-15
### Added
- Support for ILIAS 9
### Changed ### Changed
- Added prettier CSS to forum threads - Added prettier CSS to forum threads
- Increase minimum supported Python version to 3.11
### Fixed ### Fixed
- File links in report on Windows - File links in report on Windows

View File

@ -149,9 +149,7 @@ class CrawlerSection(Section):
return self.s.getboolean("skip", fallback=False) return self.s.getboolean("skip", fallback=False)
def output_dir(self, name: str) -> Path: def output_dir(self, name: str) -> Path:
# TODO Use removeprefix() after switching to 3.9 name = name.removeprefix("crawl:")
if name.startswith("crawl:"):
name = name[len("crawl:"):]
return Path(self.s.get("output_dir", name)).expanduser() return Path(self.s.get("output_dir", name)).expanduser()
def redownload(self) -> Redownload: def redownload(self) -> Redownload:

View File

@ -22,7 +22,7 @@ from .async_helper import _iorepeat
from .file_templates import Links, forum_thread_template, learning_module_template from .file_templates import Links, forum_thread_template, learning_module_template
from .ilias_html_cleaner import clean, insert_base_markup from .ilias_html_cleaner import clean, insert_base_markup
from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
from .shibboleth_login import ShibbolethLogin from .shibboleth_login import ShibbolethLogin
TargetType = Union[str, int] TargetType = Union[str, int]
@ -105,7 +105,6 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
_DIRECTORY_PAGES: Set[IliasElementType] = { _DIRECTORY_PAGES: Set[IliasElementType] = {
IliasElementType.COURSE,
IliasElementType.EXERCISE, IliasElementType.EXERCISE,
IliasElementType.EXERCISE_FILES, IliasElementType.EXERCISE_FILES,
IliasElementType.FOLDER, IliasElementType.FOLDER,
@ -267,12 +266,12 @@ instance's greatest bottleneck.
# If we expect to find a root course, enforce it # If we expect to find a root course, enforce it
if current_parent is None and expected_course_id is not None: if current_parent is None and expected_course_id is not None:
perma_link = IliasPage.get_soup_permalink(soup) perma_link = IliasPage.get_soup_permalink(soup)
if not perma_link or "crs_" not in perma_link: if not perma_link or "crs/" not in perma_link:
raise CrawlError("Invalid course id? Didn't find anything looking like a course") raise CrawlError("Invalid course id? Didn't find anything looking like a course")
if str(expected_course_id) not in perma_link: if str(expected_course_id) not in perma_link:
raise CrawlError(f"Expected course id {expected_course_id} but got {perma_link}") raise CrawlError(f"Expected course id {expected_course_id} but got {perma_link}")
page = IliasPage(soup, next_stage_url, current_parent) page = IliasPage(soup, current_parent)
if next_element := page.get_next_stage_element(): if next_element := page.get_next_stage_element():
current_parent = next_element current_parent = next_element
next_stage_url = next_element.url next_stage_url = next_element.url
@ -362,6 +361,54 @@ instance's greatest bottleneck.
"[bright_black](scorm learning modules are not supported)" "[bright_black](scorm learning modules are not supported)"
) )
return None return None
elif element.type == IliasElementType.LITERATURE_LIST:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](literature lists are not currently supported)"
)
return None
elif element.type == IliasElementType.LEARNING_MODULE_HTML:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](HTML learning modules are not supported)"
)
return None
elif element.type == IliasElementType.BLOG:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](blogs are not currently supported)"
)
return None
elif element.type == IliasElementType.DCL_RECORD_LIST:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](dcl record lists are not currently supported)"
)
return None
elif element.type == IliasElementType.MEDIA_POOL:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](media pools are not currently supported)"
)
return None
elif element.type == IliasElementType.COURSE:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](not descending into linked course, download it separately)"
)
return None
elif element.type == IliasElementType.LEARNING_MODULE: elif element.type == IliasElementType.LEARNING_MODULE:
return await self._handle_learning_module(element, element_path) return await self._handle_learning_module(element, element_path)
elif element.type == IliasElementType.LINK: elif element.type == IliasElementType.LINK:
@ -590,7 +637,7 @@ instance's greatest bottleneck.
) )
async with dl as (bar, sink): async with dl as (bar, sink):
page = IliasPage(await self._get_page(element.url), element.url, element) page = IliasPage(await self._get_page(element.url), element)
stream_elements = page.get_child_elements() stream_elements = page.get_child_elements()
if len(stream_elements) > 1: if len(stream_elements) > 1:
@ -600,7 +647,7 @@ instance's greatest bottleneck.
stream_element = stream_elements[0] stream_element = stream_elements[0]
# We do not have a local cache yet # We do not have a local cache yet
await self._stream_from_url(stream_element.url, sink, bar, is_video=True) await self._stream_from_url(stream_element, sink, bar, is_video=True)
add_to_report([str(self._transformer.transform(dl.path))]) add_to_report([str(self._transformer.transform(dl.path))])
return return
@ -615,7 +662,7 @@ instance's greatest bottleneck.
async with maybe_dl as (bar, sink): async with maybe_dl as (bar, sink):
log.explain(f"Streaming video from real url {stream_element.url}") log.explain(f"Streaming video from real url {stream_element.url}")
contained_video_paths.append(str(self._transformer.transform(maybe_dl.path))) contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
await self._stream_from_url(stream_element.url, sink, bar, is_video=True) await self._stream_from_url(stream_element, sink, bar, is_video=True)
add_to_report(contained_video_paths) add_to_report(contained_video_paths)
@ -637,12 +684,19 @@ instance's greatest bottleneck.
async def _download_file(self, element: IliasPageElement, dl: DownloadToken, is_video: bool) -> None: async def _download_file(self, element: IliasPageElement, dl: DownloadToken, is_video: bool) -> None:
assert dl # The function is only reached when dl is not None assert dl # The function is only reached when dl is not None
async with dl as (bar, sink): async with dl as (bar, sink):
await self._stream_from_url(element.url, sink, bar, is_video) await self._stream_from_url(element, sink, bar, is_video)
async def _stream_from_url(
self,
element: IliasPageElement,
sink: FileSink,
bar: ProgressBar,
is_video: bool
) -> None:
url = element.url
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
async def try_stream() -> bool: async def try_stream() -> bool:
next_url = url next_url = url
# Normal files redirect to the magazine if we are not authenticated. As files could be HTML, # Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
# we can not match on the content type here. Instead, we disallow redirects and inspect the # we can not match on the content type here. Instead, we disallow redirects and inspect the
# new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume # new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
@ -690,7 +744,7 @@ instance's greatest bottleneck.
await self.authenticate(auth_id) await self.authenticate(auth_id)
if not await try_stream(): if not await try_stream():
raise CrawlError("File streaming failed after authenticate()") raise CrawlError(f"File streaming failed after authenticate() {element!r}")
async def _handle_forum( async def _handle_forum(
self, self,
@ -716,7 +770,7 @@ instance's greatest bottleneck.
log.explain(f"URL: {next_stage_url}") log.explain(f"URL: {next_stage_url}")
soup = await self._get_page(next_stage_url) soup = await self._get_page(next_stage_url)
page = IliasPage(soup, next_stage_url, element) page = IliasPage(soup, element)
if next := page.get_next_stage_element(): if next := page.get_next_stage_element():
next_stage_url = next.url next_stage_url = next.url
@ -817,7 +871,7 @@ instance's greatest bottleneck.
log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}") log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {element.url}") log.explain(f"URL: {element.url}")
soup = await self._get_page(element.url) soup = await self._get_page(element.url)
page = IliasPage(soup, element.url, element) page = IliasPage(soup, element)
if next := page.get_learning_module_data(): if next := page.get_learning_module_data():
elements.extend(await self._crawl_learning_module_direction( elements.extend(await self._crawl_learning_module_direction(
cl.path, next.previous_url, "left", element cl.path, next.previous_url, "left", element
@ -860,7 +914,7 @@ instance's greatest bottleneck.
log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})") log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
log.explain(f"URL: {next_element_url}") log.explain(f"URL: {next_element_url}")
soup = await self._get_page(next_element_url) soup = await self._get_page(next_element_url)
page = IliasPage(soup, next_element_url, parent_element) page = IliasPage(soup, parent_element)
if next := page.get_learning_module_data(): if next := page.get_learning_module_data():
elements.append(next) elements.append(next)
if dir == "left": if dir == "left":
@ -891,13 +945,13 @@ instance's greatest bottleneck.
if prev: if prev:
prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html")) prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
if prev_p: if prev_p:
prev = os.path.relpath(prev_p, my_path.parent) prev = cast(str, os.path.relpath(prev_p, my_path.parent))
else: else:
prev = None prev = None
if next: if next:
next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html")) next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
if next_p: if next_p:
next = os.path.relpath(next_p, my_path.parent) next = cast(str, os.path.relpath(next_p, my_path.parent))
else: else:
next = None next = None
@ -937,10 +991,10 @@ instance's greatest bottleneck.
) )
self._visited_urls[element.url] = parent_path self._visited_urls[element.url] = parent_path
async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: async def _get_page(self, url: str, root_page_allowed: bool = False) -> IliasSoup:
auth_id = await self._current_auth_id() auth_id = await self._current_auth_id()
async with self.session.get(url) as request: async with self.session.get(url) as request:
soup = soupify(await request.read()) soup = IliasSoup(soupify(await request.read()), str(request.url))
if IliasPage.is_logged_in(soup): if IliasPage.is_logged_in(soup):
return self._verify_page(soup, url, root_page_allowed) return self._verify_page(soup, url, root_page_allowed)
@ -949,13 +1003,13 @@ instance's greatest bottleneck.
# Retry once after authenticating. If this fails, we will die. # Retry once after authenticating. If this fails, we will die.
async with self.session.get(url) as request: async with self.session.get(url) as request:
soup = soupify(await request.read()) soup = IliasSoup(soupify(await request.read()), str(request.url))
if IliasPage.is_logged_in(soup): if IliasPage.is_logged_in(soup):
return self._verify_page(soup, url, root_page_allowed) return self._verify_page(soup, url, root_page_allowed)
raise CrawlError(f"get_page failed even after authenticating on {url!r}") raise CrawlError(f"get_page failed even after authenticating on {url!r}")
@staticmethod @staticmethod
def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: def _verify_page(soup: IliasSoup, url: str, root_page_allowed: bool) -> IliasSoup:
if IliasPage.is_root_page(soup) and not root_page_allowed: if IliasPage.is_root_page(soup) and not root_page_allowed:
raise CrawlError( raise CrawlError(
"Unexpectedly encountered ILIAS root page. " "Unexpectedly encountered ILIAS root page. "
@ -1037,34 +1091,6 @@ instance's greatest bottleneck.
# do the actual login # do the actual login
async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request: async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request:
soup = soupify(await request.read()) soup = IliasSoup(soupify(await request.read()), str(request.url))
if not self._is_logged_in(soup): if not IliasPage.is_logged_in(soup):
self._auth.invalidate_credentials() self._auth.invalidate_credentials()
@staticmethod
def _is_logged_in(soup: BeautifulSoup) -> bool:
    """Report whether the parsed page looks like it was served to a logged-in user.

    Several page flavours are probed in order; the first one that is
    recognised decides the result. If none is recognised, ``False`` is returned.
    """

    def _links_to_login(href: Optional[str]) -> bool:
        return href is not None and "login.php" in href

    def _links_to_pd_items(href: Optional[str]) -> bool:
        return href is not None and "block_type=pditems" in href

    def _is_xoct_table_id(table_id: Optional[str]) -> bool:
        return table_id is not None and table_id.startswith("tbl_xoct")

    # Normal ILIAS pages: the metabar is present, so the absence of any
    # login button (regular or Shibboleth) is what counts.
    metabar = cast(Optional[Tag], soup.find(class_="il-maincontrols-metabar"))
    if metabar is not None:
        login_button = metabar.find(attrs={"href": _links_to_login})
        shib_login = soup.find(id="button_shib_login")
        return not (login_button or shib_login)

    # Personal Desktop
    if soup.find("a", attrs={"href": _links_to_pd_items}):
        return True

    # Video listing embeds do not have complete ILIAS html. Try to match them by
    # their video listing table
    listing_table = soup.find(
        recursive=True,
        name="table",
        attrs={"id": _is_xoct_table_id}
    )
    if listing_table is not None:
        return True

    # The individual video player wrapper page has nothing of the above.
    # Match it by its playerContainer.
    return soup.select_one("#playerContainer") is not None

View File

@ -3,20 +3,100 @@ import re
from dataclasses import dataclass from dataclasses import dataclass
from datetime import date, datetime, timedelta from datetime import date, datetime, timedelta
from enum import Enum from enum import Enum
from typing import Dict, Optional, Union, cast from typing import Callable, Dict, Optional, Union, cast
from urllib.parse import urljoin, urlparse from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from PFERD.crawl import CrawlError
from PFERD.crawl.crawler import CrawlWarning
from PFERD.logging import log from PFERD.logging import log
from PFERD.utils import url_set_query_params from PFERD.utils import url_set_query_params
TargetType = Union[str, int] TargetType = Union[str, int]
class TypeMatcher:
    """Namespace bundling the matcher value types and their factory helpers.

    Every nested class is a plain holder for the data of one criterion; this
    class itself contains no matching logic. The static methods are short
    constructors so call sites can write ``TypeMatcher.path("/crs/")``.
    """

    class UrlPath:
        """Holds a ``path`` string."""
        path: str

        def __init__(self, path: str):
            self.path = path

    class UrlParameter:
        """Holds a ``query`` string."""
        query: str

        def __init__(self, query: str):
            self.query = query

    class ImgSrc:
        """Holds a ``src`` string."""
        src: str

        def __init__(self, src: str):
            self.src = src

    class ImgAlt:
        """Holds an ``alt`` string."""
        alt: str

        def __init__(self, alt: str):
            self.alt = alt

    class All:
        """Holds a list of nested matchers (conjunction, going by the name)."""
        matchers: list['IliasElementMatcher']

        def __init__(self, matchers: list['IliasElementMatcher']):
            self.matchers = matchers

    class Any:
        """Holds a list of nested matchers (disjunction, going by the name)."""
        matchers: list['IliasElementMatcher']

        def __init__(self, matchers: list['IliasElementMatcher']):
            self.matchers = matchers

    @staticmethod
    def path(path: str) -> UrlPath:
        """Build a ``UrlPath`` from ``path``."""
        return TypeMatcher.UrlPath(path)

    @staticmethod
    def query(query: str) -> UrlParameter:
        """Build a ``UrlParameter`` from ``query``."""
        return TypeMatcher.UrlParameter(query)

    @staticmethod
    def img_src(src: str) -> ImgSrc:
        """Build an ``ImgSrc`` from ``src``."""
        return TypeMatcher.ImgSrc(src)

    @staticmethod
    def img_alt(alt: str) -> ImgAlt:
        """Build an ``ImgAlt`` from ``alt``."""
        return TypeMatcher.ImgAlt(alt)

    @staticmethod
    def all(*matchers: 'IliasElementMatcher') -> All:
        """Build an ``All`` holding the given matchers in call order."""
        return TypeMatcher.All([*matchers])

    @staticmethod
    def any(*matchers: 'IliasElementMatcher') -> Any:
        """Build an ``Any`` holding the given matchers in call order."""
        return TypeMatcher.Any([*matchers])

    @staticmethod
    def never() -> Any:
        """Build an ``Any`` with no alternatives at all."""
        no_alternatives: list['IliasElementMatcher'] = []
        return TypeMatcher.Any(no_alternatives)
# Union of every matcher variant declared on TypeMatcher. It serves as the
# annotation for values that may hold any one of them (the nested matcher
# lists of All/Any and the return type of IliasElementType.matcher()).
IliasElementMatcher = (
TypeMatcher.UrlPath
| TypeMatcher.UrlParameter
| TypeMatcher.ImgSrc
| TypeMatcher.ImgAlt
| TypeMatcher.All
| TypeMatcher.Any
)
class IliasElementType(Enum): class IliasElementType(Enum):
BLOG = "blog"
BOOKING = "booking" BOOKING = "booking"
COURSE = "course" COURSE = "course"
DCL_RECORD_LIST = "dcl_record_list"
EXERCISE = "exercise" EXERCISE = "exercise"
EXERCISE_FILES = "exercise_files" # own submitted files EXERCISE_FILES = "exercise_files" # own submitted files
FILE = "file" FILE = "file"
@ -25,7 +105,10 @@ class IliasElementType(Enum):
FORUM_THREAD = "forum_thread" FORUM_THREAD = "forum_thread"
INFO_TAB = "info_tab" INFO_TAB = "info_tab"
LEARNING_MODULE = "learning_module" LEARNING_MODULE = "learning_module"
LEARNING_MODULE_HTML = "learning_module_html"
LITERATURE_LIST = "literature_list"
LINK = "link" LINK = "link"
MEDIA_POOL = "media_pool"
MEDIACAST_VIDEO = "mediacast_video" MEDIACAST_VIDEO = "mediacast_video"
MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
MEETING = "meeting" MEETING = "meeting"
@ -38,6 +121,131 @@ class IliasElementType(Enum):
SURVEY = "survey" SURVEY = "survey"
TEST = "test" # an online test. Will be ignored currently. TEST = "test" # an online test. Will be ignored currently.
def matcher(self) -> IliasElementMatcher:
match self:
case IliasElementType.BLOG:
return TypeMatcher.any(
TypeMatcher.img_src("_blog.svg")
)
case IliasElementType.BOOKING:
return TypeMatcher.any(
TypeMatcher.path("/book/"),
TypeMatcher.img_src("_book.svg")
)
case IliasElementType.COURSE:
return TypeMatcher.any(TypeMatcher.path("/crs/"), TypeMatcher.img_src("_crsr.svg"))
case IliasElementType.DCL_RECORD_LIST:
return TypeMatcher.any(
TypeMatcher.img_src("_dcl.svg"),
TypeMatcher.query("cmdclass=ildclrecordlistgui")
)
case IliasElementType.EXERCISE:
return TypeMatcher.any(
TypeMatcher.path("/exc/"),
TypeMatcher.path("_exc_"),
TypeMatcher.img_src("_exc.svg"),
)
case IliasElementType.EXERCISE_FILES:
return TypeMatcher.never()
case IliasElementType.FILE:
return TypeMatcher.any(
TypeMatcher.query("cmd=sendfile"),
TypeMatcher.path("_file_"),
TypeMatcher.img_src("/filedelivery/"),
)
case IliasElementType.FOLDER:
return TypeMatcher.any(
TypeMatcher.path("/fold/"),
TypeMatcher.img_src("_fold.svg"),
TypeMatcher.path("/grp/"),
TypeMatcher.img_src("_grp.svg"),
TypeMatcher.path("/copa/"),
TypeMatcher.path("_copa_"),
TypeMatcher.img_src("_copa.svg"),
# Not supported right now but warn users
# TypeMatcher.query("baseclass=ilmediapoolpresentationgui"),
# TypeMatcher.img_alt("medienpool"),
# TypeMatcher.img_src("_mep.svg"),
)
case IliasElementType.FORUM:
return TypeMatcher.any(
TypeMatcher.path("/frm/"),
TypeMatcher.path("_frm_"),
TypeMatcher.img_src("_frm.svg"),
)
case IliasElementType.FORUM_THREAD:
return TypeMatcher.never()
case IliasElementType.INFO_TAB:
return TypeMatcher.never()
case IliasElementType.LITERATURE_LIST:
return TypeMatcher.img_src("_bibl.svg")
case IliasElementType.LEARNING_MODULE:
return TypeMatcher.any(
TypeMatcher.path("/lm/"),
TypeMatcher.img_src("_lm.svg")
)
case IliasElementType.LEARNING_MODULE_HTML:
return TypeMatcher.any(
TypeMatcher.query("baseclass=ilhtlmpresentationgui"),
TypeMatcher.img_src("_htlm.svg")
)
case IliasElementType.LINK:
return TypeMatcher.any(
TypeMatcher.all(
TypeMatcher.query("baseclass=illinkresourcehandlergui"),
TypeMatcher.query("calldirectlink"),
),
TypeMatcher.img_src("_webr.svg")
)
case IliasElementType.MEDIA_POOL:
return TypeMatcher.any(
TypeMatcher.query("baseclass=ilmediapoolpresentationgui"),
TypeMatcher.img_src("_mep.svg")
)
case IliasElementType.MEDIACAST_VIDEO:
return TypeMatcher.never()
case IliasElementType.MEDIACAST_VIDEO_FOLDER:
return TypeMatcher.any(
TypeMatcher.path("/mcst/"),
TypeMatcher.query("baseclass=ilmediacasthandlergui"),
TypeMatcher.img_src("_mcst.svg")
)
case IliasElementType.MEETING:
return TypeMatcher.any(
TypeMatcher.img_src("_sess.svg")
)
case IliasElementType.MOB_VIDEO:
return TypeMatcher.never()
case IliasElementType.OPENCAST_VIDEO:
return TypeMatcher.never()
case IliasElementType.OPENCAST_VIDEO_FOLDER:
return TypeMatcher.never()
case IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED:
return TypeMatcher.img_alt("opencast")
case IliasElementType.OPENCAST_VIDEO_PLAYER:
return TypeMatcher.never()
case IliasElementType.SCORM_LEARNING_MODULE:
return TypeMatcher.any(
TypeMatcher.query("baseclass=ilsahspresentationgui"),
TypeMatcher.img_src("_sahs.svg")
)
case IliasElementType.SURVEY:
return TypeMatcher.any(
TypeMatcher.path("/svy/"),
TypeMatcher.img_src("svy.svg")
)
case IliasElementType.TEST:
return TypeMatcher.any(
TypeMatcher.query("cmdclass=ilobjtestgui"),
TypeMatcher.query("cmdclass=iltestscreengui"),
TypeMatcher.img_src("_tst.svg")
)
raise CrawlWarning(f"Unknown matcher {self}")
@dataclass @dataclass
class IliasPageElement: class IliasPageElement:
@ -50,11 +258,20 @@ class IliasPageElement:
def id(self) -> str: def id(self) -> str:
regexes = [ regexes = [
r"eid=(?P<id>[0-9a-z\-]+)", r"eid=(?P<id>[0-9a-z\-]+)",
r"file_(?P<id>\d+)", r"book/(?P<id>\d+)", # booking
r"copa_(?P<id>\d+)", r"cat/(?P<id>\d+)",
r"fold_(?P<id>\d+)", r"copa/(?P<id>\d+)", # content page
r"frm_(?P<id>\d+)", r"crs/(?P<id>\d+)", # course
r"exc_(?P<id>\d+)", r"exc/(?P<id>\d+)", # exercise
r"file/(?P<id>\d+)", # file
r"fold/(?P<id>\d+)", # folder
r"frm/(?P<id>\d+)", # forum
r"grp/(?P<id>\d+)", # group
r"lm/(?P<id>\d+)", # learning module
r"mcst/(?P<id>\d+)", # mediacast
r"pg/(?P<id>(\d|_)+)", # page?
r"svy/(?P<id>\d+)", # survey
r"webr/(?P<id>\d+)", # web reference (link)
r"thr_pk=(?P<id>\d+)", # forums r"thr_pk=(?P<id>\d+)", # forums
r"ref_id=(?P<id>\d+)", r"ref_id=(?P<id>\d+)",
r"target=[a-z]+_(?P<id>\d+)", r"target=[a-z]+_(?P<id>\d+)",
@ -139,18 +356,28 @@ class IliasLearningModulePage:
previous_url: Optional[str] previous_url: Optional[str]
class IliasSoup:
    """Pairs a parsed page with the URL stored alongside it.

    Both constructor arguments are kept unchanged as ``soup`` and ``page_url``.
    """

    soup: BeautifulSoup
    page_url: str

    def __init__(self, soup: BeautifulSoup, page_url: str):
        self.soup, self.page_url = soup, page_url
class IliasPage: class IliasPage:
def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): def __init__(self, ilias_soup: IliasSoup, source_element: Optional[IliasPageElement]):
self._soup = soup self._ilias_soup = ilias_soup
self._page_url = _page_url self._soup = ilias_soup.soup
self._page_url = ilias_soup.page_url
self._page_type = source_element.type if source_element else None self._page_type = source_element.type if source_element else None
self._source_name = source_element.name if source_element else "" self._source_name = source_element.name if source_element else ""
@staticmethod @staticmethod
def is_root_page(soup: BeautifulSoup) -> bool: def is_root_page(soup: IliasSoup) -> bool:
if permalink := IliasPage.get_soup_permalink(soup): if permalink := IliasPage.get_soup_permalink(soup):
return "goto.php?target=root_" in permalink return "goto.php/root/" in permalink
return False return False
def get_child_elements(self) -> list[IliasPageElement]: def get_child_elements(self) -> list[IliasPageElement]:
@ -193,7 +420,10 @@ class IliasPage:
def get_description(self) -> Optional[BeautifulSoup]: def get_description(self) -> Optional[BeautifulSoup]:
def is_interesting_class(name: str) -> bool: def is_interesting_class(name: str) -> bool:
return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] return name in [
"ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap",
"ilc_va_ihcap_AccordIHeadCap", "ilc_media_cont_MediaContainer"
]
paragraphs: list[Tag] = cast(list[Tag], self._soup.find_all(class_=is_interesting_class)) paragraphs: list[Tag] = cast(list[Tag], self._soup.find_all(class_=is_interesting_class))
if not paragraphs: if not paragraphs:
@ -206,6 +436,21 @@ class IliasPage:
for p in paragraphs: for p in paragraphs:
if p.find_parent(class_=is_interesting_class): if p.find_parent(class_=is_interesting_class):
continue continue
if "ilc_media_cont_MediaContainer" in p["class"]:
# We have an embedded video which should be downloaded by _find_mob_videos
if video := p.select_one("video"):
url, title = self._find_mob_video_url_title(video, p)
raw_html += '<div style="min-width: 100px; min-height: 100px; border: 1px solid black;'
raw_html += 'display: flex; justify-content: center; align-items: center;'
raw_html += ' margin: 0.5rem;">'
if url is not None and urlparse(url).hostname != urlparse(self._page_url).hostname:
if url.startswith("//"):
url = "https:" + url
raw_html += f'<a href="{url}" target="_blank">External Video: {title}</a>'
else:
raw_html += f"Video elided. Filename: '{title}'."
raw_html += "</div>\n"
continue
# Ignore special listings (like folder groupings) # Ignore special listings (like folder groupings)
if "ilc_section_Special" in p["class"]: if "ilc_section_Special" in p["class"]:
@ -336,7 +581,7 @@ class IliasPage:
def _is_forum_page(self) -> bool: def _is_forum_page(self) -> bool:
if perma_link := self.get_permalink(): if perma_link := self.get_permalink():
return "target=frm_" in perma_link return "/frm/" in perma_link
return False return False
def _is_video_player(self) -> bool: def _is_video_player(self) -> bool:
@ -378,7 +623,7 @@ class IliasPage:
def _is_content_page(self) -> bool: def _is_content_page(self) -> bool:
if link := self.get_permalink(): if link := self.get_permalink():
return "target=copa_" in link return "/copa/" in link
return False return False
def _is_learning_module_page(self) -> bool: def _is_learning_module_page(self) -> bool:
@ -513,19 +758,17 @@ class IliasPage:
# Configure button/link does not have anything interesting # Configure button/link does not have anything interesting
continue continue
type = self._find_type_from_link(name, link, url) typ = IliasPage._find_type_for_element(
if not type: name, url, lambda: IliasPage._find_icon_for_folder_entry(link)
)
if not typ:
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd(f"Could not extract type for {link}") log.warn_contd(f"Could not extract type for {link}")
continue continue
log.explain(f"Found {name!r}") log.explain(f"Found {name!r} of type {typ}")
if type == IliasElementType.FILE and "_download" not in url: items.append(IliasPageElement.create_new(typ, url, name))
url = re.sub(r"(target=file_\d+)", r"\1_download", url)
log.explain("Rewired file URL to include download part")
items.append(IliasPageElement.create_new(type, url, name))
return items return items
@ -786,15 +1029,17 @@ class IliasPage:
for link in links: for link in links:
abs_url = self._abs_url_from_link(link) abs_url = self._abs_url_from_link(link)
# Make sure parents are sanitized. We do not want accidental parents # Make sure parents are sanitized. We do not want accidental parents
parents = [_sanitize_path_name(x) for x in self._find_upwards_folder_hierarchy(link)] parents = [_sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
if parents: if parents:
element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text()) element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text())
else: else:
element_name = _sanitize_path_name(link.get_text()) element_name = _sanitize_path_name(link.get_text())
element_type = self._find_type_from_link(element_name, link, abs_url) element_type = IliasPage._find_type_for_element(
description = self._find_link_description(link) element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(link)
)
description = IliasPage._find_link_description(link)
# The last meeting on every page is expanded by default. # The last meeting on every page is expanded by default.
# Its content is then shown inline *and* in the meeting page itself. # Its content is then shown inline *and* in the meeting page itself.
@ -805,10 +1050,10 @@ class IliasPage:
if not element_type: if not element_type:
continue continue
elif element_type == IliasElementType.FILE: elif element_type == IliasElementType.FILE:
result.append(self._file_to_element(element_name, abs_url, link)) result.append(IliasPage._file_to_element(element_name, abs_url, link))
continue continue
log.explain(f"Found {element_name!r}") log.explain(f"Found {element_name!r} of type {element_type}")
result.append(IliasPageElement.create_new( result.append(IliasPageElement.create_new(
element_type, element_type,
abs_url, abs_url,
@ -826,24 +1071,36 @@ class IliasPage:
def _find_mediacast_videos(self) -> list[IliasPageElement]:
    """Collect mediacast videos from the playlist JSON embedded in ``<script>`` tags.

    Every ``il.VideoPlaylist.init`` call found in a script is searched for a
    JSON array of playlist entries. Each entry is expected to provide
    ``title``, ``description`` and ``resource`` (the video URL).

    Returns:
        One ``MEDIACAST_VIDEO`` element per usable playlist entry. Playlists
        that are not valid JSON are skipped with a warning. So are entries
        without a title or URL, because no element can be built without them.
    """
    videos: list[IliasPageElement] = []

    regex = re.compile(r"il\.VideoPlaylist\.init.+?\[(.+?)], ")
    for script in cast(list[Tag], self._soup.find_all("script")):
        for match in regex.finditer(script.text):
            try:
                playlist = json.loads("[" + match.group(1) + "]")
            except json.JSONDecodeError:
                log.warn("Could not decode playlist json")
                log.warn_contd(f"Playlist json: [{match.group(1)}]")
                continue

            for elem in playlist:
                title = elem.get("title", None)
                description = elem.get("description", None)
                url = elem.get("resource", None)

                if title is None or description is None or url is None:
                    log.explain(f"Mediacast json: {match.group(1)}")
                    log.warn("Mediacast video json was not complete")
                    if title is None:
                        log.warn_contd("Missing title")
                    if description is None:
                        log.warn_contd("Missing description")
                    if url is None:
                        log.warn_contd("Missing URL")

                # The description is informational only, but title and URL are
                # required below. Without this guard a missing title crashed on
                # `None.endswith` and a missing URL produced a bogus element.
                if title is None or url is None:
                    continue

                if not title.endswith((".mp4", ".webm")):
                    # just to make sure it has some kinda-alrightish ending
                    title = title + ".mp4"

                videos.append(IliasPageElement.create_new(
                    typ=IliasElementType.MEDIACAST_VIDEO,
                    url=self._abs_url_from_relative(cast(str, url)),
                    name=_sanitize_path_name(title)
                ))

    return videos
@ -851,25 +1108,23 @@ class IliasPage:
def _find_mob_videos(self) -> list[IliasPageElement]: def _find_mob_videos(self) -> list[IliasPageElement]:
videos: list[IliasPageElement] = [] videos: list[IliasPageElement] = []
for figure in self._soup.select("figure.ilc_media_cont_MediaContainerHighlighted"): selector = "figure.ilc_media_cont_MediaContainerHighlighted,figure.ilc_media_cont_MediaContainer"
title = cast(Tag, figure.select_one("figcaption")).get_text().strip() + ".mp4" for figure in self._soup.select(selector):
video_element = figure.select_one("video") video_element = figure.select_one("video")
if not video_element: if not video_element:
_unexpected_html_warning()
log.warn_contd(f"No <video> element found for mob video '{title}'")
continue continue
url = None url, title = self._find_mob_video_url_title(video_element, figure)
for source in video_element.select("source"):
if source.get("type", "") == "video/mp4":
url = cast(Optional[str], source.get("src"))
break
if url is None: if url is None:
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd(f"No <source> element found for mob video '{title}'") log.warn_contd(f"No <source> element found for mob video '{title}'")
continue continue
if urlparse(url).hostname != urlparse(self._page_url).hostname:
log.explain(f"Found external video at {url}, ignoring")
continue
videos.append(IliasPageElement.create_new( videos.append(IliasPageElement.create_new(
typ=IliasElementType.MOB_VIDEO, typ=IliasElementType.MOB_VIDEO,
url=self._abs_url_from_relative(url), url=self._abs_url_from_relative(url),
@ -879,18 +1134,26 @@ class IliasPage:
return videos return videos
def _find_mediacast_video_mtime(self, enclosing_td: Tag) -> Optional[datetime]: def _find_mob_video_url_title(self, video_element: Tag, figure: Tag) -> tuple[Optional[str], str]:
description_td = cast(Tag, enclosing_td.find_previous_sibling("td")) url = None
if not description_td: for source in video_element.select("source"):
return None if source.get("type", "") == "video/mp4":
url = cast(Optional[str], source.get("src"))
break
meta_tag = cast(Optional[Tag], description_td.find_all("p")[-1]) if url is None and video_element.get("src"):
if not meta_tag: url = cast(Optional[str], video_element.get("src"))
return None
updated_str = meta_tag.get_text().strip().replace("\n", " ") fig_caption = cast(Optional[Tag], figure.select_one("figcaption"))
updated_str = re.sub(".+?: ", "", updated_str) if fig_caption:
return demangle_date(updated_str) title = cast(Tag, figure.select_one("figcaption")).get_text().strip() + ".mp4"
elif url is not None:
path = urlparse(self._abs_url_from_relative(url)).path
title = path.rsplit("/", 1)[-1]
else:
title = f"unknown video {figure}"
return url, title
def _is_in_expanded_meeting(self, tag: Tag) -> bool: def _is_in_expanded_meeting(self, tag: Tag) -> bool:
""" """
@ -907,12 +1170,17 @@ class IliasPage:
# We should not crawl files under meetings # We should not crawl files under meetings
if "ilContainerListItemContentCB" in cast(str, parent.get("class")): if "ilContainerListItemContentCB" in cast(str, parent.get("class")):
link: Tag = parent.parent.find("a") # type: ignore link: Tag = parent.parent.find("a") # type: ignore
type = IliasPage._find_type_from_folder_like(link, self._page_url) typ = IliasPage._find_type_for_element(
return type == IliasElementType.MEETING "meeting",
self._abs_url_from_link(link),
lambda: IliasPage._find_icon_for_folder_entry(link)
)
return typ == IliasElementType.MEETING
return False return False
def _find_upwards_folder_hierarchy(self, tag: Tag) -> list[str]: @staticmethod
def _find_upwards_folder_hierarchy(tag: Tag) -> list[str]:
""" """
Interprets accordions and expandable blocks as virtual folders and returns them Interprets accordions and expandable blocks as virtual folders and returns them
in order. This allows us to find a file named "Test" in an accordion "Acc" as "Acc/Test" in order. This allows us to find a file named "Test" in an accordion "Acc" as "Acc/Test"
@ -953,13 +1221,16 @@ class IliasPage:
if outer_accordion_content: if outer_accordion_content:
accordion_tag = cast(Tag, outer_accordion_content.parent) accordion_tag = cast(Tag, outer_accordion_content.parent)
head_tag = cast(Tag, accordion_tag.find(attrs={ head_tag = cast(Tag, accordion_tag.find(attrs={
"class": lambda x: x is not None and "ilc_va_ihead_VAccordIHead" in x "class": lambda x: x is not None and (
"ilc_va_ihead_VAccordIHead" in x or "ilc_va_ihead_AccordIHead" in x
)
})) }))
found_titles.append(head_tag.get_text().strip()) found_titles.append(head_tag.get_text().strip())
return [_sanitize_path_name(x) for x in reversed(found_titles)] return [_sanitize_path_name(x) for x in reversed(found_titles)]
def _find_link_description(self, link: Tag) -> Optional[str]: @staticmethod
def _find_link_description(link: Tag) -> Optional[str]:
tile = cast( tile = cast(
Tag, Tag,
link.find_parent("div", {"class": lambda x: x is not None and "il_ContainerListItem" in x}) link.find_parent("div", {"class": lambda x: x is not None and "il_ContainerListItem" in x})
@ -974,7 +1245,8 @@ class IliasPage:
return None return None
return description_element.get_text().strip() return description_element.get_text().strip()
def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement: @staticmethod
def _file_to_element(name: str, url: str, link_element: Tag) -> IliasPageElement:
# Files have a list of properties (type, modification date, size, etc.) # Files have a list of properties (type, modification date, size, etc.)
# In a series of divs. # In a series of divs.
# Find the parent containing all those divs, so we can filter our what we need # Find the parent containing all those divs, so we can filter our what we need
@ -1007,27 +1279,38 @@ class IliasPage:
for title in card_titles: for title in card_titles:
url = self._abs_url_from_link(title) url = self._abs_url_from_link(title)
name = _sanitize_path_name(title.get_text().strip()) name = _sanitize_path_name(title.get_text().strip())
type = self._find_type_from_card(title) typ = IliasPage._find_type_for_element(
name, url, lambda: IliasPage._find_icon_from_card(title)
)
if not type: if not typ:
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd(f"Could not extract type for {title}") log.warn_contd(f"Could not extract type for {title}")
continue continue
result.append(IliasPageElement.create_new(type, url, name)) result.append(IliasPageElement.create_new(typ, url, name))
card_button_tiles: list[Tag] = self._soup.select(".card-title button") card_button_tiles: list[Tag] = self._soup.select(".card-title button")
for button in card_button_tiles: for button in card_button_tiles:
regex = re.compile(button["id"] + r".*window.open\(['\"](.+?)['\"]") # type: ignore signal_regex = re.compile("#" + str(button["id"]) + r"[\s\S]*?\.trigger\('(.+?)'")
res = regex.search(str(self._soup)) signal_match = signal_regex.search(str(self._soup))
if not res: if not signal_match:
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd(f"Could not find click handler target for {button}") log.warn_contd(f"Could not find click handler signal for {button}")
continue continue
url = self._abs_url_from_relative(res.group(1)) signal = signal_match.group(1)
open_regex = re.compile(r"\.on\('" + signal + r"[\s\S]*?window.open\(['\"](.+?)['\"]")
open_match = open_regex.search(str(self._soup))
if not open_match:
_unexpected_html_warning()
log.warn_contd(f"Could not find click handler target for signal {signal} for {button}")
continue
url = self._abs_url_from_relative(open_match.group(1))
name = _sanitize_path_name(button.get_text().strip()) name = _sanitize_path_name(button.get_text().strip())
type = self._find_type_from_card(button) typ = IliasPage._find_type_for_element(
name, url, lambda: IliasPage._find_icon_from_card(button)
)
caption_parent = cast(Tag, button.find_parent( caption_parent = cast(Tag, button.find_parent(
"div", "div",
attrs={"class": lambda x: x is not None and "caption" in x}, attrs={"class": lambda x: x is not None and "caption" in x},
@ -1038,143 +1321,59 @@ class IliasPage:
else: else:
description = None description = None
if not type: if not typ:
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd(f"Could not extract type for {button}") log.warn_contd(f"Could not extract type for {button}")
continue continue
result.append(IliasPageElement.create_new(type, url, name, description=description)) result.append(IliasPageElement.create_new(typ, url, name, description=description))
return result return result
def _find_type_from_card(self, card_title: Tag) -> Optional[IliasElementType]:
def is_card_root(element: Tag) -> bool:
return "il-card" in element["class"] and "thumbnail" in element["class"]
card_root: Optional[Tag] = None
# We look for the card root
for parent in card_title.parents:
if is_card_root(parent):
card_root = parent
break
if card_root is None:
_unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}")
return None
icon = cast(Tag, card_root.select_one(".il-card-repository-head .icon"))
if "opencast" in icon["class"] or "xoct" in icon["class"]:
return IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED
if "exc" in icon["class"]:
return IliasElementType.EXERCISE
if "grp" in icon["class"]:
return IliasElementType.FOLDER
if "webr" in icon["class"]:
return IliasElementType.LINK
if "book" in icon["class"]:
return IliasElementType.BOOKING
if "crsr" in icon["class"]:
return IliasElementType.COURSE
if "frm" in icon["class"]:
return IliasElementType.FORUM
if "sess" in icon["class"]:
return IliasElementType.MEETING
if "tst" in icon["class"]:
return IliasElementType.TEST
if "fold" in icon["class"]:
return IliasElementType.FOLDER
if "copa" in icon["class"]:
return IliasElementType.FOLDER
if "svy" in icon["class"]:
return IliasElementType.SURVEY
if "file" in icon["class"]:
return IliasElementType.FILE
if "mcst" in icon["class"]:
return IliasElementType.MEDIACAST_VIDEO_FOLDER
_unexpected_html_warning()
log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")
return None
@staticmethod @staticmethod
def _find_type_from_link( def _find_type_for_element(
element_name: str, element_name: str,
link_element: Tag, url: str,
url: str icon_for_element: Callable[[], Optional[Tag]],
) -> Optional[IliasElementType]: ) -> Optional[IliasElementType]:
""" """
Decides which sub crawler to use for a given top level element. Decides which sub crawler to use for a given top level element.
""" """
parsed_url = urlparse(url) parsed_url = urlparse(url)
icon = icon_for_element()
# file URLs contain "target=file" def try_matcher(matcher: IliasElementMatcher) -> bool:
if "target=file_" in parsed_url.query: match matcher:
return IliasElementType.FILE case TypeMatcher.All(matchers=ms):
return all(try_matcher(m) for m in ms)
case TypeMatcher.Any(matchers=ms):
return any(try_matcher(m) for m in ms)
case TypeMatcher.ImgAlt(alt=alt):
return icon is not None and alt in str(icon["alt"]).lower()
case TypeMatcher.ImgSrc(src=src):
return icon is not None and src in str(icon["src"]).lower()
case TypeMatcher.UrlPath(path=path):
return path in parsed_url.path.lower()
case TypeMatcher.UrlParameter(query=query):
return query in parsed_url.query.lower()
if "target=grp_" in parsed_url.query: raise CrawlError(f"Unknown matcher {matcher}")
return IliasElementType.FOLDER
if "target=crs_" in parsed_url.query: for typ in IliasElementType:
return IliasElementType.FOLDER if try_matcher(typ.matcher()):
return typ
if "baseClass=ilExerciseHandlerGUI" in parsed_url.query:
return IliasElementType.EXERCISE
if "baseClass=ilLinkResourceHandlerGUI" in parsed_url.query and "calldirectlink" in parsed_url.query:
return IliasElementType.LINK
if "cmd=showThreads" in parsed_url.query or "target=frm_" in parsed_url.query:
return IliasElementType.FORUM
if "cmdClass=ilobjtestgui" in parsed_url.query:
return IliasElementType.TEST
if "baseClass=ilLMPresentationGUI" in parsed_url.query:
return IliasElementType.LEARNING_MODULE
if "baseClass=ilMediaCastHandlerGUI" in parsed_url.query:
return IliasElementType.MEDIACAST_VIDEO_FOLDER
if "baseClass=ilSAHSPresentationGUI" in parsed_url.query:
return IliasElementType.SCORM_LEARNING_MODULE
# other universities might have content type specified in URL path
if "_file_" in parsed_url.path:
return IliasElementType.FILE
if "_fold_" in parsed_url.path or "_copa_" in parsed_url.path:
return IliasElementType.FOLDER
if "_frm_" in parsed_url.path:
return IliasElementType.FORUM
if "_exc_" in parsed_url.path:
return IliasElementType.EXERCISE
# Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
# try to guess it from the image.
# Everything with a ref_id can *probably* be opened to reveal nested things
# video groups, directories, exercises, etc
if "ref_id=" in parsed_url.query or "goto.php" in parsed_url.path:
return IliasPage._find_type_from_folder_like(link_element, url)
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd( log.warn_contd(f"Tried to figure out element type, but failed for {element_name!r} / {url!r})")
f"Tried to figure out element type, but failed for {element_name!r} / {link_element!r})"
) if "ref_id=" in parsed_url.query.lower() or "goto.php" in parsed_url.path.lower():
log.warn_contd("Defaulting to FOLDER as it contains a ref_id/goto")
return IliasElementType.FOLDER
return None return None
@staticmethod @staticmethod
def _find_type_from_folder_like(link_element: Tag, url: str) -> Optional[IliasElementType]: def _find_icon_for_folder_entry(link_element: Tag) -> Optional[Tag]:
"""
Try crawling something that looks like a folder.
"""
# pylint: disable=too-many-return-statements
found_parent: Optional[Tag] = None found_parent: Optional[Tag] = None
# We look for the outer div of our inner link, to find information around it # We look for the outer div of our inner link, to find information around it
@ -1186,7 +1385,9 @@ class IliasPage:
if found_parent is None: if found_parent is None:
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url}") log.warn_contd(
f"Tried to figure out element type, but did not find an icon for {link_element!r}"
)
return None return None
# Find the small descriptive icon to figure out the type # Find the small descriptive icon to figure out the type
@ -1203,42 +1404,35 @@ class IliasPage:
log.explain("Found session expansion button, skipping it as it has no content") log.explain("Found session expansion button, skipping it as it has no content")
return None return None
if img_tag is None: if img_tag is not None:
_unexpected_html_warning() return img_tag
log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
log.explain(f"Tried to figure out element type, but did not find an image for {link_element!r}")
return None return None
if "opencast" in str(img_tag["alt"]).lower(): @staticmethod
return IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED def _find_icon_from_card(card_title: Tag) -> Optional[Tag]:
def is_card_root(element: Tag) -> bool:
return "il-card" in element["class"] and "thumbnail" in element["class"]
if str(img_tag["src"]).endswith("icon_exc.svg"): card_root: Optional[Tag] = None
return IliasElementType.EXERCISE
if str(img_tag["src"]).endswith("icon_webr.svg"): # We look for the card root
return IliasElementType.LINK for parent in card_title.parents:
if is_card_root(parent):
card_root = parent
break
if str(img_tag["src"]).endswith("icon_book.svg"): if card_root is None:
return IliasElementType.BOOKING _unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}")
return None
if str(img_tag["src"]).endswith("frm.svg"): return cast(Tag, card_root.select_one(".il-card-repository-head .icon"))
return IliasElementType.FORUM
if str(img_tag["src"]).endswith("sess.svg"):
return IliasElementType.MEETING
if str(img_tag["src"]).endswith("icon_tst.svg"):
return IliasElementType.TEST
if str(img_tag["src"]).endswith("icon_mcst.svg"):
return IliasElementType.MEDIACAST_VIDEO_FOLDER
if str(img_tag["src"]).endswith("icon_sahs.svg"):
return IliasElementType.SCORM_LEARNING_MODULE
return IliasElementType.FOLDER
@staticmethod @staticmethod
def is_logged_in(soup: BeautifulSoup) -> bool: def is_logged_in(ilias_soup: IliasSoup) -> bool:
soup = ilias_soup.soup
# Normal ILIAS pages # Normal ILIAS pages
mainbar = cast(Optional[Tag], soup.find(class_="il-maincontrols-metabar")) mainbar = cast(Optional[Tag], soup.find(class_="il-maincontrols-metabar"))
if mainbar is not None: if mainbar is not None:
@ -1285,7 +1479,7 @@ class IliasPage:
return None return None
def get_permalink(self) -> Optional[str]: def get_permalink(self) -> Optional[str]:
return IliasPage.get_soup_permalink(self._soup) return IliasPage.get_soup_permalink(self._ilias_soup)
def _abs_url_from_link(self, link_tag: Tag) -> str: def _abs_url_from_link(self, link_tag: Tag) -> str:
""" """
@ -1300,11 +1494,15 @@ class IliasPage:
return urljoin(self._page_url, relative_url) return urljoin(self._page_url, relative_url)
@staticmethod @staticmethod
def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]: def get_soup_permalink(ilias_soup: IliasSoup) -> Optional[str]:
perma_link_element = cast(Tag, soup.select_one(".il-footer-permanent-url > a")) scripts = cast(list[Tag], ilias_soup.soup.find_all("script"))
if not perma_link_element or not perma_link_element.get("href"): pattern = re.compile(r"il\.Footer\.permalink\.copyText\(\"(.+?)\"\)")
for script in scripts:
if match := pattern.search(script.text):
url = match.group(1)
url = url.replace(r"\/", "/")
return url
return None return None
return cast(Optional[str], perma_link_element.get("href"))
def _unexpected_html_warning() -> None: def _unexpected_html_warning() -> None:

View File

@ -1,9 +1,8 @@
import asyncio import asyncio
import sys import sys
import traceback import traceback
from contextlib import asynccontextmanager, contextmanager from contextlib import AbstractContextManager, asynccontextmanager, contextmanager
# TODO In Python 3.9 and above, ContextManager is deprecated from typing import AsyncIterator, Iterator, List, Optional
from typing import AsyncIterator, ContextManager, Iterator, List, Optional
from rich.console import Console, Group from rich.console import Console, Group
from rich.live import Live from rich.live import Live
@ -261,7 +260,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
action: str, action: str,
text: str, text: str,
total: Optional[float] = None, total: Optional[float] = None,
) -> ContextManager[ProgressBar]: ) -> AbstractContextManager[ProgressBar]:
""" """
Allows markup in the "style" argument which will be applied to the Allows markup in the "style" argument which will be applied to the
"action" string. "action" string.
@ -277,7 +276,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
action: str, action: str,
text: str, text: str,
total: Optional[float] = None, total: Optional[float] = None,
) -> ContextManager[ProgressBar]: ) -> AbstractContextManager[ProgressBar]:
""" """
Allows markup in the "style" argument which will be applied to the Allows markup in the "style" argument which will be applied to the
"action" string. "action" string.

View File

@ -34,15 +34,6 @@ class MarkConflictError(Exception):
self.collides_with = collides_with self.collides_with = collides_with
# TODO Use PurePath.is_relative_to when updating to 3.9
def is_relative_to(a: PurePath, b: PurePath) -> bool:
try:
a.relative_to(b)
return True
except ValueError:
return False
class Report: class Report:
""" """
A report of a synchronization. Includes all files found by the crawler, as A report of a synchronization. Includes all files found by the crawler, as
@ -173,7 +164,7 @@ class Report:
if path == other: if path == other:
raise MarkDuplicateError(path) raise MarkDuplicateError(path)
if is_relative_to(path, other) or is_relative_to(other, path): if path.is_relative_to(other) or other.is_relative_to(path):
raise MarkConflictError(path, other) raise MarkConflictError(path, other)
self.known_files.add(path) self.known_files.add(path)

View File

@ -1,2 +1,2 @@
NAME = "PFERD" NAME = "PFERD"
VERSION = "3.7.0" VERSION = "3.8.0"

View File

@ -17,7 +17,7 @@ Binaries for Linux, Windows and Mac can be downloaded directly from the
### With pip ### With pip
Ensure you have at least Python 3.9 installed. Run the following command to Ensure you have at least Python 3.11 installed. Run the following command to
install PFERD or upgrade it to the latest version: install PFERD or upgrade it to the latest version:
``` ```

8
flake.lock generated
View File

@ -2,16 +2,16 @@
"nodes": { "nodes": {
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1708979614, "lastModified": 1744440957,
"narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=", "narHash": "sha256-FHlSkNqFmPxPJvy+6fNLaNeWnF1lZSgqVCl/eWaJRc4=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a", "rev": "26d499fc9f1d567283d5d56fcf367edd815dba1d",
"type": "github" "type": "github"
}, },
"original": { "original": {
"owner": "NixOS", "owner": "NixOS",
"ref": "nixos-23.11", "ref": "nixos-24.11",
"repo": "nixpkgs", "repo": "nixpkgs",
"type": "github" "type": "github"
} }

View File

@ -2,7 +2,7 @@
description = "Tool for downloading course-related files from ILIAS"; description = "Tool for downloading course-related files from ILIAS";
inputs = { inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11"; nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11";
}; };
outputs = { self, nixpkgs }: outputs = { self, nixpkgs }:

View File

@ -12,7 +12,7 @@ dependencies = [
"certifi>=2021.10.8" "certifi>=2021.10.8"
] ]
dynamic = ["version"] dynamic = ["version"]
requires-python = ">=3.9" requires-python = ">=3.11"
[project.scripts] [project.scripts]
pferd = "PFERD.__main__:main" pferd = "PFERD.__main__:main"