Compare commits

...

10 Commits

SHA1        Message                                                       Date
dd2fedf1a2  Fix crawling of exercises with instructions                   2025-04-29 18:30:33 +02:00
            We do not want a second path and the instruction field has an
            identical link...
77a23265a9  Bump version to 3.8.2                                         2025-04-29 17:55:57 +02:00
4c230ef6dd  Fix exercise crawling                                         2025-04-25 13:45:57 +02:00
b305e1ce23  Fix login using the native ilias login form                   2025-04-23 16:08:45 +02:00
bdf17f5c87  Ignore wikis                                                  2025-04-23 16:03:37 +02:00
77fce7daf8  Bump version to 3.8.1                                         2025-04-17 11:22:35 +02:00
653bf139f0  Fix encoding of descriptions and force images to light mode   2025-04-16 10:52:18 +02:00
3f60638d33  Bump version to 3.8.0                                         2025-04-16 00:47:05 +02:00
b97b6fae6b  Update minimum Python version to 3.11                         2025-04-15 21:35:20 +02:00
477234ad0d  Support ILIAS 9                                               2025-04-15 21:35:20 +02:00
13 changed files with 725 additions and 533 deletions

View File

@@ -14,7 +14,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macos-13, macos-latest]
-        python: ["3.9"]
+        python: ["3.11"]
     steps:
       - uses: actions/checkout@v4

View File

@@ -22,8 +22,34 @@ ambiguous situations.
 ## Unreleased

+### Fixed
+- Crawling of exercises with instructions
+
+## 3.8.2 - 2025-04-29
+
+### Changed
+- Explicitly mention that wikis are not supported at the moment and ignore them
+
+### Fixed
+- Ilias-native login
+- Exercise crawling
+
+## 3.8.1 - 2025-04-17
+
+### Fixed
+- Description html files now specify an UTF-8 encoding
+- Images in descriptions now always have a white background
+
+## 3.8.0 - 2025-04-16
+
+### Added
+- Support for ILIAS 9
+
 ### Changed
 - Added prettier CSS to forum threads
+- Downloaded forum threads now link to the forum instead of the ILIAS thread
+- Increase minimum supported Python version to 3.11
+- Do not crawl nested courses (courses linked in other courses)

 ### Fixed
 - File links in report on Windows

View File

@@ -149,9 +149,7 @@ class CrawlerSection(Section):
         return self.s.getboolean("skip", fallback=False)

     def output_dir(self, name: str) -> Path:
-        # TODO Use removeprefix() after switching to 3.9
-        if name.startswith("crawl:"):
-            name = name[len("crawl:"):]
+        name = name.removeprefix("crawl:")
         return Path(self.s.get("output_dir", name)).expanduser()

     def redownload(self) -> Redownload:
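Note on the change above: str.removeprefix (available since Python 3.9, safe now that the project requires 3.11) returns the string unchanged when the prefix is absent, so it is an exact drop-in for the removed startswith branch. A minimal sketch of the equivalence:

def old_style(name: str) -> str:
    # The removed branch: strip "crawl:" only if present.
    if name.startswith("crawl:"):
        name = name[len("crawl:"):]
    return name

assert "crawl:ilias".removeprefix("crawl:") == old_style("crawl:ilias") == "ilias"
assert "plain".removeprefix("crawl:") == old_style("plain") == "plain"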

View File

@@ -39,6 +39,10 @@ _STYLE_TAG_CONTENT = """
         margin: 0.5rem 0;
     }

+    img {
+        background-color: white;
+    }
+
     body {
         padding: 1em;
         grid-template-columns: 1fr min(60rem, 90%) 1fr;
@@ -56,12 +60,11 @@ _ARTICLE_WORTHY_CLASSES = [

 def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
     head = soup.new_tag("head")
     soup.insert(0, head)
+    # Force UTF-8 encoding
+    head.append(soup.new_tag("meta", charset="utf-8"))

-    simplecss_link: Tag = soup.new_tag("link")
     # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
-    simplecss_link["rel"] = "stylesheet"
-    simplecss_link["href"] = "https://cdn.simplecss.org/simple.css"
-    head.append(simplecss_link)
+    head.append(soup.new_tag("link", rel="stylesheet", href="https://cdn.simplecss.org/simple.css"))

     # Basic style tags for compat
     style: Tag = soup.new_tag("style")
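The rewrite above leans on BeautifulSoup's new_tag accepting tag attributes as keyword arguments, which collapses the five-line link construction into a single call. A standalone sketch of the pattern (not the crawler's code):

from bs4 import BeautifulSoup

soup = BeautifulSoup("", "html.parser")
head = soup.new_tag("head")
# Keyword arguments become tag attributes.
head.append(soup.new_tag("meta", charset="utf-8"))
head.append(soup.new_tag("link", rel="stylesheet", href="https://cdn.simplecss.org/simple.css"))
print(head)
# <head><meta charset="utf-8"/><link href="https://cdn.simplecss.org/simple.css" rel="stylesheet"/></head>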

View File

@@ -22,7 +22,7 @@ from .async_helper import _iorepeat
 from .file_templates import Links, forum_thread_template, learning_module_template
 from .ilias_html_cleaner import clean, insert_base_markup
 from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
-                             IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)
+                             IliasPageElement, IliasSoup, _sanitize_path_name, parse_ilias_forum_export)
 from .shibboleth_login import ShibbolethLogin

 TargetType = Union[str, int]
@@ -105,9 +105,9 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
 _DIRECTORY_PAGES: Set[IliasElementType] = {
-    IliasElementType.COURSE,
     IliasElementType.EXERCISE,
     IliasElementType.EXERCISE_FILES,
+    IliasElementType.EXERCISE_OVERVIEW,
     IliasElementType.FOLDER,
     IliasElementType.INFO_TAB,
     IliasElementType.MEDIACAST_VIDEO_FOLDER,
@@ -217,11 +217,19 @@ instance's greatest bottleneck.
     async def _crawl_desktop(self) -> None:
         await self._crawl_url(
-            urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show")
+            urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show"),
+            crawl_nested_courses=True
         )

-    async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
-        if awaitable := await self._handle_ilias_page(url, None, PurePath("."), expected_id):
+    async def _crawl_url(
+        self,
+        url: str,
+        expected_id: Optional[int] = None,
+        crawl_nested_courses: bool = False
+    ) -> None:
+        if awaitable := await self._handle_ilias_page(
+            url, None, PurePath("."), expected_id, crawl_nested_courses
+        ):
             await awaitable

     async def _handle_ilias_page(
@@ -230,6 +238,7 @@ instance's greatest bottleneck.
         current_element: Optional[IliasPageElement],
         path: PurePath,
         expected_course_id: Optional[int] = None,
+        crawl_nested_courses: bool = False
     ) -> Optional[Coroutine[Any, Any, None]]:
         maybe_cl = await self.crawl(path)
         if not maybe_cl:
@@ -237,7 +246,9 @@ instance's greatest bottleneck.
         if current_element:
             self._ensure_not_seen(current_element, path)

-        return self._crawl_ilias_page(url, current_element, maybe_cl, expected_course_id)
+        return self._crawl_ilias_page(
+            url, current_element, maybe_cl, expected_course_id, crawl_nested_courses
+        )

     @anoncritical
     async def _crawl_ilias_page(
@@ -246,6 +257,7 @@ instance's greatest bottleneck.
         current_element: Optional[IliasPageElement],
         cl: CrawlToken,
         expected_course_id: Optional[int] = None,
+        crawl_nested_courses: bool = False,
     ) -> None:
         elements: List[IliasPageElement] = []
         # A list as variable redefinitions are not propagated to outer scopes
@@ -267,12 +279,12 @@ instance's greatest bottleneck.
                 # If we expect to find a root course, enforce it
                 if current_parent is None and expected_course_id is not None:
                     perma_link = IliasPage.get_soup_permalink(soup)
-                    if not perma_link or "crs_" not in perma_link:
+                    if not perma_link or "crs/" not in perma_link:
                         raise CrawlError("Invalid course id? Didn't find anything looking like a course")
                     if str(expected_course_id) not in perma_link:
                         raise CrawlError(f"Expected course id {expected_course_id} but got {perma_link}")

-                page = IliasPage(soup, next_stage_url, current_parent)
+                page = IliasPage(soup, current_parent)
                 if next_element := page.get_next_stage_element():
                     current_parent = next_element
                     next_stage_url = next_element.url
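The permalink marker changes from "crs_" to "crs/" to track ILIAS 9's path-style permalinks. A standalone sketch of the guard, assuming permalinks of the form https://ilias.example.edu/go/crs/12345 (the exact URL layout is an assumption; only the "crs/" substring test is taken from the diff):

def looks_like_expected_course(perma_link: str | None, expected_course_id: int) -> bool:
    # Require a course permalink that mentions the expected id.
    if not perma_link or "crs/" not in perma_link:
        return False
    return str(expected_course_id) in perma_link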
@@ -294,7 +306,7 @@ instance's greatest bottleneck.
         tasks: List[Awaitable[None]] = []
         for element in elements:
-            if handle := await self._handle_ilias_element(cl.path, element):
+            if handle := await self._handle_ilias_element(cl.path, element, crawl_nested_courses):
                 tasks.append(asyncio.create_task(handle))

         # And execute them
@@ -310,6 +322,7 @@ instance's greatest bottleneck.
         self,
         parent_path: PurePath,
         element: IliasPageElement,
+        crawl_nested_courses: bool = False
     ) -> Optional[Coroutine[Any, Any, None]]:
         # element.name might contain `/` if the crawler created nested elements,
         # so we can not sanitize it here. We trust in the output dir to thwart worst-case
@@ -362,6 +375,64 @@ instance's greatest bottleneck.
                 "[bright_black](scorm learning modules are not supported)"
             )
             return None
+        elif element.type == IliasElementType.LITERATURE_LIST:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](literature lists are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.LEARNING_MODULE_HTML:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](HTML learning modules are not supported)"
+            )
+            return None
+        elif element.type == IliasElementType.BLOG:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](blogs are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.DCL_RECORD_LIST:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](dcl record lists are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.MEDIA_POOL:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](media pools are not currently supported)"
+            )
+            return None
+        elif element.type == IliasElementType.COURSE:
+            if crawl_nested_courses:
+                return await self._handle_ilias_page(element.url, element, element_path)
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](not descending into linked course)"
+            )
+            return None
+        elif element.type == IliasElementType.WIKI:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](wikis are not currently supported)"
+            )
+            return None
         elif element.type == IliasElementType.LEARNING_MODULE:
             return await self._handle_learning_module(element, element_path)
         elif element.type == IliasElementType.LINK:
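Every new branch above follows the same shape: log an "Ignored" status with a type-specific reason, then return None. If the list of unsupported types keeps growing, a table-driven variant would avoid the repetition; a hypothetical refactor sketch, not what this commit does:

# Map unsupported element types to the reason shown in the log.
_UNSUPPORTED_TYPES: dict[IliasElementType, str] = {
    IliasElementType.LITERATURE_LIST: "literature lists are not currently supported",
    IliasElementType.LEARNING_MODULE_HTML: "HTML learning modules are not supported",
    IliasElementType.BLOG: "blogs are not currently supported",
    IliasElementType.DCL_RECORD_LIST: "dcl record lists are not currently supported",
    IliasElementType.MEDIA_POOL: "media pools are not currently supported",
    IliasElementType.WIKI: "wikis are not currently supported",
}

if reason := _UNSUPPORTED_TYPES.get(element.type):
    log.status("[bold bright_black]", "Ignored", fmt_path(element_path), f"[bright_black]({reason})")
    return None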
@@ -590,7 +661,7 @@ instance's greatest bottleneck.
         )

         async with dl as (bar, sink):
-            page = IliasPage(await self._get_page(element.url), element.url, element)
+            page = IliasPage(await self._get_page(element.url), element)
             stream_elements = page.get_child_elements()

             if len(stream_elements) > 1:
@@ -600,7 +671,7 @@ instance's greatest bottleneck.
             stream_element = stream_elements[0]

             # We do not have a local cache yet
-            await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
+            await self._stream_from_url(stream_element, sink, bar, is_video=True)
             add_to_report([str(self._transformer.transform(dl.path))])
             return
@@ -615,7 +686,7 @@ instance's greatest bottleneck.
             async with maybe_dl as (bar, sink):
                 log.explain(f"Streaming video from real url {stream_element.url}")
                 contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
-                await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
+                await self._stream_from_url(stream_element, sink, bar, is_video=True)

         add_to_report(contained_video_paths)
@@ -637,12 +708,19 @@ instance's greatest bottleneck.
     async def _download_file(self, element: IliasPageElement, dl: DownloadToken, is_video: bool) -> None:
         assert dl  # The function is only reached when dl is not None
         async with dl as (bar, sink):
-            await self._stream_from_url(element.url, sink, bar, is_video)
+            await self._stream_from_url(element, sink, bar, is_video)

-    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
+    async def _stream_from_url(
+        self,
+        element: IliasPageElement,
+        sink: FileSink,
+        bar: ProgressBar,
+        is_video: bool
+    ) -> None:
+        url = element.url
+
         async def try_stream() -> bool:
             next_url = url

             # Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
             # we can not match on the content type here. Instead, we disallow redirects and inspect the
             # new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
@@ -690,7 +768,7 @@ instance's greatest bottleneck.
             await self.authenticate(auth_id)

             if not await try_stream():
-                raise CrawlError("File streaming failed after authenticate()")
+                raise CrawlError(f"File streaming failed after authenticate() {element!r}")

     async def _handle_forum(
         self,
@@ -705,70 +783,23 @@ instance's greatest bottleneck.
     @_iorepeat(3, "crawling forum")
     @anoncritical
     async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None:
-        elements: List[IliasForumThread] = []
-
         async with cl:
-            next_stage_url = element.url
-            page = None
-
-            while next_stage_url:
-                log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
-                log.explain(f"URL: {next_stage_url}")
-
-                soup = await self._get_page(next_stage_url)
-                page = IliasPage(soup, next_stage_url, element)
-
-                if next := page.get_next_stage_element():
-                    next_stage_url = next.url
-                else:
-                    break
-
-            forum_threads: list[tuple[IliasPageElement, bool]] = []
-            for entry in cast(IliasPage, page).get_forum_entries():
-                path = cl.path / (_sanitize_path_name(entry.name) + ".html")
-                forum_threads.append((entry, self.should_try_download(path, mtime=entry.mtime)))
-
-            # Sort the ids. The forum download will *preserve* this ordering
-            forum_threads.sort(key=lambda elem: elem[0].id())
-
-            if not forum_threads:
-                log.explain("Forum had no threads")
+            inner = IliasPage(await self._get_page(element.url), element)
+            export_url = inner.get_forum_export_url()
+            if not export_url:
+                log.warn("Could not extract forum export url")
                 return

-            download_data = cast(IliasPage, page).get_download_forum_data(
-                [thread.id() for thread, download in forum_threads if download]
-            )
-            if not download_data:
-                raise CrawlWarning("Failed to extract forum data")
-
-            if not download_data.empty:
-                html = await self._post_authenticated(download_data.url, download_data.form_data)
-                elements = parse_ilias_forum_export(soupify(html))
-            else:
-                elements = []
-
-            # Verify that ILIAS does not change the order, as we depend on it later. Otherwise, we could not call
-            # download in the correct order, potentially messing up duplication handling.
-            expected_element_titles = [thread.name for thread, download in forum_threads if download]
-            actual_element_titles = [_sanitize_path_name(thread.name) for thread in elements]
-            if expected_element_titles != actual_element_titles:
-                raise CrawlWarning(
-                    f"Forum thread order mismatch: {expected_element_titles} != {actual_element_titles}"
-                )
+            export = await self._post(export_url, {
+                "format": "html",
+                "cmd[createExportFile]": ""
+            })
+
+            elements = parse_ilias_forum_export(soupify(export))

             tasks: List[Awaitable[None]] = []
-            for thread, download in forum_threads:
-                if download:
-                    # This only works because ILIAS keeps the order in the export
-                    elem = elements.pop(0)
-                    tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem, thread)))
-                else:
-                    # We only downloaded the threads we "should_try_download"ed. This can be an
-                    # over-approximation and all will be fine.
-                    # If we selected too few, e.g. because there was a duplicate title and the mtime of the
-                    # original is newer than the update of the duplicate.
-                    # This causes stale data locally, but I consider this problem acceptable right now.
-                    tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, thread)))
+            for thread in elements:
+                tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, element.url)))

             # And execute them
             await self.gather(tasks)
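The rewritten _crawl_forum asks ILIAS for one HTML export of the entire forum instead of downloading and order-matching individual threads, which is why the sorting and title-verification code could be deleted. A condensed standalone sketch of the export request (session and export_url stand in for the crawler's state; the form fields come from the diff):

import aiohttp

async def fetch_forum_export(session: aiohttp.ClientSession, export_url: str) -> bytes:
    # Ask ILIAS to render the whole forum as a single HTML export file.
    form = aiohttp.FormData()
    form.add_field("format", "html")
    form.add_field("cmd[createExportFile]", "")
    async with session.post(export_url, data=form) as response:
        if response.status != 200:
            raise RuntimeError(f"forum export failed with status {response.status}")
        return await response.read()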
@@ -779,7 +810,7 @@ instance's greatest bottleneck.
         self,
         parent_path: PurePath,
         thread: Union[IliasForumThread, IliasPageElement],
-        element: IliasPageElement
+        forum_url: str
     ) -> None:
         path = parent_path / (_sanitize_path_name(thread.name) + ".html")
         maybe_dl = await self.download(path, mtime=thread.mtime)
@@ -789,7 +820,7 @@ instance's greatest bottleneck.
         async with maybe_dl as (bar, sink):
             rendered = forum_thread_template(
                 thread.name,
-                element.url,
+                forum_url,
                 thread.name_tag,
                 await self.internalize_images(thread.content_tag)
             )
@@ -817,7 +848,7 @@ instance's greatest bottleneck.
             log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
             log.explain(f"URL: {element.url}")
             soup = await self._get_page(element.url)
-            page = IliasPage(soup, element.url, element)
+            page = IliasPage(soup, element)
             if next := page.get_learning_module_data():
                 elements.extend(await self._crawl_learning_module_direction(
                     cl.path, next.previous_url, "left", element
@@ -860,7 +891,7 @@ instance's greatest bottleneck.
             log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
             log.explain(f"URL: {next_element_url}")
             soup = await self._get_page(next_element_url)
-            page = IliasPage(soup, next_element_url, parent_element)
+            page = IliasPage(soup, parent_element)
             if next := page.get_learning_module_data():
                 elements.append(next)
                 if dir == "left":
@@ -891,13 +922,13 @@ instance's greatest bottleneck.
         if prev:
             prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
             if prev_p:
-                prev = os.path.relpath(prev_p, my_path.parent)
+                prev = cast(str, os.path.relpath(prev_p, my_path.parent))
             else:
                 prev = None
         if next:
             next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
             if next_p:
-                next = os.path.relpath(next_p, my_path.parent)
+                next = cast(str, os.path.relpath(next_p, my_path.parent))
             else:
                 next = None
@@ -937,10 +968,10 @@ instance's greatest bottleneck.
             )
             self._visited_urls[element.url] = parent_path

-    async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
+    async def _get_page(self, url: str, root_page_allowed: bool = False) -> IliasSoup:
         auth_id = await self._current_auth_id()
         async with self.session.get(url) as request:
-            soup = soupify(await request.read())
+            soup = IliasSoup(soupify(await request.read()), str(request.url))
             if IliasPage.is_logged_in(soup):
                 return self._verify_page(soup, url, root_page_allowed)

@@ -949,13 +980,13 @@ instance's greatest bottleneck.
         # Retry once after authenticating. If this fails, we will die.
         async with self.session.get(url) as request:
-            soup = soupify(await request.read())
+            soup = IliasSoup(soupify(await request.read()), str(request.url))
             if IliasPage.is_logged_in(soup):
                 return self._verify_page(soup, url, root_page_allowed)
         raise CrawlError(f"get_page failed even after authenticating on {url!r}")

     @staticmethod
-    def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
+    def _verify_page(soup: IliasSoup, url: str, root_page_allowed: bool) -> IliasSoup:
         if IliasPage.is_root_page(soup) and not root_page_allowed:
             raise CrawlError(
                 "Unexpectedly encountered ILIAS root page. "
@@ -967,29 +998,19 @@ instance's greatest bottleneck.
             )
         return soup

-    async def _post_authenticated(
+    async def _post(
         self,
         url: str,
         data: dict[str, Union[str, List[str]]]
     ) -> bytes:
-        auth_id = await self._current_auth_id()
-
         form_data = aiohttp.FormData()
         for key, val in data.items():
             form_data.add_field(key, val)

-        async with self.session.post(url, data=form_data(), allow_redirects=False) as request:
+        async with self.session.post(url, data=form_data()) as request:
             if request.status == 200:
                 return await request.read()
-
-        # We weren't authenticated, so try to do that
-        await self.authenticate(auth_id)
-
-        # Retry once after authenticating. If this fails, we will die.
-        async with self.session.post(url, data=data, allow_redirects=False) as request:
-            if request.status == 200:
-                return await request.read()
-        raise CrawlError("post_authenticated failed even after authenticating")
+            raise CrawlError(f"post failed with status {request.status}")

     async def _get_authenticated(self, url: str) -> bytes:
         auth_id = await self._current_auth_id()
@@ -1019,7 +1040,7 @@ instance's greatest bottleneck.
         async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request:
             login_page = soupify(await request.read())

-        login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "formlogin"}))
+        login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "login_form"}))
         if login_form is None:
             raise CrawlError("Could not find the login form! Specified client id might be invalid.")
@@ -1029,42 +1050,12 @@ instance's greatest bottleneck.
         username, password = await self._auth.credentials()

-        login_data = {
-            "username": username,
-            "password": password,
-            "cmd[doStandardAuthentication]": "Login",
-        }
+        login_form_data = aiohttp.FormData()
+        login_form_data.add_field('login_form/input_3/input_4', username)
+        login_form_data.add_field('login_form/input_3/input_5', password)

         # do the actual login
-        async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request:
-            soup = soupify(await request.read())
-            if not self._is_logged_in(soup):
+        async with self.session.post(urljoin(self._base_url, login_url), data=login_form_data) as request:
+            soup = IliasSoup(soupify(await request.read()), str(request.url))
+            if not IliasPage.is_logged_in(soup):
                 self._auth.invalidate_credentials()
-
-    @staticmethod
-    def _is_logged_in(soup: BeautifulSoup) -> bool:
-        # Normal ILIAS pages
-        mainbar = cast(Optional[Tag], soup.find(class_="il-maincontrols-metabar"))
-        if mainbar is not None:
-            login_button = mainbar.find(attrs={"href": lambda x: x is not None and "login.php" in x})
-            shib_login = soup.find(id="button_shib_login")
-            return not login_button and not shib_login
-
-        # Personal Desktop
-        if soup.find("a", attrs={"href": lambda x: x is not None and "block_type=pditems" in x}):
-            return True
-
-        # Video listing embeds do not have complete ILIAS html. Try to match them by
-        # their video listing table
-        video_table = soup.find(
-            recursive=True,
-            name="table",
-            attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
-        )
-        if video_table is not None:
-            return True
-        # The individual video player wrapper page has nothing of the above.
-        # Match it by its playerContainer.
-        if soup.select_one("#playerContainer") is not None:
-            return True
-        return False
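A note on IliasSoup, which threads through this whole file: it pairs the parsed page with the URL it was fetched from, so IliasPage no longer needs the URL as a separate constructor argument. It is imported from kit_ilias_html, whose diff is suppressed below, so the real definition is not shown in this compare; a plausible minimal shape, purely for orientation:

from dataclasses import dataclass
from bs4 import BeautifulSoup

@dataclass
class IliasSoup:
    # Hypothetical reconstruction: the parsed HTML plus its source URL,
    # matching the call sites IliasSoup(soupify(...), str(request.url)).
    soup: BeautifulSoup
    page_url: str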

File diff suppressed because it is too large

View File

@@ -1,9 +1,8 @@
 import asyncio
 import sys
 import traceback
-from contextlib import asynccontextmanager, contextmanager
-# TODO In Python 3.9 and above, ContextManager is deprecated
-from typing import AsyncIterator, ContextManager, Iterator, List, Optional
+from contextlib import AbstractContextManager, asynccontextmanager, contextmanager
+from typing import AsyncIterator, Iterator, List, Optional

 from rich.console import Console, Group
 from rich.live import Live
@@ -261,7 +260,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
         action: str,
         text: str,
         total: Optional[float] = None,
-    ) -> ContextManager[ProgressBar]:
+    ) -> AbstractContextManager[ProgressBar]:
         """
         Allows markup in the "style" argument which will be applied to the
         "action" string.
@@ -277,7 +276,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
         action: str,
         text: str,
         total: Optional[float] = None,
-    ) -> ContextManager[ProgressBar]:
+    ) -> AbstractContextManager[ProgressBar]:
         """
         Allows markup in the "style" argument which will be applied to the
         "action" string.

View File

@@ -34,15 +34,6 @@ class MarkConflictError(Exception):
         self.collides_with = collides_with

-# TODO Use PurePath.is_relative_to when updating to 3.9
-def is_relative_to(a: PurePath, b: PurePath) -> bool:
-    try:
-        a.relative_to(b)
-        return True
-    except ValueError:
-        return False
-
 class Report:
     """
     A report of a synchronization. Includes all files found by the crawler, as
@@ -173,7 +164,7 @@ class Report:
         if path == other:
             raise MarkDuplicateError(path)
-        if is_relative_to(path, other) or is_relative_to(other, path):
+        if path.is_relative_to(other) or other.is_relative_to(path):
             raise MarkConflictError(path, other)
         self.known_files.add(path)
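PurePath.is_relative_to was added in Python 3.9, so the hand-rolled helper (and its TODO) can simply be dropped in favor of the method. For reference:

from pathlib import PurePath

assert PurePath("a/b/c").is_relative_to(PurePath("a/b"))
assert not PurePath("a/b").is_relative_to(PurePath("a/b/c"))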

View File

@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.7.0"
+VERSION = "3.8.2"

View File

@@ -17,7 +17,7 @@ Binaries for Linux, Windows and Mac can be downloaded directly from the
 ### With pip

-Ensure you have at least Python 3.9 installed. Run the following command to
+Ensure you have at least Python 3.11 installed. Run the following command to
 install PFERD or upgrade it to the latest version:

 ```

flake.lock (generated)
View File

@@ -2,16 +2,16 @@
   "nodes": {
     "nixpkgs": {
       "locked": {
-        "lastModified": 1708979614,
-        "narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
+        "lastModified": 1744440957,
+        "narHash": "sha256-FHlSkNqFmPxPJvy+6fNLaNeWnF1lZSgqVCl/eWaJRc4=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
+        "rev": "26d499fc9f1d567283d5d56fcf367edd815dba1d",
         "type": "github"
       },
       "original": {
         "owner": "NixOS",
-        "ref": "nixos-23.11",
+        "ref": "nixos-24.11",
         "repo": "nixpkgs",
         "type": "github"
       }

View File

@@ -2,7 +2,7 @@
   description = "Tool for downloading course-related files from ILIAS";

   inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
+    nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11";
   };

   outputs = { self, nixpkgs }:

View File

@@ -12,7 +12,7 @@ dependencies = [
     "certifi>=2021.10.8"
 ]
 dynamic = ["version"]
-requires-python = ">=3.9"
+requires-python = ">=3.11"

 [project.scripts]
 pferd = "PFERD.__main__:main"