Add compatibility with ILIAS 8

This commit is contained in:
I-Al-Istannen
2024-04-05 19:06:54 +02:00
parent ab0cb2d956
commit eb0c956d32
3 changed files with 46 additions and 40 deletions

View File

@ -24,6 +24,7 @@ ambiguous situations.
### Fixed ### Fixed
- Video name deduplication - Video name deduplication
- Compatibility with ILIAS 8
## 3.5.0 - 2023-09-13 ## 3.5.0 - 2023-09-13

View File

@ -17,7 +17,7 @@ TargetType = Union[str, int]
class IliasElementType(Enum): class IliasElementType(Enum):
EXERCISE = "exercise" EXERCISE = "exercise"
EXERCISE_FILES = "exercise_files" # own submitted files EXERCISE_FILES = "exercise_files" # own submitted files
TEST = "test" # an online test. Will be ignored currently. TEST = "test" # an online test. Will be ignored currently.
FILE = "file" FILE = "file"
FOLDER = "folder" FOLDER = "folder"
FORUM = "forum" FORUM = "forum"
@ -95,13 +95,9 @@ class IliasPage:
@staticmethod @staticmethod
def is_root_page(soup: BeautifulSoup) -> bool: def is_root_page(soup: BeautifulSoup) -> bool:
permalink = soup.find(id="current_perma_link") if permalink := IliasPage.get_soup_permalink(soup):
if permalink is None: return "goto.php?target=root_" in permalink
return False return False
value = permalink.attrs.get("value")
if value is None:
return False
return "goto.php?target=root_" in value
def get_child_elements(self) -> List[IliasPageElement]: def get_child_elements(self) -> List[IliasPageElement]:
""" """
@ -279,16 +275,14 @@ class IliasPage:
return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
def _is_content_page(self) -> bool: def _is_content_page(self) -> bool:
link = self._soup.find(id="current_perma_link") if link := self.get_permalink():
if not link: return "target=copa_" in link
return False return False
return "target=copa_" in link.get("value")
def _is_learning_module_page(self) -> bool: def _is_learning_module_page(self) -> bool:
link = self._soup.find(id="current_perma_link") if link := self.get_permalink():
if not link: return "target=pg_" in link
return False return False
return "target=pg_" in link.get("value")
def _contains_collapsed_future_meetings(self) -> bool: def _contains_collapsed_future_meetings(self) -> bool:
return self._uncollapse_future_meetings_url() is not None return self._uncollapse_future_meetings_url() is not None
@ -513,8 +507,8 @@ class IliasPage:
modification_string = link.parent.parent.parent.select_one( modification_string = link.parent.parent.parent.select_one(
f"td.std:nth-child({index})" f"td.std:nth-child({index})"
).getText().strip() ).getText().strip()
if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string): if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
break break
if modification_time is None: if modification_time is None:
@ -613,7 +607,7 @@ class IliasPage:
file_listings: List[Tag] = container.findAll( file_listings: List[Tag] = container.findAll(
name="a", name="a",
# download links contain the given command class # download links contain the given command class
attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x} attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
) )
# Add each listing as a new # Add each listing as a new
@ -917,9 +911,9 @@ class IliasPage:
@staticmethod @staticmethod
def _find_type_from_link( def _find_type_from_link(
element_name: str, element_name: str,
link_element: Tag, link_element: Tag,
url: str url: str
) -> Optional[IliasElementType]: ) -> Optional[IliasElementType]:
""" """
Decides which sub crawler to use for a given top level element. Decides which sub crawler to use for a given top level element.
@ -1095,6 +1089,9 @@ class IliasPage:
return True return True
return False return False
def get_permalink(self) -> Optional[str]:
return IliasPage.get_soup_permalink(self._soup)
def _abs_url_from_link(self, link_tag: Tag) -> str: def _abs_url_from_link(self, link_tag: Tag) -> str:
""" """
Create an absolute url from an <a> tag. Create an absolute url from an <a> tag.
@ -1107,6 +1104,13 @@ class IliasPage:
""" """
return urljoin(self._page_url, relative_url) return urljoin(self._page_url, relative_url)
@staticmethod
def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
if not perma_link_element or not perma_link_element.get("href"):
return None
return perma_link_element.get("href")
def _unexpected_html_warning() -> None: def _unexpected_html_warning() -> None:
log.warn("Encountered unexpected HTML structure, ignoring element.") log.warn("Encountered unexpected HTML structure, ignoring element.")
@ -1130,7 +1134,7 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti
date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I)
date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I)
date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
date_str = date_str.strip() date_str = date_str.strip()
for german, english in zip(german_months, english_months): for german, english in zip(german_months, english_months):
date_str = date_str.replace(german, english) date_str = date_str.replace(german, english)

View File

@ -12,17 +12,17 @@ import yarl
from aiohttp import hdrs from aiohttp import hdrs
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from .file_templates import Links, learning_module_template
from .ilias_html_cleaner import clean, insert_base_markup
from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)
from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection
from ...auth import Authenticator, TfaAuthenticator from ...auth import Authenticator, TfaAuthenticator
from ...config import Config from ...config import Config
from ...logging import ProgressBar, log from ...logging import ProgressBar, log
from ...output_dir import FileSink, Redownload from ...output_dir import FileSink, Redownload
from ...utils import fmt_path, soupify, url_set_query_param from ...utils import fmt_path, soupify, url_set_query_param
from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection
from .file_templates import Links, learning_module_template
from .ilias_html_cleaner import clean, insert_base_markup
from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)
TargetType = Union[str, int] TargetType = Union[str, int]
@ -130,6 +130,7 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
raise CrawlError("Impossible return in ilias _iorepeat") raise CrawlError("Impossible return in ilias _iorepeat")
return wrapper # type: ignore return wrapper # type: ignore
return decorator return decorator
@ -177,11 +178,11 @@ def _get_video_cache_key(element: IliasPageElement) -> str:
class KitIliasWebCrawler(HttpCrawler): class KitIliasWebCrawler(HttpCrawler):
def __init__( def __init__(
self, self,
name: str, name: str,
section: KitIliasWebCrawlerSection, section: KitIliasWebCrawlerSection,
config: Config, config: Config,
authenticators: Dict[str, Authenticator] authenticators: Dict[str, Authenticator]
): ):
# Setting a main authenticator for cookie sharing # Setting a main authenticator for cookie sharing
auth = section.auth(authenticators) auth = section.auth(authenticators)
@ -253,8 +254,8 @@ instance's greatest bottleneck.
soup = await self._get_page(next_stage_url, root_page_allowed=True) soup = await self._get_page(next_stage_url, root_page_allowed=True)
if current_parent is None and expected_id is not None: if current_parent is None and expected_id is not None:
perma_link_element: Tag = soup.find(id="current_perma_link") perma_link = IliasPage.get_soup_permalink(soup)
if not perma_link_element or "crs_" not in perma_link_element.get("value"): if not perma_link or "crs_" not in perma_link:
raise CrawlError("Invalid course id? Didn't find anything looking like a course") raise CrawlError("Invalid course id? Didn't find anything looking like a course")
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
@ -677,7 +678,7 @@ instance's greatest bottleneck.
async with self.session.get(url, allow_redirects=is_video) as resp: async with self.session.get(url, allow_redirects=is_video) as resp:
if not is_video: if not is_video:
# Redirect means we weren't authenticated # Redirect means we weren't authenticated
if hdrs.LOCATION in resp.headers: if hdrs.LOCATION in resp.headers and "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
return False return False
# we wanted a video but got HTML # we wanted a video but got HTML
if is_video and "html" in resp.content_type: if is_video and "html" in resp.content_type:
@ -1052,9 +1053,9 @@ class KitShibbolethLogin:
await sess.post(url, data=data) await sess.post(url, data=data)
async def _authenticate_tfa( async def _authenticate_tfa(
self, self,
session: aiohttp.ClientSession, session: aiohttp.ClientSession,
soup: BeautifulSoup soup: BeautifulSoup
) -> BeautifulSoup: ) -> BeautifulSoup:
if not self._tfa_auth: if not self._tfa_auth:
self._tfa_auth = TfaAuthenticator("ilias-anon-tfa") self._tfa_auth = TfaAuthenticator("ilias-anon-tfa")