diff --git a/CHANGELOG.md b/CHANGELOG.md index 0443d50..df4fcf5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Fixed - Video name deduplication +- Compatibility with ILIAS 8 ## 3.5.0 - 2023-09-13 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d23141f..0be6448 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -17,7 +17,7 @@ TargetType = Union[str, int] class IliasElementType(Enum): EXERCISE = "exercise" EXERCISE_FILES = "exercise_files" # own submitted files - TEST = "test" # an online test. Will be ignored currently. + TEST = "test" # an online test. Will be ignored currently. FILE = "file" FOLDER = "folder" FORUM = "forum" @@ -95,13 +95,9 @@ class IliasPage: @staticmethod def is_root_page(soup: BeautifulSoup) -> bool: - permalink = soup.find(id="current_perma_link") - if permalink is None: - return False - value = permalink.attrs.get("value") - if value is None: - return False - return "goto.php?target=root_" in value + if permalink := IliasPage.get_soup_permalink(soup): + return "goto.php?target=root_" in permalink + return False def get_child_elements(self) -> List[IliasPageElement]: """ @@ -279,16 +275,14 @@ class IliasPage: return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) def _is_content_page(self) -> bool: - link = self._soup.find(id="current_perma_link") - if not link: - return False - return "target=copa_" in link.get("value") + if link := self.get_permalink(): + return "target=copa_" in link + return False def _is_learning_module_page(self) -> bool: - link = self._soup.find(id="current_perma_link") - if not link: - return False - return "target=pg_" in link.get("value") + if link := self.get_permalink(): + return "target=pg_" in link + return False def _contains_collapsed_future_meetings(self) -> bool: return self._uncollapse_future_meetings_url() is not None @@ -513,8 +507,8 @@ class 
IliasPage: modification_string = link.parent.parent.parent.select_one( f"td.std:nth-child({index})" ).getText().strip() - if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string): - modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") + if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string): + modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M") break if modification_time is None: @@ -613,7 +607,7 @@ class IliasPage: file_listings: List[Tag] = container.findAll( name="a", # download links contain the given command class - attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x} + attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()} ) # Add each listing as a new @@ -917,9 +911,9 @@ class IliasPage: @staticmethod def _find_type_from_link( - element_name: str, - link_element: Tag, - url: str + element_name: str, + link_element: Tag, + url: str ) -> Optional[IliasElementType]: """ Decides which sub crawler to use for a given top level element. @@ -1095,6 +1089,9 @@ class IliasPage: return True return False + def get_permalink(self) -> Optional[str]: + return IliasPage.get_soup_permalink(self._soup) + def _abs_url_from_link(self, link_tag: Tag) -> str: """ Create an absolute url from an <a> tag.
@@ -1107,6 +1104,13 @@ class IliasPage: """ return urljoin(self._page_url, relative_url) + @staticmethod + def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]: + perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a") + if not perma_link_element or not perma_link_element.get("href"): + return None + return perma_link_element.get("href") + def _unexpected_html_warning() -> None: log.warn("Encountered unexpected HTML structure, ignoring element.") @@ -1130,7 +1134,7 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) - date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) + date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) date_str = date_str.strip() for german, english in zip(german_months, english_months): date_str = date_str.replace(german, english) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index ac1f10d..52de793 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -12,17 +12,17 @@ import yarl from aiohttp import hdrs from bs4 import BeautifulSoup, Tag +from .file_templates import Links, learning_module_template +from .ilias_html_cleaner import clean, insert_base_markup +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, + IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) +from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical +from ..http_crawler import HttpCrawler, HttpCrawlerSection from ...auth import Authenticator, TfaAuthenticator from ...config import Config from ...logging import ProgressBar, log from ...output_dir import FileSink, 
Redownload from ...utils import fmt_path, soupify, url_set_query_param -from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical -from ..http_crawler import HttpCrawler, HttpCrawlerSection -from .file_templates import Links, learning_module_template -from .ilias_html_cleaner import clean, insert_base_markup -from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, - IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -130,6 +130,7 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla raise CrawlError("Impossible return in ilias _iorepeat") return wrapper # type: ignore + return decorator @@ -177,11 +178,11 @@ def _get_video_cache_key(element: IliasPageElement) -> str: class KitIliasWebCrawler(HttpCrawler): def __init__( - self, - name: str, - section: KitIliasWebCrawlerSection, - config: Config, - authenticators: Dict[str, Authenticator] + self, + name: str, + section: KitIliasWebCrawlerSection, + config: Config, + authenticators: Dict[str, Authenticator] ): # Setting a main authenticator for cookie sharing auth = section.auth(authenticators) @@ -253,8 +254,8 @@ instance's greatest bottleneck. soup = await self._get_page(next_stage_url, root_page_allowed=True) if current_parent is None and expected_id is not None: - perma_link_element: Tag = soup.find(id="current_perma_link") - if not perma_link_element or "crs_" not in perma_link_element.get("value"): + perma_link = IliasPage.get_soup_permalink(soup) + if not perma_link or "crs_" not in perma_link: raise CrawlError("Invalid course id? Didn't find anything looking like a course") log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") @@ -677,7 +678,7 @@ instance's greatest bottleneck. 
async with self.session.get(url, allow_redirects=is_video) as resp: if not is_video: # Redirect means we weren't authenticated - if hdrs.LOCATION in resp.headers: + if hdrs.LOCATION in resp.headers and "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]: return False # we wanted a video but got HTML if is_video and "html" in resp.content_type: @@ -1052,9 +1053,9 @@ class KitShibbolethLogin: await sess.post(url, data=data) async def _authenticate_tfa( - self, - session: aiohttp.ClientSession, - soup: BeautifulSoup + self, + session: aiohttp.ClientSession, + soup: BeautifulSoup ) -> BeautifulSoup: if not self._tfa_auth: self._tfa_auth = TfaAuthenticator("ilias-anon-tfa")