diff --git a/PFERD/auth/keyring.py b/PFERD/auth/keyring.py index c14f6fb..02a9269 100644 --- a/PFERD/auth/keyring.py +++ b/PFERD/auth/keyring.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple +from typing import Optional, Tuple, cast import keyring @@ -13,7 +13,7 @@ class KeyringAuthSection(AuthSection): return self.s.get("username") def keyring_name(self) -> str: - return self.s.get("keyring_name", fallback=NAME) + return cast(str, self.s.get("keyring_name", fallback=NAME)) class KeyringAuthenticator(Authenticator): diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index 2cc97e1..1c4631c 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -3,7 +3,7 @@ import http.cookies import ssl from datetime import datetime from pathlib import Path, PurePath -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import aiohttp import certifi @@ -187,12 +187,12 @@ class HttpCrawler(Crawler): if level == 0 or (level == 1 and drop_h1): return PurePath() - level_heading = tag.find_previous(name=f"h{level}") + level_heading = cast(Optional[Tag], tag.find_previous(name=f"h{level}")) if level_heading is None: return find_associated_headings(tag, level - 1) - folder_name = level_heading.getText().strip() + folder_name = level_heading.get_text().strip() return find_associated_headings(level_heading, level - 1) / folder_name # start at level

because paragraph-level headings are usually too granular for folder names @@ -231,6 +231,7 @@ class HttpCrawler(Crawler): etag_header = resp.headers.get("ETag") last_modified_header = resp.headers.get("Last-Modified") + last_modified = None if last_modified_header: try: diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index b206461..0a72199 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Optional +from typing import Optional, cast import bs4 @@ -139,13 +139,13 @@ def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next """ if prev and body.select_one(".ilc_page_lnav_LeftNavigation"): - text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip() + text = cast(bs4.Tag, body.select_one(".ilc_page_lnav_LeftNavigation")).get_text().strip() left = f'{text}' else: left = "" if next and body.select_one(".ilc_page_rnav_RightNavigation"): - text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip() + text = cast(bs4.Tag, body.select_one(".ilc_page_rnav_RightNavigation")).get_text().strip() right = f'{text}' else: right = "" @@ -160,8 +160,8 @@ def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next "{{left}}", left).replace("{{right}}", right).encode()) ) - body = body.prettify() - return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name) + body_str = cast(str, body.prettify()) + return _learning_module_template.replace("{{body}}", body_str).replace("{{name}}", name) class Links(Enum): diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py index e82906f..fb35bc0 100644 --- a/PFERD/crawl/ilias/ilias_html_cleaner.py +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -1,3 +1,5 @@ +from typing import cast + from bs4 import BeautifulSoup, Comment, Tag _STYLE_TAG_CONTENT = """ @@ -70,18 +72,18 @@ def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup: def clean(soup: BeautifulSoup) -> BeautifulSoup: - for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES): + for block in cast(list[Tag], soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES)): block.name = "article" - for block in soup.find_all("h3"): + for block in cast(list[Tag], soup.find_all("h3")): block.name = "div" - for block in soup.find_all("h1"): + for block in cast(list[Tag], soup.find_all("h1")): block.name = "h3" - for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"): + for block in cast(list[Tag], soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap")): block.name = "h3" - block["class"] += ["accordion-head"] + block["class"] += ["accordion-head"] # type: ignore for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"): children = list(dummy.children) @@ -97,7 +99,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: if figure := video.find_parent("figure"): figure.decompose() - for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): + for hrule_imposter in cast(list[Tag], soup.find_all(class_="ilc_section_Separator")): hrule_imposter.insert(0, soup.new_tag("hr")) return soup diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index 2fc399d..557150c 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -257,6 +257,7 @@ instance's greatest bottleneck. async with cl: next_stage_url: Optional[str] = url current_parent = current_element + page = None while next_stage_url: soup = await self._get_page(next_stage_url) @@ -278,6 +279,7 @@ instance's greatest bottleneck. else: next_stage_url = None + page = cast(IliasPage, page) elements.extend(page.get_child_elements()) if description_string := page.get_description(): description.append(description_string) @@ -461,10 +463,10 @@ instance's greatest bottleneck. if not dl: return - async with dl as (bar, sink): + async with dl as (_bar, sink): description = clean(insert_base_markup(description)) - description = await self.internalize_images(description) - sink.file.write(description.prettify().encode("utf-8")) + description_tag = await self.internalize_images(description) + sink.file.write(cast(str, description_tag.prettify()).encode("utf-8")) sink.done() @anoncritical @@ -483,7 +485,7 @@ instance's greatest bottleneck. async with self.session.get(export_url, allow_redirects=False) as resp: # No redirect means we were authenticated if hdrs.LOCATION not in resp.headers: - return soupify(await resp.read()).select_one("a").get("href").strip() + return soupify(await resp.read()).select_one("a").get("href").strip() # type: ignore # We are either unauthenticated or the link is not active new_url = resp.headers[hdrs.LOCATION].lower() if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url: @@ -707,6 +709,8 @@ instance's greatest bottleneck. async with cl: next_stage_url = element.url + page = None + while next_stage_url: log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") log.explain(f"URL: {next_stage_url}") @@ -719,7 +723,7 @@ instance's greatest bottleneck. else: break - download_data = page.get_download_forum_data() + download_data = cast(IliasPage, page).get_download_forum_data() if not download_data: raise CrawlWarning("Failed to extract forum data") if download_data.empty: @@ -751,8 +755,8 @@ instance's greatest bottleneck. async with maybe_dl as (bar, sink): content = "\n" - content += element.title_tag.prettify() - content += element.content_tag.prettify() + content += cast(str, element.title_tag.prettify()) + content += cast(str, element.content_tag.prettify()) sink.file.write(content.encode("utf-8")) sink.done() @@ -877,15 +881,15 @@ instance's greatest bottleneck. continue if elem.name == "img": if src := elem.attrs.get("src", None): - url = urljoin(self._base_url, src) + url = urljoin(self._base_url, cast(str, src)) if not url.startswith(self._base_url): continue log.explain(f"Internalizing {url!r}") img = await self._get_authenticated(url) elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode() - if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"): + if elem.name == "iframe" and cast(str, elem.attrs.get("src", "")).startswith("//"): # For unknown reasons the protocol seems to be stripped. - elem.attrs["src"] = "https:" + elem.attrs["src"] + elem.attrs["src"] = "https:" + cast(str, elem.attrs["src"]) return tag def _ensure_not_seen(self, element: IliasPageElement, parent_path: PurePath) -> None: @@ -979,11 +983,11 @@ instance's greatest bottleneck. async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request: login_page = soupify(await request.read()) - login_form = login_page.find("form", attrs={"name": "formlogin"}) + login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "formlogin"})) if login_form is None: raise CrawlError("Could not find the login form! Specified client id might be invalid.") - login_url = login_form.attrs.get("action") + login_url = cast(Optional[str], login_form.attrs.get("action")) if login_url is None: raise CrawlError("Could not find the action URL in the login form!") @@ -1004,14 +1008,14 @@ instance's greatest bottleneck. @staticmethod def _is_logged_in(soup: BeautifulSoup) -> bool: # Normal ILIAS pages - mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") + mainbar = cast(Optional[Tag], soup.find(class_="il-maincontrols-metabar")) if mainbar is not None: - login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) + login_button = mainbar.find(attrs={"href": lambda x: x is not None and "login.php" in x}) shib_login = soup.find(id="button_shib_login") return not login_button and not shib_login # Personal Desktop - if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): + if soup.find("a", attrs={"href": lambda x: x is not None and "block_type=pditems" in x}): return True # Video listing embeds do not have complete ILIAS html. Try to match them by diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 57c81e5..ee61cab 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum -from typing import Dict, List, Optional, Union, cast +from typing import Dict, Optional, Union, cast from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag @@ -117,7 +117,7 @@ class IliasPageElement: @dataclass class IliasDownloadForumData: url: str - form_data: Dict[str, Union[str, List[str]]] + form_data: Dict[str, Union[str, list[str]]] empty: bool @@ -151,7 +151,7 @@ class IliasPage: return "goto.php?target=root_" in permalink return False - def get_child_elements(self) -> List[IliasPageElement]: + def get_child_elements(self) -> list[IliasPageElement]: """ Return all child page elements you can find here. """ @@ -177,10 +177,10 @@ class IliasPage: return self._find_normal_entries() def get_info_tab(self) -> Optional[IliasPageElement]: - tab: Optional[Tag] = self._soup.find( + tab: Optional[Tag] = cast(Optional[Tag], self._soup.find( name="a", - attrs={"href": lambda x: x and "cmdClass=ilinfoscreengui" in x} - ) + attrs={"href": lambda x: x is not None and "cmdClass=ilinfoscreengui" in x} + )) if tab is not None: return IliasPageElement.create_new( IliasElementType.INFO_TAB, @@ -193,7 +193,7 @@ class IliasPage: def is_interesting_class(name: str) -> bool: return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] - paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class) + paragraphs: list[Tag] = cast(list[Tag], self._soup.find_all(class_=is_interesting_class)) if not paragraphs: return None @@ -217,8 +217,8 @@ class IliasPage: def get_learning_module_data(self) -> Optional[IliasLearningModulePage]: if not self._is_learning_module_page(): return None - content = self._soup.select_one("#ilLMPageContent") - title = self._soup.select_one(".ilc_page_title_PageTitle").getText().strip() + content = cast(Tag, self._soup.select_one("#ilLMPageContent")) + title = cast(Tag, self._soup.select_one(".ilc_page_title_PageTitle")).get_text().strip() return IliasLearningModulePage( title=title, content=content, @@ -243,15 +243,18 @@ class IliasPage: return None def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: - form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) + form = cast(Optional[Tag], self._soup.find( + "form", + attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x} + )) if not form: return None - post_url = self._abs_url_from_relative(form["action"]) + post_url = self._abs_url_from_relative(cast(str, form["action"])) - thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] + thread_ids = [f["value"] for f in cast(list[Tag], form.find_all(attrs={"name": "thread_ids[]"}))] - form_data: Dict[str, Union[str, List[str]]] = { - "thread_ids[]": thread_ids, + form_data: Dict[str, Union[str, list[str]]] = { + "thread_ids[]": cast(list[str], thread_ids), "selected_cmd2": "html", "select_cmd2": "Ausführen", "selected_cmd": "", @@ -285,7 +288,7 @@ class IliasPage: def _is_forum_page(self) -> bool: read_more_btn = self._soup.find( "button", - attrs={"onclick": lambda x: x and "cmdClass=ilobjforumgui&cmd=markAllRead" in x} + attrs={"onclick": lambda x: x is not None and "cmdClass=ilobjforumgui&cmd=markAllRead" in x} ) return read_more_btn is not None @@ -297,7 +300,7 @@ class IliasPage: return True # Raw listing without ILIAS fluff - video_element_table: Tag = self._soup.find( + video_element_table = self._soup.find( name="table", id=re.compile(r"tbl_xoct_.+") ) return video_element_table is not None @@ -305,8 +308,8 @@ class IliasPage: def _is_ilias_opencast_embedding(self) -> bool: # ILIAS fluff around the real opencast html if self._soup.find(id="headerimage"): - element: Tag = self._soup.find(id="headerimage") - if "opencast" in element.attrs["src"].lower(): + element: Tag = cast(Tag, self._soup.find(id="headerimage")) + if "opencast" in cast(str, element.attrs["src"]).lower(): return True return False @@ -317,8 +320,8 @@ class IliasPage: # We have no suitable parent - let's guesss if self._soup.find(id="headerimage"): - element: Tag = self._soup.find(id="headerimage") - if "exc" in element.attrs["src"].lower(): + element: Tag = cast(Tag, self._soup.find(id="headerimage")) + if "exc" in cast(str, element.attrs["src"]).lower(): return True return False @@ -340,10 +343,10 @@ class IliasPage: return self._uncollapse_future_meetings_url() is not None def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: - element = self._soup.find( + element = cast(Optional[Tag], self._soup.find( "a", - attrs={"href": lambda x: x and ("crs_next_sess=1" in x or "crs_prev_sess=1" in x)} - ) + attrs={"href": lambda x: x is not None and ("crs_next_sess=1" in x or "crs_prev_sess=1" in x)} + )) if not element: return None link = self._abs_url_from_link(element) @@ -360,24 +363,24 @@ class IliasPage: return "baseClass=ilmembershipoverviewgui" in self._page_url def _select_content_page_url(self) -> Optional[IliasPageElement]: - tab = self._soup.find( + tab = cast(Optional[Tag], self._soup.find( id="tab_view_content", attrs={"class": lambda x: x is not None and "active" not in x} - ) + )) # Already selected (or not found) if not tab: return None - link = tab.find("a") + link = cast(Optional[Tag], tab.find("a")) if link: - link = self._abs_url_from_link(link) - return IliasPageElement.create_new(IliasElementType.FOLDER, link, "select content page") + link_str = self._abs_url_from_link(link) + return IliasPageElement.create_new(IliasElementType.FOLDER, link_str, "select content page") _unexpected_html_warning() log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.") log.warn_contd("PFERD might not find content on the course's main page.") return None - def _player_to_video(self) -> List[IliasPageElement]: + def _player_to_video(self) -> list[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. The actual video stream url is nowhere # on the page, but defined in a JS object inside a script tag, passed to the player @@ -414,10 +417,10 @@ class IliasPage: return items def _get_show_max_forum_entries_per_page_url(self) -> Optional[IliasPageElement]: - correct_link = self._soup.find( + correct_link = cast(Optional[Tag], self._soup.find( "a", - attrs={"href": lambda x: x and "trows=800" in x and "cmd=showThreads" in x} - ) + attrs={"href": lambda x: x is not None and "trows=800" in x and "cmd=showThreads" in x} + )) if not correct_link: return None @@ -426,15 +429,15 @@ class IliasPage: return IliasPageElement.create_new(IliasElementType.FORUM, link, "show all forum threads") - def _find_personal_desktop_entries(self) -> List[IliasPageElement]: - items: List[IliasPageElement] = [] + def _find_personal_desktop_entries(self) -> list[IliasPageElement]: + items: list[IliasPageElement] = [] - titles: List[Tag] = self._soup.select("#block_pditems_0 .il-item-title") + titles: list[Tag] = self._soup.select("#block_pditems_0 .il-item-title") for title in titles: - link = title.find("a") + link = cast(Optional[Tag], title.find("a")) if not link: - log.explain(f"Skipping offline item: {title.getText().strip()!r}") + log.explain(f"Skipping offline item: {title.get_text().strip()!r}") continue name = _sanitize_path_name(link.text.strip()) @@ -460,13 +463,13 @@ class IliasPage: return items - def _find_copa_entries(self) -> List[IliasPageElement]: - items: List[IliasPageElement] = [] - links: List[Tag] = self._soup.findAll(class_="ilc_flist_a_FileListItemLink") + def _find_copa_entries(self) -> list[IliasPageElement]: + items: list[IliasPageElement] = [] + links: list[Tag] = cast(list[Tag], self._soup.find_all(class_="ilc_flist_a_FileListItemLink")) for link in links: url = self._abs_url_from_link(link) - name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.getText()).strip().replace("\t", "") + name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.get_text()).strip().replace("\t", "") name = _sanitize_path_name(name) if "file_id" not in url: @@ -478,9 +481,9 @@ class IliasPage: return items - def _find_info_tab_entries(self) -> List[IliasPageElement]: + def _find_info_tab_entries(self) -> list[IliasPageElement]: items = [] - links: List[Tag] = self._soup.select("a.il_ContainerItemCommand") + links: list[Tag] = self._soup.select("a.il_ContainerItemCommand") for link in links: if "cmdClass=ilobjcoursegui" not in link["href"]: @@ -490,12 +493,12 @@ class IliasPage: items.append(IliasPageElement.create_new( IliasElementType.FILE, self._abs_url_from_link(link), - _sanitize_path_name(link.getText()) + _sanitize_path_name(link.get_text()) )) return items - def _find_opencast_video_entries(self) -> List[IliasPageElement]: + def _find_opencast_video_entries(self) -> list[IliasPageElement]: # ILIAS has three stages for video pages # 1. The initial dummy page without any videos. This page contains the link to the listing # 2. The video listing which might be paginated @@ -503,14 +506,14 @@ class IliasPage: # # We need to figure out where we are. - video_element_table: Tag = self._soup.find( + video_element_table = cast(Optional[Tag], self._soup.find( name="table", id=re.compile(r"tbl_xoct_.+") - ) + )) if video_element_table is None: # We are in stage 1 # The page is actually emtpy but contains the link to stage 2 - content_link: Tag = self._soup.select_one("#tab_series a") + content_link: Tag = cast(Tag, self._soup.select_one("#tab_series a")) url: str = self._abs_url_from_link(content_link) query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} url = url_set_query_params(url, query_params) @@ -527,14 +530,14 @@ class IliasPage: return self._find_opencast_video_entries_no_paging() - def _find_opencast_video_entries_paginated(self) -> List[IliasPageElement]: - table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) + def _find_opencast_video_entries_paginated(self) -> list[IliasPageElement]: + table_element = cast(Optional[Tag], self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+"))) if table_element is None: log.warn("Couldn't increase elements per page (table not found). I might miss elements.") return self._find_opencast_video_entries_no_paging() - id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) + id_match = re.match(r"tbl_xoct_(.+)", cast(str, table_element.attrs["id"])) if id_match is None: log.warn("Couldn't increase elements per page (table id not found). I might miss elements.") return self._find_opencast_video_entries_no_paging() @@ -548,16 +551,16 @@ class IliasPage: log.explain("Disabled pagination, retrying folder as a new entry") return [IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")] - def _find_opencast_video_entries_no_paging(self) -> List[IliasPageElement]: + def _find_opencast_video_entries_no_paging(self) -> list[IliasPageElement]: """ Crawls the "second stage" video page. This page contains the actual video urls. """ # Video start links are marked with an "Abspielen" link - video_links: List[Tag] = self._soup.findAll( + video_links = cast(list[Tag], self._soup.find_all( name="a", text=re.compile(r"\s*(Abspielen|Play)\s*") - ) + )) - results: List[IliasPageElement] = [] + results: list[IliasPageElement] = [] for link in video_links: results.append(self._listed_opencast_video_to_element(link)) @@ -569,12 +572,12 @@ class IliasPage: # 6th or 7th child (1 indexed) is the modification time string. Try to find it # by parsing backwards from the end and finding something that looks like a date modification_time = None - row: Tag = link.parent.parent.parent + row: Tag = link.parent.parent.parent # type: ignore column_count = len(row.select("td.std")) for index in range(column_count, 0, -1): - modification_string = link.parent.parent.parent.select_one( + modification_string = link.parent.parent.parent.select_one( # type: ignore f"td.std:nth-child({index})" - ).getText().strip() + ).get_text().strip() if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string): modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M") break @@ -583,7 +586,7 @@ class IliasPage: log.warn(f"Could not determine upload time for {link}") modification_time = datetime.now() - title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip() + title = link.parent.parent.parent.select_one("td.std:nth-child(3)").get_text().strip() # type: ignore title += ".mp4" video_name: str = _sanitize_path_name(title) @@ -595,33 +598,34 @@ class IliasPage: IliasElementType.OPENCAST_VIDEO_PLAYER, video_url, video_name, modification_time ) - def _find_exercise_entries(self) -> List[IliasPageElement]: + def _find_exercise_entries(self) -> list[IliasPageElement]: if self._soup.find(id="tab_submission"): log.explain("Found submission tab. This is an exercise detail page") return self._find_exercise_entries_detail_page() log.explain("Found no submission tab. This is an exercise root page") return self._find_exercise_entries_root_page() - def _find_exercise_entries_detail_page(self) -> List[IliasPageElement]: - results: List[IliasPageElement] = [] + def _find_exercise_entries_detail_page(self) -> list[IliasPageElement]: + results: list[IliasPageElement] = [] # Find all download links in the container (this will contain all the files) - download_links: List[Tag] = self._soup.findAll( + download_links = cast(list[Tag], self._soup.find_all( name="a", # download links contain the given command class - attrs={"href": lambda x: x and "cmd=download" in x}, + attrs={"href": lambda x: x is not None and "cmd=download" in x}, text="Download" - ) + )) for link in download_links: - parent_row: Tag = link.findParent("tr") - children: List[Tag] = parent_row.findChildren("td") + parent_row: Tag = cast(Tag, link.find_parent("tr")) + children = cast(list[Tag], parent_row.find_all("td")) - name = _sanitize_path_name(children[1].getText().strip()) + name = _sanitize_path_name(children[1].get_text().strip()) log.explain(f"Found exercise detail entry {name!r}") + date = None for child in reversed(children): - date = demangle_date(child.getText().strip(), fail_silently=True) + date = demangle_date(child.get_text().strip(), fail_silently=True) if date is not None: break if date is None: @@ -636,30 +640,33 @@ class IliasPage: return results - def _find_exercise_entries_root_page(self) -> List[IliasPageElement]: - results: List[IliasPageElement] = [] + def _find_exercise_entries_root_page(self) -> list[IliasPageElement]: + results: list[IliasPageElement] = [] # Each assignment is in an accordion container - assignment_containers: List[Tag] = self._soup.select(".il_VAccordionInnerContainer") + assignment_containers: list[Tag] = self._soup.select(".il_VAccordionInnerContainer") for container in assignment_containers: # Fetch the container name out of the header to use it in the path - container_name = container.select_one(".ilAssignmentHeader").getText().strip() + container_name = cast(Tag, container.select_one(".ilAssignmentHeader")).get_text().strip() log.explain(f"Found exercise container {container_name!r}") # Find all download links in the container (this will contain all the files) - files: List[Tag] = container.findAll( + files = cast(list[Tag], container.find_all( name="a", # download links contain the given command class - attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, + attrs={"href": lambda x: x is not None and "cmdClass=ilexsubmissiongui" in x}, text="Download" - ) + )) # Grab each file as you now have the link for file_link in files: # Two divs, side by side. Left is the name, right is the link ==> get left # sibling - file_name = file_link.parent.findPrevious(name="div").getText().strip() + file_name = cast( + Tag, + cast(Tag, file_link.parent).find_previous(name="div") + ).get_text().strip() url = self._abs_url_from_link(file_link) log.explain(f"Found exercise entry {file_name!r}") @@ -672,21 +679,21 @@ class IliasPage: )) # Find all links to file listings (e.g. "Submitted Files" for groups) - file_listings: List[Tag] = container.findAll( + file_listings = cast(list[Tag], container.find_all( name="a", # download links contain the given command class - attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()} - ) + attrs={"href": lambda x: x is not None and "cmdclass=ilexsubmissionfilegui" in x.lower()} + )) # Add each listing as a new for listing in file_listings: - parent_container: Tag = listing.findParent( - "div", attrs={"class": lambda x: x and "form-group" in x} - ) - label_container: Tag = parent_container.find( - attrs={"class": lambda x: x and "control-label" in x} - ) - file_name = label_container.getText().strip() + parent_container = cast(Tag, listing.find_parent( + "div", attrs={"class": lambda x: x is not None and "form-group" in x} + )) + label_container = cast(Tag, parent_container.find( + attrs={"class": lambda x: x is not None and "control-label" in x} + )) + file_name = label_container.get_text().strip() url = self._abs_url_from_link(listing) log.explain(f"Found exercise detail {file_name!r} at {url}") results.append(IliasPageElement.create_new( @@ -699,10 +706,10 @@ class IliasPage: return results - def _find_normal_entries(self) -> List[IliasPageElement]: - result: List[IliasPageElement] = [] + def _find_normal_entries(self) -> list[IliasPageElement]: + result: list[IliasPageElement] = [] - links: List[Tag] = [] + links: list[Tag] = [] # Fetch all links and throw them to the general interpreter if self._is_course_overview_page(): log.explain("Page is a course overview page, adjusting link selector") @@ -716,9 +723,9 @@ class IliasPage: parents = [_sanitize_path_name(x) for x in self._find_upwards_folder_hierarchy(link)] if parents: - element_name = "/".join(parents) + "/" + _sanitize_path_name(link.getText()) + element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text()) else: - element_name = _sanitize_path_name(link.getText()) + element_name = _sanitize_path_name(link.get_text()) element_type = self._find_type_from_link(element_name, link, abs_url) description = self._find_link_description(link) @@ -750,17 +757,17 @@ class IliasPage: return result - def _find_mediacast_videos(self) -> List[IliasPageElement]: - videos: List[IliasPageElement] = [] + def _find_mediacast_videos(self) -> list[IliasPageElement]: + videos: list[IliasPageElement] = [] - for elem in cast(List[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")): + for elem in cast(list[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")): element_name = _sanitize_path_name( - elem.select_one(".ilPlayerPreviewDescription").getText().strip() + cast(Tag, elem.select_one(".ilPlayerPreviewDescription")).get_text().strip() ) if not element_name.endswith(".mp4"): # just to make sure it has some kinda-alrightish ending element_name = element_name + ".mp4" - video_element = elem.find(name="video") + video_element = cast(Optional[Tag], elem.find(name="video")) if not video_element: _unexpected_html_warning() log.warn_contd(f"No