Compare commits

...

8 Commits

6 changed files with 89 additions and 45 deletions

View File

@ -22,6 +22,17 @@ ambiguous situations.
## Unreleased
## 3.5.2 - 2024-04-14
### Fixed
- Crawling of personal desktop with ILIAS 8
- Crawling of empty personal desktops
## 3.5.1 - 2024-04-09
### Added
- Support for ILIAS 8
### Fixed
- Video name deduplication

View File

@ -17,7 +17,7 @@ TargetType = Union[str, int]
class IliasElementType(Enum):
EXERCISE = "exercise"
EXERCISE_FILES = "exercise_files" # own submitted files
TEST = "test" # an online test. Will be ignored currently.
TEST = "test" # an online test. Will be ignored currently.
FILE = "file"
FOLDER = "folder"
FORUM = "forum"
@ -95,13 +95,9 @@ class IliasPage:
@staticmethod
def is_root_page(soup: BeautifulSoup) -> bool:
permalink = soup.find(id="current_perma_link")
if permalink is None:
return False
value = permalink.attrs.get("value")
if value is None:
return False
return "goto.php?target=root_" in value
if permalink := IliasPage.get_soup_permalink(soup):
return "goto.php?target=root_" in permalink
return False
def get_child_elements(self) -> List[IliasPageElement]:
"""
@ -279,16 +275,14 @@ class IliasPage:
return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
def _is_content_page(self) -> bool:
link = self._soup.find(id="current_perma_link")
if not link:
return False
return "target=copa_" in link.get("value")
if link := self.get_permalink():
return "target=copa_" in link
return False
def _is_learning_module_page(self) -> bool:
link = self._soup.find(id="current_perma_link")
if not link:
return False
return "target=pg_" in link.get("value")
if link := self.get_permalink():
return "target=pg_" in link
return False
def _contains_collapsed_future_meetings(self) -> bool:
return self._uncollapse_future_meetings_url() is not None
@ -384,6 +378,10 @@ class IliasPage:
name = _sanitize_path_name(link.text.strip())
url = self._abs_url_from_link(link)
if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
# Configure button/link does not have anything interesting
continue
type = self._find_type_from_link(name, link, url)
if not type:
_unexpected_html_warning()
@ -513,8 +511,8 @@ class IliasPage:
modification_string = link.parent.parent.parent.select_one(
f"td.std:nth-child({index})"
).getText().strip()
if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
break
if modification_time is None:
@ -613,7 +611,7 @@ class IliasPage:
file_listings: List[Tag] = container.findAll(
name="a",
# download links contain the given command class
attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
)
# Add each listing as a new
@ -917,9 +915,9 @@ class IliasPage:
@staticmethod
def _find_type_from_link(
element_name: str,
link_element: Tag,
url: str
element_name: str,
link_element: Tag,
url: str
) -> Optional[IliasElementType]:
"""
Decides which sub crawler to use for a given top level element.
@ -1080,6 +1078,14 @@ class IliasPage:
if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
return True
# Empty personal desktop has zero (0) markers. Match on the text...
if alert := soup.select_one(".alert-info"):
text = alert.getText().lower()
if "you have not yet selected any favourites" in text:
return True
if "sie haben aktuell noch keine favoriten ausgewählt" in text:
return True
# Video listing embeds do not have complete ILIAS html. Try to match them by
# their video listing table
video_table = soup.find(
@ -1095,6 +1101,9 @@ class IliasPage:
return True
return False
def get_permalink(self) -> Optional[str]:
return IliasPage.get_soup_permalink(self._soup)
def _abs_url_from_link(self, link_tag: Tag) -> str:
"""
Create an absolute url from an <a> tag.
@ -1107,6 +1116,13 @@ class IliasPage:
"""
return urljoin(self._page_url, relative_url)
@staticmethod
def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
if not perma_link_element or not perma_link_element.get("href"):
return None
return perma_link_element.get("href")
def _unexpected_html_warning() -> None:
log.warn("Encountered unexpected HTML structure, ignoring element.")
@ -1130,7 +1146,7 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti
date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I)
date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I)
date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
date_str = date_str.strip()
for german, english in zip(german_months, english_months):
date_str = date_str.replace(german, english)

View File

@ -130,6 +130,7 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
raise CrawlError("Impossible return in ilias _iorepeat")
return wrapper # type: ignore
return decorator
@ -177,11 +178,11 @@ def _get_video_cache_key(element: IliasPageElement) -> str:
class KitIliasWebCrawler(HttpCrawler):
def __init__(
self,
name: str,
section: KitIliasWebCrawlerSection,
config: Config,
authenticators: Dict[str, Authenticator]
self,
name: str,
section: KitIliasWebCrawlerSection,
config: Config,
authenticators: Dict[str, Authenticator]
):
# Setting a main authenticator for cookie sharing
auth = section.auth(authenticators)
@ -227,7 +228,7 @@ instance's greatest bottleneck.
await self._crawl_url(root_url, expected_id=course_id)
async def _crawl_desktop(self) -> None:
appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items"
appendix = r"ILIAS\Repository\Provider\RepositoryMainBarProvider|mm_pd_sel_items"
appendix = appendix.encode("ASCII").hex()
await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix)
@ -253,8 +254,8 @@ instance's greatest bottleneck.
soup = await self._get_page(next_stage_url, root_page_allowed=True)
if current_parent is None and expected_id is not None:
perma_link_element: Tag = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value"):
perma_link = IliasPage.get_soup_permalink(soup)
if not perma_link or "crs_" not in perma_link:
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
@ -674,12 +675,28 @@ instance's greatest bottleneck.
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
async def try_stream() -> bool:
async with self.session.get(url, allow_redirects=is_video) as resp:
if not is_video:
# Redirect means we weren't authenticated
next_url = url
# Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
# we can not match on the content type here. Instead, we disallow redirects and inspect the
# new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
# our authentication expired.
if not is_video:
async with self.session.get(url, allow_redirects=False) as resp:
# Redirect to anything except a "sendfile" means we weren't authenticated
if hdrs.LOCATION in resp.headers:
return False
# we wanted a video but got HTML
if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
return False
# Directly follow the redirect to not make a second, unnecessary request
next_url = resp.headers[hdrs.LOCATION]
# Let's try this again and follow redirects
return await fetch_follow_redirects(next_url)
async def fetch_follow_redirects(file_url: str) -> bool:
async with self.session.get(file_url) as resp:
# We wanted a video but got HTML => Forbidden, auth expired. Logging in won't really
# solve that depending on the setup, but it is better than nothing.
if is_video and "html" in resp.content_type:
return False
@ -1052,9 +1069,9 @@ class KitShibbolethLogin:
await sess.post(url, data=data)
async def _authenticate_tfa(
self,
session: aiohttp.ClientSession,
soup: BeautifulSoup
self,
session: aiohttp.ClientSession,
soup: BeautifulSoup
) -> BeautifulSoup:
if not self._tfa_auth:
self._tfa_auth = TfaAuthenticator("ilias-anon-tfa")

View File

@ -1,2 +1,2 @@
NAME = "PFERD"
VERSION = "3.5.0"
VERSION = "3.5.2"

8
flake.lock generated
View File

@ -2,16 +2,16 @@
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1694499547,
"narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=",
"lastModified": 1708979614,
"narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "e5f018cf150e29aac26c61dac0790ea023c46b24",
"rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-23.05",
"ref": "nixos-23.11",
"repo": "nixpkgs",
"type": "github"
}

View File

@ -2,7 +2,7 @@
description = "Tool for downloading course-related files from ILIAS";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
};
outputs = { self, nixpkgs }: