Compare commits

...

4 Commits

Author SHA1 Message Date
Joscha
da627ff929 Bump version to 3.5.1 2024-04-09 14:28:56 +02:00
I-Al-Istannen
c1b592ac29 Fix ILIAS 8 file downloads truncating to zero bytes 2024-04-08 17:59:41 +02:00
I-Al-Istannen
eb0c956d32 Add compatibility with ILIAS 8 2024-04-05 19:08:05 +02:00
TornaxO7
ab0cb2d956 nix: bump nixpgs dependency 2024-02-27 23:39:53 +01:00
6 changed files with 70 additions and 44 deletions

View File

@@ -22,6 +22,11 @@ ambiguous situations.
## Unreleased
## 3.5.1 - 2024-04-09
### Added
- Support for ILIAS 8
### Fixed
- Video name deduplication

View File

@@ -95,13 +95,9 @@ class IliasPage:
@staticmethod
def is_root_page(soup: BeautifulSoup) -> bool:
permalink = soup.find(id="current_perma_link")
if permalink is None:
if permalink := IliasPage.get_soup_permalink(soup):
return "goto.php?target=root_" in permalink
return False
value = permalink.attrs.get("value")
if value is None:
return False
return "goto.php?target=root_" in value
def get_child_elements(self) -> List[IliasPageElement]:
"""
@@ -279,16 +275,14 @@ class IliasPage:
return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
def _is_content_page(self) -> bool:
link = self._soup.find(id="current_perma_link")
if not link:
if link := self.get_permalink():
return "target=copa_" in link
return False
return "target=copa_" in link.get("value")
def _is_learning_module_page(self) -> bool:
link = self._soup.find(id="current_perma_link")
if not link:
if link := self.get_permalink():
return "target=pg_" in link
return False
return "target=pg_" in link.get("value")
def _contains_collapsed_future_meetings(self) -> bool:
return self._uncollapse_future_meetings_url() is not None
@@ -513,8 +507,8 @@ class IliasPage:
modification_string = link.parent.parent.parent.select_one(
f"td.std:nth-child({index})"
).getText().strip()
if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
break
if modification_time is None:
@@ -613,7 +607,7 @@ class IliasPage:
file_listings: List[Tag] = container.findAll(
name="a",
# download links contain the given command class
attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
)
# Add each listing as a new
@@ -1095,6 +1089,9 @@ class IliasPage:
return True
return False
def get_permalink(self) -> Optional[str]:
return IliasPage.get_soup_permalink(self._soup)
def _abs_url_from_link(self, link_tag: Tag) -> str:
"""
Create an absolute url from an <a> tag.
@@ -1107,6 +1104,13 @@ class IliasPage:
"""
return urljoin(self._page_url, relative_url)
@staticmethod
def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
if not perma_link_element or not perma_link_element.get("href"):
return None
return perma_link_element.get("href")
def _unexpected_html_warning() -> None:
log.warn("Encountered unexpected HTML structure, ignoring element.")

View File

@@ -130,6 +130,7 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
raise CrawlError("Impossible return in ilias _iorepeat")
return wrapper # type: ignore
return decorator
@@ -253,8 +254,8 @@ instance's greatest bottleneck.
soup = await self._get_page(next_stage_url, root_page_allowed=True)
if current_parent is None and expected_id is not None:
perma_link_element: Tag = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value"):
perma_link = IliasPage.get_soup_permalink(soup)
if not perma_link or "crs_" not in perma_link:
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
@@ -674,12 +675,28 @@ instance's greatest bottleneck.
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
async def try_stream() -> bool:
async with self.session.get(url, allow_redirects=is_video) as resp:
next_url = url
# Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
# we can not match on the content type here. Instead, we disallow redirects and inspect the
# new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
# our authentication expired.
if not is_video:
# Redirect means we weren't authenticated
async with self.session.get(url, allow_redirects=False) as resp:
# Redirect to anything except a "sendfile" means we weren't authenticated
if hdrs.LOCATION in resp.headers:
if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
return False
# we wanted a video but got HTML
# Directly follow the redirect to not make a second, unnecessary request
next_url = resp.headers[hdrs.LOCATION]
# Let's try this again and follow redirects
return await fetch_follow_redirects(next_url)
async def fetch_follow_redirects(file_url: str) -> bool:
async with self.session.get(file_url) as resp:
# We wanted a video but got HTML => Forbidden, auth expired. Logging in won't really
# solve that depending on the setup, but it is better than nothing.
if is_video and "html" in resp.content_type:
return False

View File

@@ -1,2 +1,2 @@
NAME = "PFERD"
VERSION = "3.5.0"
VERSION = "3.5.1"

8
flake.lock generated
View File

@@ -2,16 +2,16 @@
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1694499547,
"narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=",
"lastModified": 1708979614,
"narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "e5f018cf150e29aac26c61dac0790ea023c46b24",
"rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-23.05",
"ref": "nixos-23.11",
"repo": "nixpkgs",
"type": "github"
}

View File

@@ -2,7 +2,7 @@
description = "Tool for downloading course-related files from ILIAS";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
};
outputs = { self, nixpkgs }: