Compare commits

...

4 Commits

Author SHA1 Message Date
Joscha
da627ff929 Bump version to 3.5.1 2024-04-09 14:28:56 +02:00
I-Al-Istannen
c1b592ac29 Fix ILIAS 8 file downloads truncating to zero bytes 2024-04-08 17:59:41 +02:00
I-Al-Istannen
eb0c956d32 Add compatibility with ILIAS 8 2024-04-05 19:08:05 +02:00
TornaxO7
ab0cb2d956 nix: bump nixpgs dependency 2024-02-27 23:39:53 +01:00
6 changed files with 70 additions and 44 deletions

View File

@@ -22,6 +22,11 @@ ambiguous situations.
## Unreleased ## Unreleased
## 3.5.1 - 2024-04-09
### Added
- Support for ILIAS 8
### Fixed ### Fixed
- Video name deduplication - Video name deduplication

View File

@@ -95,13 +95,9 @@ class IliasPage:
@staticmethod @staticmethod
def is_root_page(soup: BeautifulSoup) -> bool: def is_root_page(soup: BeautifulSoup) -> bool:
permalink = soup.find(id="current_perma_link") if permalink := IliasPage.get_soup_permalink(soup):
if permalink is None: return "goto.php?target=root_" in permalink
return False return False
value = permalink.attrs.get("value")
if value is None:
return False
return "goto.php?target=root_" in value
def get_child_elements(self) -> List[IliasPageElement]: def get_child_elements(self) -> List[IliasPageElement]:
""" """
@@ -279,16 +275,14 @@ class IliasPage:
return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
def _is_content_page(self) -> bool: def _is_content_page(self) -> bool:
link = self._soup.find(id="current_perma_link") if link := self.get_permalink():
if not link: return "target=copa_" in link
return False return False
return "target=copa_" in link.get("value")
def _is_learning_module_page(self) -> bool: def _is_learning_module_page(self) -> bool:
link = self._soup.find(id="current_perma_link") if link := self.get_permalink():
if not link: return "target=pg_" in link
return False return False
return "target=pg_" in link.get("value")
def _contains_collapsed_future_meetings(self) -> bool: def _contains_collapsed_future_meetings(self) -> bool:
return self._uncollapse_future_meetings_url() is not None return self._uncollapse_future_meetings_url() is not None
@@ -513,8 +507,8 @@ class IliasPage:
modification_string = link.parent.parent.parent.select_one( modification_string = link.parent.parent.parent.select_one(
f"td.std:nth-child({index})" f"td.std:nth-child({index})"
).getText().strip() ).getText().strip()
if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string): if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
break break
if modification_time is None: if modification_time is None:
@@ -613,7 +607,7 @@ class IliasPage:
file_listings: List[Tag] = container.findAll( file_listings: List[Tag] = container.findAll(
name="a", name="a",
# download links contain the given command class # download links contain the given command class
attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x} attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
) )
# Add each listing as a new # Add each listing as a new
@@ -1095,6 +1089,9 @@ class IliasPage:
return True return True
return False return False
def get_permalink(self) -> Optional[str]:
return IliasPage.get_soup_permalink(self._soup)
def _abs_url_from_link(self, link_tag: Tag) -> str: def _abs_url_from_link(self, link_tag: Tag) -> str:
""" """
Create an absolute url from an <a> tag. Create an absolute url from an <a> tag.
@@ -1107,6 +1104,13 @@ class IliasPage:
""" """
return urljoin(self._page_url, relative_url) return urljoin(self._page_url, relative_url)
@staticmethod
def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
if not perma_link_element or not perma_link_element.get("href"):
return None
return perma_link_element.get("href")
def _unexpected_html_warning() -> None: def _unexpected_html_warning() -> None:
log.warn("Encountered unexpected HTML structure, ignoring element.") log.warn("Encountered unexpected HTML structure, ignoring element.")

View File

@@ -130,6 +130,7 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
raise CrawlError("Impossible return in ilias _iorepeat") raise CrawlError("Impossible return in ilias _iorepeat")
return wrapper # type: ignore return wrapper # type: ignore
return decorator return decorator
@@ -253,8 +254,8 @@ instance's greatest bottleneck.
soup = await self._get_page(next_stage_url, root_page_allowed=True) soup = await self._get_page(next_stage_url, root_page_allowed=True)
if current_parent is None and expected_id is not None: if current_parent is None and expected_id is not None:
perma_link_element: Tag = soup.find(id="current_perma_link") perma_link = IliasPage.get_soup_permalink(soup)
if not perma_link_element or "crs_" not in perma_link_element.get("value"): if not perma_link or "crs_" not in perma_link:
raise CrawlError("Invalid course id? Didn't find anything looking like a course") raise CrawlError("Invalid course id? Didn't find anything looking like a course")
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
@@ -674,12 +675,28 @@ instance's greatest bottleneck.
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
async def try_stream() -> bool: async def try_stream() -> bool:
async with self.session.get(url, allow_redirects=is_video) as resp: next_url = url
# Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
# we can not match on the content type here. Instead, we disallow redirects and inspect the
# new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
# our authentication expired.
if not is_video: if not is_video:
# Redirect means we weren't authenticated async with self.session.get(url, allow_redirects=False) as resp:
# Redirect to anything except a "sendfile" means we weren't authenticated
if hdrs.LOCATION in resp.headers: if hdrs.LOCATION in resp.headers:
if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
return False return False
# we wanted a video but got HTML # Directly follow the redirect to not make a second, unnecessary request
next_url = resp.headers[hdrs.LOCATION]
# Let's try this again and follow redirects
return await fetch_follow_redirects(next_url)
async def fetch_follow_redirects(file_url: str) -> bool:
async with self.session.get(file_url) as resp:
# We wanted a video but got HTML => Forbidden, auth expired. Logging in won't really
# solve that depending on the setup, but it is better than nothing.
if is_video and "html" in resp.content_type: if is_video and "html" in resp.content_type:
return False return False

View File

@@ -1,2 +1,2 @@
NAME = "PFERD" NAME = "PFERD"
VERSION = "3.5.0" VERSION = "3.5.1"

8
flake.lock generated
View File

@@ -2,16 +2,16 @@
"nodes": { "nodes": {
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1694499547, "lastModified": 1708979614,
"narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=", "narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "e5f018cf150e29aac26c61dac0790ea023c46b24", "rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
"type": "github" "type": "github"
}, },
"original": { "original": {
"owner": "NixOS", "owner": "NixOS",
"ref": "nixos-23.05", "ref": "nixos-23.11",
"repo": "nixpkgs", "repo": "nixpkgs",
"type": "github" "type": "github"
} }

View File

@@ -2,7 +2,7 @@
description = "Tool for downloading course-related files from ILIAS"; description = "Tool for downloading course-related files from ILIAS";
inputs = { inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05"; nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
}; };
outputs = { self, nixpkgs }: outputs = { self, nixpkgs }: