Bump version to 3.5.1

Fix ILIAS 8 file downloads truncating to zero bytes
Add compatibility with ILIAS 8
2025-07-15 23:42:35 +02:00 · 2024-04-09 14:28:56 +02:00 · 2024-04-08 17:59:41 +02:00 · 2024-04-05 19:08:05 +02:00 · 2024-02-27 23:39:53 +01:00
8 changed files with 75 additions and 69 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,11 @@ ambiguous situations.
 ## Unreleased
 ## 3.5.1 - 2024-04-09
 ### Added
 - Support for ILIAS 8
 ### Fixed
 - Video name deduplication
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -92,9 +92,6 @@ common to all crawlers:
  load for the crawl target. (Default: `0.0`)
 - `windows_paths`: Whether PFERD should find alternative names for paths that
  are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
 - `aliases`: List of strings that are considered as an alias when invoking with
  the `--crawler` or `-C` flag. If there is more than one crawl section with
  the same aliases all are selected. Thereby, you can group different crawlers.
 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
@@ -109,7 +106,6 @@ username = foo
 password = bar
 [crawl:something]
 aliases = [sth, some]
 type = some-complex-crawler
 auth = auth:example
 on_conflict = no-delete
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -95,13 +95,9 @@ class IliasPage:
    @staticmethod
    def is_root_page(soup: BeautifulSoup) -> bool:
-        permalink = soup.find(id="current_perma_link")
+        if permalink := IliasPage.get_soup_permalink(soup):
-        if permalink is None:
+            return "goto.php?target=root_" in permalink
        return False
        value = permalink.attrs.get("value")
        if value is None:
            return False
        return "goto.php?target=root_" in value
    def get_child_elements(self) -> List[IliasPageElement]:
        """
@@ -279,16 +275,14 @@ class IliasPage:
        return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
    def _is_content_page(self) -> bool:
-        link = self._soup.find(id="current_perma_link")
+        if link := self.get_permalink():
-        if not link:
+            return "target=copa_" in link
        return False
        return "target=copa_" in link.get("value")
    def _is_learning_module_page(self) -> bool:
-        link = self._soup.find(id="current_perma_link")
+        if link := self.get_permalink():
-        if not link:
+            return "target=pg_" in link
        return False
        return "target=pg_" in link.get("value")
    def _contains_collapsed_future_meetings(self) -> bool:
        return self._uncollapse_future_meetings_url() is not None
@@ -513,8 +507,8 @@ class IliasPage:
            modification_string = link.parent.parent.parent.select_one(
                f"td.std:nth-child({index})"
            ).getText().strip()
-            if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
+            if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
-                modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
+                modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
                break
        if modification_time is None:
@@ -613,7 +607,7 @@ class IliasPage:
            file_listings: List[Tag] = container.findAll(
                name="a",
                # download links contain the given command class
-                attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
+                attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
            )
            # Add each listing as a new
@@ -1095,6 +1089,9 @@ class IliasPage:
            return True
        return False
    def get_permalink(self) -> Optional[str]:
        return IliasPage.get_soup_permalink(self._soup)
    def _abs_url_from_link(self, link_tag: Tag) -> str:
        """
        Create an absolute url from an <a> tag.
@@ -1107,6 +1104,13 @@ class IliasPage:
        """
        return urljoin(self._page_url, relative_url)
    @staticmethod
    def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
        perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
        if not perma_link_element or not perma_link_element.get("href"):
            return None
        return perma_link_element.get("href")
 def _unexpected_html_warning() -> None:
    log.warn("Encountered unexpected HTML structure, ignoring element.")
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -130,6 +130,7 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
            raise CrawlError("Impossible return in ilias _iorepeat")
        return wrapper  # type: ignore
    return decorator
@@ -253,8 +254,8 @@ instance's greatest bottleneck.
                    soup = await self._get_page(next_stage_url, root_page_allowed=True)
                    if current_parent is None and expected_id is not None:
-                        perma_link_element: Tag = soup.find(id="current_perma_link")
+                        perma_link = IliasPage.get_soup_permalink(soup)
-                        if not perma_link_element or "crs_" not in perma_link_element.get("value"):
+                        if not perma_link or "crs_" not in perma_link:
                            raise CrawlError("Invalid course id? Didn't find anything looking like a course")
                    log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
@@ -674,12 +675,28 @@ instance's greatest bottleneck.
    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
        async def try_stream() -> bool:
-            async with self.session.get(url, allow_redirects=is_video) as resp:
+            next_url = url
            # Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
            # we can not match on the content type here. Instead, we disallow redirects and inspect the
            # new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
            # our authentication expired.
            if not is_video:
-                    # Redirect means we weren't authenticated
+                async with self.session.get(url, allow_redirects=False) as resp:
                    # Redirect to anything except a "sendfile" means we weren't authenticated
                    if hdrs.LOCATION in resp.headers:
                        if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
                            return False
-                # we wanted a video but got HTML
+                        # Directly follow the redirect to not make a second, unnecessary request
                        next_url = resp.headers[hdrs.LOCATION]
            # Let's try this again and follow redirects
            return await fetch_follow_redirects(next_url)
        async def fetch_follow_redirects(file_url: str) -> bool:
            async with self.session.get(file_url) as resp:
                # We wanted a video but got HTML => Forbidden, auth expired. Logging in won't really
                # solve that depending on the setup, but it is better than nothing.
                if is_video and "html" in resp.content_type:
                    return False
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional
 from rich.markup import escape
@@ -43,24 +43,16 @@ class Pferd:
        crawl_sections = [name for name, _ in config.crawl_sections()]
-        crawlers_to_run = set()  # With crawl: prefix
+        crawlers_to_run = []  # With crawl: prefix
        unknown_names = []  # Without crawl: prefix
        for name in cli_crawlers:
            section_name = f"crawl:{name}"
            if section_name in crawl_sections:
                log.explain(f"Crawler section named {section_name!r} exists")
-                crawlers_to_run.add(section_name)
+                crawlers_to_run.append(section_name)
-            # interprete name as alias of a crawler
+            else:
-            alias_names = self._find_crawlers_by_alias(name, config)
+                log.explain(f"There's no crawler section named {section_name!r}")
            if alias_names:
                crawlers_to_run.update(alias_names)
                log.explain_topic(f"Crawler alias {name!r} found corresponding crawler sections:")
                for alias_name in alias_names:
                    log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
            if not section_name in crawl_sections and not alias_names:
                log.explain(f"There's neither a crawler section named {section_name!r} nor does a crawler with alias {name!r} exist.")
                unknown_names.append(name)
        if unknown_names:
@@ -73,14 +65,6 @@ class Pferd:
        return crawlers_to_run
    def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
        alias_names = set()
        for (section_name, section) in config.crawl_sections():
            section_aliases = section.get("aliases", [])
            if alias in section_aliases:
                alias_names.add(section_name)
        return alias_names
    def _find_crawlers_to_run(
            self,
            config: Config,
--- a/PFERD/version.py
+++ b/PFERD/version.py
@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.5.0"
+VERSION = "3.5.1"
--- a/flake.lock
+++ b/flake.lock
@@ -2,16 +2,16 @@
  "nodes": {
    "nixpkgs": {
      "locked": {
-        "lastModified": 1694499547,
+        "lastModified": 1708979614,
-        "narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=",
+        "narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "e5f018cf150e29aac26c61dac0790ea023c46b24",
+        "rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
-        "ref": "nixos-23.05",
+        "ref": "nixos-23.11",
        "repo": "nixpkgs",
        "type": "github"
      }
--- a/flake.nix
+++ b/flake.nix
@@ -2,7 +2,7 @@
  description = "Tool for downloading course-related files from ILIAS";
  inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05";
+    nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
  };
  outputs = { self, nixpkgs }:
Author	SHA1	Message	Date
Joscha	da627ff929	Bump version to 3.5.1	2024-04-09 14:28:56 +02:00
I-Al-Istannen	c1b592ac29	Fix ILIAS 8 file downloads truncating to zero bytes	2024-04-08 17:59:41 +02:00
I-Al-Istannen	eb0c956d32	Add compatibility with ILIAS 8	2024-04-05 19:08:05 +02:00
TornaxO7	ab0cb2d956	nix: bump nixpkgs dependency	2024-02-27 23:39:53 +02:00
`@@ -1,2 +1,2 @@`
	`NAME = "PFERD"`	`NAME = "PFERD"`
	`VERSION = "3.5.0"`	`VERSION = "3.5.1"`