Compare commits


8 Commits

Author         SHA1        Message                                              Date
Joscha         eb01aa86cb  Bump version to 3.5.2                                2024-04-14 12:10:17 +02:00
I-Al-Istannen  3db186a978  Fix personal desktop crawling HTML warnings          2024-04-10 11:15:25 +02:00
I-Al-Istannen  4a5959fd58  Fix personal desktop crawling without favorites      2024-04-10 11:15:25 +02:00
I-Al-Istannen  1cbc2b717a  Fix personal desktop crawling with ILIAS 8           2024-04-10 01:20:37 +02:00
Joscha         da627ff929  Bump version to 3.5.1                                2024-04-09 14:28:56 +02:00
I-Al-Istannen  c1b592ac29  Fix ILIAS 8 file downloads truncating to zero bytes  2024-04-08 17:59:41 +02:00
I-Al-Istannen  eb0c956d32  Add compatibility with ILIAS 8                       2024-04-05 19:08:05 +02:00
TornaxO7       ab0cb2d956  nix: bump nixpgs dependency                          2024-02-27 23:39:53 +01:00
8 changed files with 94 additions and 70 deletions

CHANGELOG.md

@@ -22,6 +22,17 @@ ambiguous situations.
 ## Unreleased
 
+## 3.5.2 - 2024-04-14
+
+### Fixed
+- Crawling of personal desktop with ILIAS 8
+- Crawling of empty personal desktops
+
+## 3.5.1 - 2024-04-09
+
+### Added
+- Support for ILIAS 8
+
 ### Fixed
 - Video name deduplication

CONFIG.md

@@ -92,9 +92,6 @@ common to all crawlers:
   load for the crawl target. (Default: `0.0`)
 - `windows_paths`: Whether PFERD should find alternative names for paths that
   are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
-- `aliases`: List of strings that are considered as an alias when invoking with
-  the `--crawler` or `-C` flag. If there is more than one crawl section with
-  the same aliases all are selected. Thereby, you can group different crawlers.
 
 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
@@ -109,7 +106,6 @@ username = foo
 password = bar
 
 [crawl:something]
-aliases = [sth, some]
 type = some-complex-crawler
 auth = auth:example
 on_conflict = no-delete
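
For context, a minimal sketch of how a config like the example above is consumed, assuming plain configparser semantics (PFERD's Config class wraps Python's configparser); the `type = simple` line in the auth section is illustrative, not part of the diff:

import configparser

config = configparser.ConfigParser()
config.read_string("""
[auth:example]
type = simple
username = foo
password = bar

[crawl:something]
type = some-complex-crawler
auth = auth:example
on_conflict = no-delete
""")

crawler = config["crawl:something"]
# The auth option names another section, which holds the credentials
auth_section = config[crawler["auth"]]
print(auth_section["username"])  # -> foo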

PFERD/crawl/ilias/kit_ilias_html.py

@@ -17,7 +17,7 @@ TargetType = Union[str, int]
 class IliasElementType(Enum):
     EXERCISE = "exercise"
     EXERCISE_FILES = "exercise_files"  # own submitted files
-    TEST = "test"      # an online test. Will be ignored currently.
+    TEST = "test"  # an online test. Will be ignored currently.
     FILE = "file"
     FOLDER = "folder"
     FORUM = "forum"
@@ -95,13 +95,9 @@ class IliasPage:
     @staticmethod
     def is_root_page(soup: BeautifulSoup) -> bool:
-        permalink = soup.find(id="current_perma_link")
-        if permalink is None:
-            return False
-        value = permalink.attrs.get("value")
-        if value is None:
-            return False
-        return "goto.php?target=root_" in value
+        if permalink := IliasPage.get_soup_permalink(soup):
+            return "goto.php?target=root_" in permalink
+        return False
 
     def get_child_elements(self) -> List[IliasPageElement]:
         """
@@ -279,16 +275,14 @@ class IliasPage:
         return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
 
     def _is_content_page(self) -> bool:
-        link = self._soup.find(id="current_perma_link")
-        if not link:
-            return False
-        return "target=copa_" in link.get("value")
+        if link := self.get_permalink():
+            return "target=copa_" in link
+        return False
 
     def _is_learning_module_page(self) -> bool:
-        link = self._soup.find(id="current_perma_link")
-        if not link:
-            return False
-        return "target=pg_" in link.get("value")
+        if link := self.get_permalink():
+            return "target=pg_" in link
+        return False
 
     def _contains_collapsed_future_meetings(self) -> bool:
         return self._uncollapse_future_meetings_url() is not None
@@ -384,6 +378,10 @@ class IliasPage:
             name = _sanitize_path_name(link.text.strip())
             url = self._abs_url_from_link(link)
 
+            if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
+                # Configure button/link does not have anything interesting
+                continue
+
             type = self._find_type_from_link(name, link, url)
             if not type:
                 _unexpected_html_warning()
@@ -513,8 +511,8 @@ class IliasPage:
             modification_string = link.parent.parent.parent.select_one(
                 f"td.std:nth-child({index})"
             ).getText().strip()
-            if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
-                modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
+            if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
+                modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
                 break
 
         if modification_time is None:
@@ -613,7 +611,7 @@ class IliasPage:
         file_listings: List[Tag] = container.findAll(
             name="a",
             # download links contain the given command class
-            attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
+            attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
         )
 
         # Add each listing as a new
@@ -917,9 +915,9 @@ class IliasPage:
     @staticmethod
     def _find_type_from_link(
-        element_name: str,
-        link_element: Tag,
-        url: str
+            element_name: str,
+            link_element: Tag,
+            url: str
     ) -> Optional[IliasElementType]:
         """
         Decides which sub crawler to use for a given top level element.
@@ -1080,6 +1078,14 @@ class IliasPage:
         if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
             return True
 
+        # Empty personal desktop has zero (0) markers. Match on the text...
+        if alert := soup.select_one(".alert-info"):
+            text = alert.getText().lower()
+            if "you have not yet selected any favourites" in text:
+                return True
+            if "sie haben aktuell noch keine favoriten ausgewählt" in text:
+                return True
+
         # Video listing embeds do not have complete ILIAS html. Try to match them by
         # their video listing table
         video_table = soup.find(
@@ -1095,6 +1101,9 @@
             return True
 
         return False
 
+    def get_permalink(self) -> Optional[str]:
+        return IliasPage.get_soup_permalink(self._soup)
+
     def _abs_url_from_link(self, link_tag: Tag) -> str:
         """
         Create an absolute url from an <a> tag.
@@ -1107,6 +1116,13 @@
         """
         return urljoin(self._page_url, relative_url)
 
+    @staticmethod
+    def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
+        perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
+        if not perma_link_element or not perma_link_element.get("href"):
+            return None
+        return perma_link_element.get("href")
+
 
 def _unexpected_html_warning() -> None:
     log.warn("Encountered unexpected HTML structure, ignoring element.")
@@ -1130,7 +1146,7 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[datetime]:
     date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I)
     date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I)
-    date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
+    date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
     date_str = date_str.strip()
 
     for german, english in zip(german_months, english_months):
         date_str = date_str.replace(german, english)
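
The recurring permalink changes above share one theme: ILIAS 8 no longer renders the `current_perma_link` input, so pages are now identified via the footer's permanent URL. A minimal standalone sketch of that lookup; the HTML fragment is illustrative, not verbatim ILIAS markup:

from typing import Optional

from bs4 import BeautifulSoup

FOOTER_HTML = """
<div class="il-footer-permanent-url">
  <a href="https://ilias.example.edu/goto.php?target=root_1">Permanent link</a>
</div>
"""

def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
    # Same selector as the new IliasPage.get_soup_permalink helper
    element = soup.select_one(".il-footer-permanent-url > a")
    if not element or not element.get("href"):
        return None
    return element.get("href")

soup = BeautifulSoup(FOOTER_HTML, "html.parser")
permalink = get_soup_permalink(soup)
assert permalink == "https://ilias.example.edu/goto.php?target=root_1"
# Page-type checks such as is_root_page then reduce to substring tests:
assert "goto.php?target=root_" in permalink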

PFERD/crawl/ilias/kit_ilias_web_crawler.py

@@ -130,6 +130,7 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable
                 raise CrawlError("Impossible return in ilias _iorepeat")
 
         return wrapper  # type: ignore
+
     return decorator
@@ -177,11 +178,11 @@ def _get_video_cache_key(element: IliasPageElement) -> str:
 
 class KitIliasWebCrawler(HttpCrawler):
     def __init__(
-        self,
-        name: str,
-        section: KitIliasWebCrawlerSection,
-        config: Config,
-        authenticators: Dict[str, Authenticator]
+            self,
+            name: str,
+            section: KitIliasWebCrawlerSection,
+            config: Config,
+            authenticators: Dict[str, Authenticator]
     ):
         # Setting a main authenticator for cookie sharing
         auth = section.auth(authenticators)
@@ -227,7 +228,7 @@ instance's greatest bottleneck.
         await self._crawl_url(root_url, expected_id=course_id)
 
     async def _crawl_desktop(self) -> None:
-        appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items"
+        appendix = r"ILIAS\Repository\Provider\RepositoryMainBarProvider|mm_pd_sel_items"
         appendix = appendix.encode("ASCII").hex()
         await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix)
@@ -253,8 +254,8 @@ instance's greatest bottleneck.
         soup = await self._get_page(next_stage_url, root_page_allowed=True)
 
         if current_parent is None and expected_id is not None:
-            perma_link_element: Tag = soup.find(id="current_perma_link")
-            if not perma_link_element or "crs_" not in perma_link_element.get("value"):
+            perma_link = IliasPage.get_soup_permalink(soup)
+            if not perma_link or "crs_" not in perma_link:
                 raise CrawlError("Invalid course id? Didn't find anything looking like a course")
 
         log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
@@ -674,12 +675,28 @@ instance's greatest bottleneck.
     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
         async def try_stream() -> bool:
-            async with self.session.get(url, allow_redirects=is_video) as resp:
-                if not is_video:
-                    # Redirect means we weren't authenticated
+            next_url = url
+
+            # Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
+            # we can not match on the content type here. Instead, we disallow redirects and inspect the
+            # new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
+            # our authentication expired.
+            if not is_video:
+                async with self.session.get(url, allow_redirects=False) as resp:
+                    # Redirect to anything except a "sendfile" means we weren't authenticated
                     if hdrs.LOCATION in resp.headers:
-                        return False
-                # we wanted a video but got HTML
+                        if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
+                            return False
+                        # Directly follow the redirect to not make a second, unnecessary request
+                        next_url = resp.headers[hdrs.LOCATION]
+
+            # Let's try this again and follow redirects
+            return await fetch_follow_redirects(next_url)
+
+        async def fetch_follow_redirects(file_url: str) -> bool:
+            async with self.session.get(file_url) as resp:
+                # We wanted a video but got HTML => Forbidden, auth expired. Logging in won't really
+                # solve that depending on the setup, but it is better than nothing.
                 if is_video and "html" in resp.content_type:
                     return False
@@ -1052,9 +1069,9 @@ class KitShibbolethLogin:
         await sess.post(url, data=data)
 
     async def _authenticate_tfa(
-        self,
-        session: aiohttp.ClientSession,
-        soup: BeautifulSoup
+            self,
+            session: aiohttp.ClientSession,
+            soup: BeautifulSoup
     ) -> BeautifulSoup:
         if not self._tfa_auth:
             self._tfa_auth = TfaAuthenticator("ilias-anon-tfa")
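
The zero-byte download fix above hinges on one observation: when unauthenticated, ILIAS redirects file URLs to the magazine page, while authenticated ILIAS 8 downloads redirect to a "sendfile" command. A minimal standalone sketch of that probe with aiohttp; probe_file_url and the URL below are illustrative, not PFERD's API:

import aiohttp
from aiohttp import hdrs

async def probe_file_url(session: aiohttp.ClientSession, url: str) -> bool:
    """Return True if the current session may download the file at url."""
    async with session.get(url, allow_redirects=False) as resp:
        location = resp.headers.get(hdrs.LOCATION)
        if location is None:
            # No redirect: the file is served directly
            return True
        # A redirect anywhere but the ILIAS 8 "sendfile" command means
        # our authentication expired
        return "&cmd=sendfile" in location

# Usage (illustrative):
#     async with aiohttp.ClientSession() as session:
#         ok = await probe_file_url(session, "https://ilias.example.edu/goto.php?target=file_123_download")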

PFERD/pferd.py

@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional
 
 from rich.markup import escape
@@ -43,24 +43,16 @@ class Pferd:
         crawl_sections = [name for name, _ in config.crawl_sections()]
 
-        crawlers_to_run = set()  # With crawl: prefix
+        crawlers_to_run = []  # With crawl: prefix
         unknown_names = []  # Without crawl: prefix
 
         for name in cli_crawlers:
             section_name = f"crawl:{name}"
             if section_name in crawl_sections:
                 log.explain(f"Crawler section named {section_name!r} exists")
-                crawlers_to_run.add(section_name)
-            # interprete name as alias of a crawler
-            alias_names = self._find_crawlers_by_alias(name, config)
-            if alias_names:
-                crawlers_to_run.update(alias_names)
-                log.explain_topic(f"Crawler alias {name!r} found corresponding crawler sections:")
-                for alias_name in alias_names:
-                    log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
-
-            if not section_name in crawl_sections and not alias_names:
-                log.explain(f"There's neither a crawler section named {section_name!r} nor does a crawler with alias {name!r} exist.")
+                crawlers_to_run.append(section_name)
+            else:
+                log.explain(f"There's no crawler section named {section_name!r}")
                 unknown_names.append(name)
 
         if unknown_names:
@@ -73,14 +65,6 @@ class Pferd:
 
         return crawlers_to_run
 
-    def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
-        alias_names = set()
-        for (section_name, section) in config.crawl_sections():
-            section_aliases = section.get("aliases", [])
-            if alias in section_aliases:
-                alias_names.add(section_name)
-        return alias_names
-
     def _find_crawlers_to_run(
             self,
             config: Config,

PFERD/version.py

@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.5.0"
+VERSION = "3.5.2"

flake.lock (generated)

@@ -2,16 +2,16 @@
   "nodes": {
     "nixpkgs": {
       "locked": {
-        "lastModified": 1694499547,
-        "narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=",
+        "lastModified": 1708979614,
+        "narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e5f018cf150e29aac26c61dac0790ea023c46b24",
+        "rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
         "type": "github"
       },
       "original": {
         "owner": "NixOS",
-        "ref": "nixos-23.05",
+        "ref": "nixos-23.11",
         "repo": "nixpkgs",
         "type": "github"
       }

flake.nix

@@ -2,7 +2,7 @@
   description = "Tool for downloading course-related files from ILIAS";
 
   inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05";
+    nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
   };
 
   outputs = { self, nixpkgs }: