Merge 77c1f1516c into a117126389

Used proper plural
Added alias functionality
2026-01-10 06:22:30 +01:00 · 2023-12-09 22:34:12 -07:00 · 2021-11-02 12:41:40 +01:00 · 2021-11-02 03:42:08 +01:00
8 changed files with 70 additions and 94 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,17 +22,6 @@ ambiguous situations.

 ## Unreleased

-## 3.5.2 - 2024-04-14
-
-### Fixed
- Crawling of personal desktop with ILIAS 8
- Crawling of empty personal desktops
-
-## 3.5.1 - 2024-04-09
-
-### Added
- Support for ILIAS 8
-
 ### Fixed
 - Video name deduplication

--- a/CONFIG.md
+++ b/CONFIG.md
@@ -92,6 +92,9 @@ common to all crawlers:
  load for the crawl target. (Default: `0.0`)
 - `windows_paths`: Whether PFERD should find alternative names for paths that
  are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
+- `aliases`: List of alternative names for this crawler that can be passed to
+  the `--crawler` or `-C` flag. If several crawl sections share an alias, all
+  of them are selected, so an alias can be used to group crawlers.

 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
@@ -106,6 +109,7 @@ username = foo
 password = bar

 [crawl:something]
+aliases = [sth, some]
 type = some-complex-crawler
 auth = auth:example
 on_conflict = no-delete
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -95,9 +95,13 @@ class IliasPage:

    @staticmethod
    def is_root_page(soup: BeautifulSoup) -> bool:
-        if permalink := IliasPage.get_soup_permalink(soup):
-            return "goto.php?target=root_" in permalink
+        permalink = soup.find(id="current_perma_link")
+        if permalink is None:
            return False
+        value = permalink.attrs.get("value")
+        if value is None:
+            return False
+        return "goto.php?target=root_" in value

    def get_child_elements(self) -> List[IliasPageElement]:
        """
@@ -275,14 +279,16 @@ class IliasPage:
        return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})

    def _is_content_page(self) -> bool:
-        if link := self.get_permalink():
-            return "target=copa_" in link
+        link = self._soup.find(id="current_perma_link")
+        if not link:
            return False
+        return "target=copa_" in link.get("value")

    def _is_learning_module_page(self) -> bool:
-        if link := self.get_permalink():
-            return "target=pg_" in link
+        link = self._soup.find(id="current_perma_link")
+        if not link:
            return False
+        return "target=pg_" in link.get("value")

    def _contains_collapsed_future_meetings(self) -> bool:
        return self._uncollapse_future_meetings_url() is not None
@@ -378,10 +384,6 @@ class IliasPage:
            name = _sanitize_path_name(link.text.strip())
            url = self._abs_url_from_link(link)

-            if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
-                # Configure button/link does not have anything interesting
-                continue
-
            type = self._find_type_from_link(name, link, url)
            if not type:
                _unexpected_html_warning()
@@ -511,8 +513,8 @@ class IliasPage:
            modification_string = link.parent.parent.parent.select_one(
                f"td.std:nth-child({index})"
            ).getText().strip()
-            if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
-                modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
+            if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
+                modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
                break

        if modification_time is None:
@@ -611,7 +613,7 @@ class IliasPage:
            file_listings: List[Tag] = container.findAll(
                name="a",
                # download links contain the given command class
-                attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
+                attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
            )

            # Add each listing as a new
@@ -1078,14 +1080,6 @@ class IliasPage:
        if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
            return True

-        # Empty personal desktop has zero (0) markers. Match on the text...
-        if alert := soup.select_one(".alert-info"):
-            text = alert.getText().lower()
-            if "you have not yet selected any favourites" in text:
-                return True
-            if "sie haben aktuell noch keine favoriten ausgewählt" in text:
-                return True
-
        # Video listing embeds do not have complete ILIAS html. Try to match them by
        # their video listing table
        video_table = soup.find(
@@ -1101,9 +1095,6 @@ class IliasPage:
            return True
        return False

-    def get_permalink(self) -> Optional[str]:
-        return IliasPage.get_soup_permalink(self._soup)
-
    def _abs_url_from_link(self, link_tag: Tag) -> str:
        """
        Create an absolute url from an <a> tag.
@@ -1116,13 +1107,6 @@ class IliasPage:
        """
        return urljoin(self._page_url, relative_url)

-    @staticmethod
-    def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
-        perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
-        if not perma_link_element or not perma_link_element.get("href"):
-            return None
-        return perma_link_element.get("href")
-

 def _unexpected_html_warning() -> None:
    log.warn("Encountered unexpected HTML structure, ignoring element.")
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -130,7 +130,6 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
            raise CrawlError("Impossible return in ilias _iorepeat")

        return wrapper  # type: ignore
-
    return decorator


@@ -228,7 +227,7 @@ instance's greatest bottleneck.
        await self._crawl_url(root_url, expected_id=course_id)

    async def _crawl_desktop(self) -> None:
-        appendix = r"ILIAS\Repository\Provider\RepositoryMainBarProvider|mm_pd_sel_items"
+        appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items"
        appendix = appendix.encode("ASCII").hex()
        await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix)

@@ -254,8 +253,8 @@ instance's greatest bottleneck.
                    soup = await self._get_page(next_stage_url, root_page_allowed=True)

                    if current_parent is None and expected_id is not None:
-                        perma_link = IliasPage.get_soup_permalink(soup)
-                        if not perma_link or "crs_" not in perma_link:
+                        perma_link_element: Tag = soup.find(id="current_perma_link")
+                        if not perma_link_element or "crs_" not in perma_link_element.get("value"):
                            raise CrawlError("Invalid course id? Didn't find anything looking like a course")

                    log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
@@ -675,28 +674,12 @@ instance's greatest bottleneck.

    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
        async def try_stream() -> bool:
-            next_url = url
-
-            # Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
-            # we can not match on the content type here. Instead, we disallow redirects and inspect the
-            # new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
-            # our authentication expired.
+            async with self.session.get(url, allow_redirects=is_video) as resp:
                if not is_video:
-                async with self.session.get(url, allow_redirects=False) as resp:
-                    # Redirect to anything except a "sendfile" means we weren't authenticated
+                    # Redirect means we weren't authenticated
                    if hdrs.LOCATION in resp.headers:
-                        if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
                        return False
-                        # Directly follow the redirect to not make a second, unnecessary request
-                        next_url = resp.headers[hdrs.LOCATION]
-
-            # Let's try this again and follow redirects
-            return await fetch_follow_redirects(next_url)
-
-        async def fetch_follow_redirects(file_url: str) -> bool:
-            async with self.session.get(file_url) as resp:
-                # We wanted a video but got HTML => Forbidden, auth expired. Logging in won't really
-                # solve that depending on the setup, but it is better than nothing.
+                # we wanted a video but got HTML
                if is_video and "html" in resp.content_type:
                    return False

--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Set

 from rich.markup import escape

@@ -43,16 +43,24 @@ class Pferd:

        crawl_sections = [name for name, _ in config.crawl_sections()]

-        crawlers_to_run = []  # With crawl: prefix
+        crawlers_to_run = set()  # With crawl: prefix
        unknown_names = []  # Without crawl: prefix

        for name in cli_crawlers:
            section_name = f"crawl:{name}"
            if section_name in crawl_sections:
                log.explain(f"Crawler section named {section_name!r} exists")
-                crawlers_to_run.append(section_name)
-            else:
-                log.explain(f"There's no crawler section named {section_name!r}")
+                crawlers_to_run.add(section_name)
+            # Interpret the name as an alias of a crawler
+            alias_names = self._find_crawlers_by_alias(name, config)
+            if alias_names:
+                crawlers_to_run.update(alias_names)
+                log.explain_topic(f"Crawler alias {name!r} found corresponding crawler sections:")
+                for alias_name in alias_names:
+                    log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
+
+            if not section_name in crawl_sections and not alias_names:
+                log.explain(f"There's neither a crawler section named {section_name!r} nor does a crawler with alias {name!r} exist.")
                unknown_names.append(name)

        if unknown_names:
@@ -65,6 +73,14 @@ class Pferd:

        return crawlers_to_run

+    def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
+        alias_names = set()
+        for (section_name, section) in config.crawl_sections():
+            section_aliases = section.get("aliases", [])
+            if alias in section_aliases:
+                alias_names.add(section_name)
+        return alias_names
+
    def _find_crawlers_to_run(
            self,
            config: Config,
--- a/PFERD/version.py
+++ b/PFERD/version.py
@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.5.2"
+VERSION = "3.5.0"
--- a/flake.lock
+++ b/flake.lock
@@ -2,16 +2,16 @@
  "nodes": {
    "nixpkgs": {
      "locked": {
-        "lastModified": 1708979614,
-        "narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
+        "lastModified": 1694499547,
+        "narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
+        "rev": "e5f018cf150e29aac26c61dac0790ea023c46b24",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
-        "ref": "nixos-23.11",
+        "ref": "nixos-23.05",
        "repo": "nixpkgs",
        "type": "github"
      }
--- a/flake.nix
+++ b/flake.nix
@@ -2,7 +2,7 @@
  description = "Tool for downloading course-related files from ILIAS";

  inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
+    nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05";
  };

  outputs = { self, nixpkgs }:
Author	SHA1	Message	Date
Julius Rüberg	42098dc3a5	Merge `77c1f1516c` into `a117126389`	2023-12-09 22:34:12 -07:00
Julius Rüberg	77c1f1516c	Used proper plural	2021-11-02 12:41:40 +01:00
Julius Rüberg	9e12e96d90	Added alias functionality	2021-11-02 03:42:08 +01:00