Compare commits

...

4 Commits

Author          SHA1         Message                                       Date
Julius Rüberg   b1d42f8b70   Merge 77c1f1516c into 266812f90e              2023-11-16 05:51:50 -07:00
I-Al-Istannen   266812f90e   Move is_logged_in helper to kit_ilias_html   2023-11-16 11:19:20 +01:00
Julius Rüberg   77c1f1516c   Used proper plural                            2021-11-02 12:41:40 +01:00
Julius Rüberg   9e12e96d90   Added alias functionality                     2021-11-02 03:42:08 +01:00
4 changed files with 57 additions and 36 deletions

CONFIG.md

@@ -92,6 +92,9 @@ common to all crawlers:
   load for the crawl target. (Default: `0.0`)
 - `windows_paths`: Whether PFERD should find alternative names for paths that
   are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
+- `aliases`: List of strings that are treated as aliases when invoking PFERD
+  with the `--crawler` or `-C` flag. If multiple crawl sections share an
+  alias, all of them are selected, which lets you group related crawlers.
 
 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
@@ -106,6 +109,7 @@ username = foo
 password = bar
 
 [crawl:something]
+aliases = [sth, some]
 type = some-complex-crawler
 auth = auth:example
 on_conflict = no-delete
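
The `aliases` option documented above makes grouping straightforward: give several crawl sections a common alias and select them all with a single flag. A small illustrative config (section names, types and aliases here are made up for this sketch, not taken from the diff):

[crawl:lecture-videos]
type = some-complex-crawler
aliases = [videos, media]

[crawl:lecture-slides]
type = some-complex-crawler
aliases = [media]

With this config, `pferd --crawler media` would select both sections, while `pferd --crawler videos` would select only `crawl:lecture-videos`.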

kit_ilias_html.py

@@ -1067,6 +1067,34 @@ class IliasPage:
         rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:])
         return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name
 
+    @staticmethod
+    def is_logged_in(soup: BeautifulSoup) -> bool:
+        # Normal ILIAS pages
+        mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
+        if mainbar is not None:
+            login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x})
+            shib_login = soup.find(id="button_shib_login")
+            return not login_button and not shib_login
+
+        # Personal Desktop
+        if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
+            return True
+
+        # Video listing embeds do not have complete ILIAS html. Try to match them by
+        # their video listing table
+        video_table = soup.find(
+            recursive=True,
+            name="table",
+            attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
+        )
+        if video_table is not None:
+            return True
+
+        # The individual video player wrapper page has nothing of the above.
+        # Match it by its playerContainer.
+        if soup.select_one("#playerContainer") is not None:
+            return True
+
+        return False
+
     def _abs_url_from_link(self, link_tag: Tag) -> str:
         """
         Create an absolute url from an <a> tag.
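
A minimal sketch of exercising the new helper on raw HTML. The import path is an assumption based on the commit message ("Move is_logged_in helper to kit_ilias_html"), and the HTML snippets are made up to trigger the metabar heuristic:

from bs4 import BeautifulSoup

from PFERD.crawl.ilias.kit_ilias_html import IliasPage  # assumed import path

# A metabar without a login.php link (and no Shibboleth button) counts as logged in.
logged_in = BeautifulSoup(
    '<div class="il-maincontrols-metabar"><a href="logout.php">Log out</a></div>',
    "html.parser",
)
# A metabar that still offers a login.php link does not.
logged_out = BeautifulSoup(
    '<div class="il-maincontrols-metabar"><a href="login.php">Log in</a></div>',
    "html.parser",
)

assert IliasPage.is_logged_in(logged_in)
assert not IliasPage.is_logged_in(logged_out)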

kit_ilias_web_crawler.py

@@ -894,7 +894,7 @@ instance's greatest bottleneck.
         auth_id = await self._current_auth_id()
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
-            if self._is_logged_in(soup):
+            if IliasPage.is_logged_in(soup):
                 return self._verify_page(soup, url, root_page_allowed)
 
         # We weren't authenticated, so try to do that
@@ -903,11 +903,12 @@ instance's greatest bottleneck.
         # Retry once after authenticating. If this fails, we will die.
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
-            if self._is_logged_in(soup):
+            if IliasPage.is_logged_in(soup):
                 return self._verify_page(soup, url, root_page_allowed)
         raise CrawlError(f"get_page failed even after authenticating on {url!r}")
 
-    def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
+    @staticmethod
+    def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
         if IliasPage.is_root_page(soup) and not root_page_allowed:
             raise CrawlError(
                 "Unexpectedly encountered ILIAS root page. "
@@ -965,34 +966,6 @@ instance's greatest bottleneck.
     async def _authenticate(self) -> None:
         await self._shibboleth_login.login(self.session)
 
-    @staticmethod
-    def _is_logged_in(soup: BeautifulSoup) -> bool:
-        # Normal ILIAS pages
-        mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
-        if mainbar is not None:
-            login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x})
-            shib_login = soup.find(id="button_shib_login")
-            return not login_button and not shib_login
-
-        # Personal Desktop
-        if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
-            return True
-
-        # Video listing embeds do not have complete ILIAS html. Try to match them by
-        # their video listing table
-        video_table = soup.find(
-            recursive=True,
-            name="table",
-            attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
-        )
-        if video_table is not None:
-            return True
-
-        # The individual video player wrapper page has nothing of the above.
-        # Match it by its playerContainer.
-        if soup.select_one("#playerContainer") is not None:
-            return True
-
-        return False
-
 class KitShibbolethLogin:
     """

pferd.py

@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Set
 
 from rich.markup import escape
@@ -43,16 +43,24 @@ class Pferd:
         crawl_sections = [name for name, _ in config.crawl_sections()]
 
-        crawlers_to_run = []  # With crawl: prefix
+        crawlers_to_run = set()  # With crawl: prefix
         unknown_names = []  # Without crawl: prefix
 
         for name in cli_crawlers:
             section_name = f"crawl:{name}"
             if section_name in crawl_sections:
                 log.explain(f"Crawler section named {section_name!r} exists")
-                crawlers_to_run.append(section_name)
-            else:
-                log.explain(f"There's no crawler section named {section_name!r}")
+                crawlers_to_run.add(section_name)
+
+            # Interpret the name as an alias of a crawler
+            alias_names = self._find_crawlers_by_alias(name, config)
+            if alias_names:
+                crawlers_to_run.update(alias_names)
+                log.explain_topic(f"Crawler alias {name!r} found corresponding crawler sections:")
+                for alias_name in alias_names:
+                    log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
+
+            if section_name not in crawl_sections and not alias_names:
+                log.explain(f"There's neither a crawler section named {section_name!r} nor a crawler with alias {name!r}")
                 unknown_names.append(name)
 
         if unknown_names:
@@ -65,6 +73,14 @@ class Pferd:
 
         return crawlers_to_run
 
+    def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
+        alias_names = set()
+        for (section_name, section) in config.crawl_sections():
+            section_aliases = section.get("aliases", [])
+            if alias in section_aliases:
+                alias_names.add(section_name)
+        return alias_names
+
     def _find_crawlers_to_run(
         self,
         config: Config,
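
A self-contained sketch of the alias lookup, with a plain dict standing in for PFERD's Config (section names and aliases are made up):

from typing import Dict, List, Set

def find_crawlers_by_alias(alias: str, sections: Dict[str, List[str]]) -> Set[str]:
    # Mirrors Pferd._find_crawlers_by_alias: select every crawl section
    # whose alias list contains the requested name.
    return {name for name, aliases in sections.items() if alias in aliases}

sections = {
    "crawl:lecture-videos": ["videos", "media"],
    "crawl:lecture-slides": ["media"],
}
print(find_crawlers_by_alias("media", sections))
# -> {'crawl:lecture-videos', 'crawl:lecture-slides'}

Because crawlers_to_run is now a set, a name that matches both a crawl section and an alias is still only scheduled once.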