Print mtime before updating file metadata

2025-10-19 08:12:33 +02:00 · 2023-09-23 13:01:58 +02:00
5 changed files with 40 additions and 57 deletions
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -92,9 +92,6 @@ common to all crawlers:
  load for the crawl target. (Default: `0.0`)
 - `windows_paths`: Whether PFERD should find alternative names for paths that
  are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
- `aliases`: List of strings that are considered as an alias when invoking with
-  the `--crawler` or `-C` flag. If there is more than one crawl section with
-  the same aliases all are selected. Thereby, you can group different crawlers.

 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
@@ -109,7 +106,6 @@ username = foo
 password = bar

 [crawl:something]
-aliases = [sth, some]
 type = some-complex-crawler
 auth = auth:example
 on_conflict = no-delete
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -514,8 +514,11 @@ class IliasPage:
                f"td.std:nth-child({index})"
            ).getText().strip()
            if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
+                log.explain(f"Converting {modification_string!r}")
                modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
                break
+            else:
+                log.explain(f"Date has wrong format: {modification_string!r}")

        if modification_time is None:
            log.warn(f"Could not determine upload time for {link}")
@@ -1067,34 +1070,6 @@ class IliasPage:
        rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:])
        return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name

-    @staticmethod
-    def is_logged_in(soup: BeautifulSoup) -> bool:
-        # Normal ILIAS pages
-        mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
-        if mainbar is not None:
-            login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x})
-            shib_login = soup.find(id="button_shib_login")
-            return not login_button and not shib_login
-
-        # Personal Desktop
-        if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
-            return True
-
-        # Video listing embeds do not have complete ILIAS html. Try to match them by
-        # their video listing table
-        video_table = soup.find(
-            recursive=True,
-            name="table",
-            attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
-        )
-        if video_table is not None:
-            return True
-        # The individual video player wrapper page has nothing of the above.
-        # Match it by its playerContainer.
-        if soup.select_one("#playerContainer") is not None:
-            return True
-        return False
-
    def _abs_url_from_link(self, link_tag: Tag) -> str:
        """
        Create an absolute url from an <a> tag.
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -894,7 +894,7 @@ instance's greatest bottleneck.
        auth_id = await self._current_auth_id()
        async with self.session.get(url) as request:
            soup = soupify(await request.read())
-            if IliasPage.is_logged_in(soup):
+            if self._is_logged_in(soup):
                return self._verify_page(soup, url, root_page_allowed)

        # We weren't authenticated, so try to do that
@@ -903,12 +903,11 @@ instance's greatest bottleneck.
        # Retry once after authenticating. If this fails, we will die.
        async with self.session.get(url) as request:
            soup = soupify(await request.read())
-            if IliasPage.is_logged_in(soup):
+            if self._is_logged_in(soup):
                return self._verify_page(soup, url, root_page_allowed)
        raise CrawlError(f"get_page failed even after authenticating on {url!r}")

-    @staticmethod
-    def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
+    def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
        if IliasPage.is_root_page(soup) and not root_page_allowed:
            raise CrawlError(
                "Unexpectedly encountered ILIAS root page. "
@@ -966,6 +965,34 @@ instance's greatest bottleneck.
    async def _authenticate(self) -> None:
        await self._shibboleth_login.login(self.session)

+    @staticmethod
+    def _is_logged_in(soup: BeautifulSoup) -> bool:
+        # Normal ILIAS pages
+        mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
+        if mainbar is not None:
+            login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x})
+            shib_login = soup.find(id="button_shib_login")
+            return not login_button and not shib_login
+
+        # Personal Desktop
+        if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
+            return True
+
+        # Video listing embeds do not have complete ILIAS html. Try to match them by
+        # their video listing table
+        video_table = soup.find(
+            recursive=True,
+            name="table",
+            attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
+        )
+        if video_table is not None:
+            return True
+        # The individual video player wrapper page has nothing of the above.
+        # Match it by its playerContainer.
+        if soup.select_one("#playerContainer") is not None:
+            return True
+        return False
+

 class KitShibbolethLogin:
    """
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -415,6 +415,7 @@ class OutputDirectory:

    def _update_metadata(self, info: DownloadInfo) -> None:
        if mtime := info.heuristics.mtime:
+            log.explain(f"Setting mtime to {mtime}")
            mtimestamp = mtime.timestamp()
            os.utime(info.local_path, times=(mtimestamp, mtimestamp))

--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional

 from rich.markup import escape

@@ -43,24 +43,16 @@ class Pferd:

        crawl_sections = [name for name, _ in config.crawl_sections()]

-        crawlers_to_run = set()  # With crawl: prefix
+        crawlers_to_run = []  # With crawl: prefix
        unknown_names = []  # Without crawl: prefix

        for name in cli_crawlers:
            section_name = f"crawl:{name}"
            if section_name in crawl_sections:
                log.explain(f"Crawler section named {section_name!r} exists")
-                crawlers_to_run.add(section_name)
-            # interprete name as alias of a crawler
-            alias_names = self._find_crawlers_by_alias(name, config)
-            if alias_names:
-                crawlers_to_run.update(alias_names)
-                log.explain_topic(f"Crawler alias {name!r} found corresponding crawler sections:")
-                for alias_name in alias_names:
-                    log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
-
-            if not section_name in crawl_sections and not alias_names:
-                log.explain(f"There's neither a crawler section named {section_name!r} nor does a crawler with alias {name!r} exist.")
+                crawlers_to_run.append(section_name)
+            else:
+                log.explain(f"There's no crawler section named {section_name!r}")
                unknown_names.append(name)

        if unknown_names:
@@ -73,14 +65,6 @@ class Pferd:

        return crawlers_to_run

-    def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
-        alias_names = set()
-        for (section_name, section) in config.crawl_sections():
-            section_aliases = section.get("aliases", [])
-            if alias in section_aliases:
-                alias_names.add(section_name)
-        return alias_names
-
    def _find_crawlers_to_run(
            self,
            config: Config,