Print mtime before updating file metadata

Bump version to 3.5.0
Update flake.lock
2025-12-19 13:12:29 +01:00 · 2023-09-23 13:01:58 +02:00 · 2023-09-13 23:13:30 +02:00 · 2023-09-13 22:23:36 +02:00 · 2023-09-13 22:23:36 +02:00 · 2023-08-29 13:51:19 +02:00
8 changed files with 105 additions and 24 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 /PFERD.egg-info/
 __pycache__/
 /.vscode/
+/.idea/

 # pyinstaller
 /pferd.spec
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,20 +22,28 @@ ambiguous situations.

 ## Unreleased

+## 3.5.0 - 2023-09-13
+
+### Added
+- `no-delete-prompt-override` conflict resolution strategy
+- Support for ILIAS learning modules
+- `show_not_deleted` option to stop printing the "Not Deleted" status or report
+  message. This combines nicely with the `no-delete-prompt-override` strategy,
+  causing PFERD to mostly ignore local-only files.
+- Support for mediacast video listings
+- Crawling of files in info tab
+
+### Changed
+- Remove size suffix for files in content pages
+
 ### Fixed
 - Crawling of courses with the timeline view as the default tab
 - Crawling of file and custom opencast cards
 - Crawling of button cards without descriptions
 - Abort crawling when encountering an unexpected ILIAS root page redirect
- Remove size suffix for files in content pages
-
-### Added
- `no-delete-prompt-override` conflict resolution strategy
- support for ILIAS learning modules
- `show_not_deleted` option to stop printing the "Not Deleted" status or report
-  message. This combines nicely with the `no-delete-prompt-override` strategy,
-  causing PFERD to mostly ignore local-only files.
- support for mediacast video listings
+- Sanitize ASCII control characters on Windows
+- Crawling of paginated past meetings
+- Ignore SCORM learning modules

 ## 3.4.3 - 2022-11-29

--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -22,10 +22,12 @@ class IliasElementType(Enum):
    FOLDER = "folder"
    FORUM = "forum"
    LINK = "link"
+    INFO_TAB = "info_tab"
    LEARNING_MODULE = "learning_module"
    BOOKING = "booking"
    MEETING = "meeting"
    SURVEY = "survey"
+    SCORM_LEARNING_MODULE = "scorm_learning_module"
    MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
    MEDIACAST_VIDEO = "mediacast_video"
    OPENCAST_VIDEO = "opencast_video"
@@ -120,9 +122,25 @@ class IliasPage:
        if self._is_content_page():
            log.explain("Page is a content page, searching for elements")
            return self._find_copa_entries()
+        if self._is_info_tab():
+            log.explain("Page is info tab, searching for elements")
+            return self._find_info_tab_entries()
        log.explain("Page is a normal folder, searching for elements")
        return self._find_normal_entries()

+    def get_info_tab(self) -> Optional[IliasPageElement]:
+        tab: Optional[Tag] = self._soup.find(
+            name="a",
+            attrs={"href": lambda x: x and "cmdClass=ilinfoscreengui" in x}
+        )
+        if tab is not None:
+            return IliasPageElement(
+                IliasElementType.INFO_TAB,
+                self._abs_url_from_link(tab),
+                "infos"
+            )
+        return None
+
    def get_description(self) -> Optional[BeautifulSoup]:
        def is_interesting_class(name: str) -> bool:
            return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
@@ -209,7 +227,11 @@ class IliasPage:
            log.explain("Requesting *all* future meetings")
            return self._uncollapse_future_meetings_url()
        if not self._is_content_tab_selected():
-            return self._select_content_page_url()
+            if self._page_type != IliasElementType.INFO_TAB:
+                log.explain("Selecting content tab")
+                return self._select_content_page_url()
+            else:
+                log.explain("Crawling info tab, skipping content select")
        return None

    def _is_forum_page(self) -> bool:
@@ -272,7 +294,10 @@ class IliasPage:
        return self._uncollapse_future_meetings_url() is not None

    def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]:
-        element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x})
+        element = self._soup.find(
+            "a",
+            attrs={"href": lambda x: x and ("crs_next_sess=1" in x or "crs_prev_sess=1" in x)}
+        )
        if not element:
            return None
        link = self._abs_url_from_link(element)
@@ -281,6 +306,10 @@ class IliasPage:
    def _is_content_tab_selected(self) -> bool:
        return self._select_content_page_url() is None

+    def _is_info_tab(self) -> bool:
+        might_be_info = self._soup.find("form", attrs={"name": lambda x: x == "formInfoScreen"}) is not None
+        return self._page_type == IliasElementType.INFO_TAB and might_be_info
+
    def _select_content_page_url(self) -> Optional[IliasPageElement]:
        tab = self._soup.find(
            id="tab_view_content",
@@ -389,6 +418,23 @@ class IliasPage:

        return items

+    def _find_info_tab_entries(self) -> List[IliasPageElement]:
+        items = []
+        links: List[Tag] = self._soup.select("a.il_ContainerItemCommand")
+
+        for link in links:
+            if "cmdClass=ilobjcoursegui" not in link["href"]:
+                continue
+            if "cmd=sendfile" not in link["href"]:
+                continue
+            items.append(IliasPageElement(
+                IliasElementType.FILE,
+                self._abs_url_from_link(link),
+                _sanitize_path_name(link.getText())
+            ))
+
+        return items
+
    def _find_opencast_video_entries(self) -> List[IliasPageElement]:
        # ILIAS has three stages for video pages
        # 1. The initial dummy page without any videos. This page contains the link to the listing
@@ -468,8 +514,11 @@ class IliasPage:
                f"td.std:nth-child({index})"
            ).getText().strip()
            if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
+                log.explain(f"Converting {modification_string!r}")
                modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
                break
+            else:
+                log.explain(f"Date has wrong format: {modification_string!r}")

        if modification_time is None:
            log.warn(f"Could not determine upload time for {link}")
@@ -908,6 +957,9 @@ class IliasPage:
        if "baseClass=ilMediaCastHandlerGUI" in parsed_url.query:
            return IliasElementType.MEDIACAST_VIDEO_FOLDER

+        if "baseClass=ilSAHSPresentationGUI" in parsed_url.query:
+            return IliasElementType.SCORM_LEARNING_MODULE
+
        # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
        # try to guess it from the image.

@@ -949,7 +1001,11 @@ class IliasPage:
        if img_tag is None:
            img_tag = found_parent.select_one("img.icon")

-        if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}):
+        is_session_expansion_button = found_parent.find(
+            "a",
+            attrs={"href": lambda x: x and ("crs_next_sess=" in x or "crs_prev_sess=" in x)}
+        )
+        if img_tag is None and is_session_expansion_button:
            log.explain("Found session expansion button, skipping it as it has no content")
            return None

@@ -982,6 +1038,9 @@ class IliasPage:
        if str(img_tag["src"]).endswith("icon_mcst.svg"):
            return IliasElementType.MEDIACAST_VIDEO_FOLDER

+        if str(img_tag["src"]).endswith("icon_sahs.svg"):
+            return IliasElementType.SCORM_LEARNING_MODULE
+
        return IliasElementType.FOLDER

    @staticmethod
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -85,6 +85,7 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([
    IliasElementType.EXERCISE,
    IliasElementType.EXERCISE_FILES,
    IliasElementType.FOLDER,
+    IliasElementType.INFO_TAB,
    IliasElementType.MEETING,
    IliasElementType.MEDIACAST_VIDEO_FOLDER,
    IliasElementType.OPENCAST_VIDEO_FOLDER,
@@ -262,6 +263,8 @@ instance's greatest bottleneck.
                        next_stage_url = None

                elements.extend(page.get_child_elements())
+                if info_tab := page.get_info_tab():
+                    elements.append(info_tab)
                if description_string := page.get_description():
                    description.append(description_string)

@@ -400,6 +403,14 @@ instance's greatest bottleneck.
                "[bright_black](surveys contain no relevant data)"
            )
            return None
+        elif element.type == IliasElementType.SCORM_LEARNING_MODULE:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](scorm learning modules are not supported)"
+            )
+            return None
        elif element.type == IliasElementType.LEARNING_MODULE:
            return await self._handle_learning_module(element, element_path)
        elif element.type == IliasElementType.LINK:
@@ -705,7 +716,7 @@ instance's greatest bottleneck.
                log.explain(f"URL: {next_stage_url}")

                soup = await self._get_page(next_stage_url)
-                page = IliasPage(soup, next_stage_url, None)
+                page = IliasPage(soup, next_stage_url, element)

                if next := page.get_next_stage_element():
                    next_stage_url = next.url
@@ -768,14 +779,14 @@ instance's greatest bottleneck.
            log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
            log.explain(f"URL: {element.url}")
            soup = await self._get_page(element.url)
-            page = IliasPage(soup, element.url, None)
+            page = IliasPage(soup, element.url, element)
            if next := page.get_learning_module_data():
                elements.extend(await self._crawl_learning_module_direction(
-                    cl.path, next.previous_url, "left"
+                    cl.path, next.previous_url, "left", element
                ))
                elements.append(next)
                elements.extend(await self._crawl_learning_module_direction(
-                    cl.path, next.next_url, "right"
+                    cl.path, next.next_url, "right", element
                ))

        # Reflect their natural ordering in the file names
@@ -797,7 +808,8 @@ instance's greatest bottleneck.
        self,
        path: PurePath,
        start_url: Optional[str],
-        dir: Union[Literal["left"], Literal["right"]]
+        dir: Union[Literal["left"], Literal["right"]],
+        parent_element: IliasPageElement
    ) -> List[IliasLearningModulePage]:
        elements: List[IliasLearningModulePage] = []

@@ -810,7 +822,7 @@ instance's greatest bottleneck.
            log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
            log.explain(f"URL: {next_element_url}")
            soup = await self._get_page(next_element_url)
-            page = IliasPage(soup, next_element_url, None)
+            page = IliasPage(soup, next_element_url, parent_element)
            if next := page.get_learning_module_data():
                elements.append(next)
                if dir == "left":
@@ -893,7 +905,7 @@ instance's greatest bottleneck.
            soup = soupify(await request.read())
            if self._is_logged_in(soup):
                return self._verify_page(soup, url, root_page_allowed)
-        raise CrawlError("get_page failed even after authenticating")
+        raise CrawlError(f"get_page failed even after authenticating on {url!r}")

    def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
        if IliasPage.is_root_page(soup) and not root_page_allowed:
--- a/PFERD/deduplicator.py
+++ b/PFERD/deduplicator.py
@@ -14,7 +14,7 @@ def name_variants(path: PurePath) -> Iterator[PurePath]:


 class Deduplicator:
-    FORBIDDEN_CHARS = '<>:"/\\|?*'
+    FORBIDDEN_CHARS = '<>:"/\\|?*' + "".join([chr(i) for i in range(0, 32)])
    FORBIDDEN_NAMES = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -415,6 +415,7 @@ class OutputDirectory:

    def _update_metadata(self, info: DownloadInfo) -> None:
        if mtime := info.heuristics.mtime:
+            log.explain(f"Setting mtime to {mtime}")
            mtimestamp = mtime.timestamp()
            os.utime(info.local_path, times=(mtimestamp, mtimestamp))

--- a/PFERD/version.py
+++ b/PFERD/version.py
@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.4.3"
+VERSION = "3.5.0"
--- a/flake.lock
+++ b/flake.lock
@@ -2,11 +2,11 @@
  "nodes": {
    "nixpkgs": {
      "locked": {
-        "lastModified": 1692986144,
-        "narHash": "sha256-M4VFpy7Av9j+33HF5nIGm0k2+DXXW4qSSKdidIKg5jY=",
+        "lastModified": 1694499547,
+        "narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "74e5bdc5478ebbe7ba5849f0d765f92757bb9dbf",
+        "rev": "e5f018cf150e29aac26c61dac0790ea023c46b24",
        "type": "github"
      },
      "original": {
Author	SHA1	Message	Date
I-Al-Istannen	03efa17cf1	Print mtime before updating file metadata	2023-09-23 13:01:58 +02:00
I-Al-Istannen	533bc27439	Bump version to 3.5.0	2023-09-13 23:13:30 +02:00
I-Al-Istannen	0113a0ca10	Update flake.lock	2023-09-13 22:23:36 +02:00
I-Al-Istannen	40f8a05ad6	Add .idea to gitignore	2023-09-13 22:23:36 +02:00
I-Al-Istannen	50b50513c6	Ignore SCORM learning modules	2023-08-29 13:51:19 +02:00
I-Al-Istannen	df3514cd03	Crawl paginated past meetings	2023-08-29 12:41:21 +02:00
I-Al-Istannen	ad53185247	Sanitize ascii control characters on windows	2023-08-29 12:41:15 +02:00
I-Al-Istannen	87b67e9271	Crawl files in the info tab	2023-08-29 12:41:15 +02:00