From be175f9347ea73160839de643e089db328cf78df Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Wed, 19 Feb 2025 14:18:43 +0100
Subject: [PATCH] Download only new/updated forum threads

---
 PFERD/crawl/crawler.py                 | 29 +++++++++++
 PFERD/crawl/ilias/ilias_web_crawler.py | 60 +++++++++++++++++------
 PFERD/crawl/ilias/kit_ilias_html.py    | 67 +++++++++++++++++++++-----
 PFERD/output_dir.py                    | 16 ++++++
 4 files changed, 145 insertions(+), 27 deletions(-)

diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py
index fda1307..74616e0 100644
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@@ -294,6 +294,35 @@ class Crawler(ABC):
         log.explain("Answer: Yes")
         return CrawlToken(self._limiter, path)
 
+    def should_try_download(
+        self,
+        path: PurePath,
+        *,
+        etag_differs: Optional[bool] = None,
+        mtime: Optional[datetime] = None,
+        redownload: Optional[Redownload] = None,
+        on_conflict: Optional[OnConflict] = None,
+    ) -> bool:
+        log.explain_topic(f"Decision: Should Download {fmt_path(path)}")
+
+        if self._transformer.transform(path) is None:
+            log.explain("Answer: No (ignored)")
+            return False
+
+        should_download = self._output_dir.should_try_download(
+            path,
+            etag_differs=etag_differs,
+            mtime=mtime,
+            redownload=redownload,
+            on_conflict=on_conflict
+        )
+        if should_download:
+            log.explain("Answer: Yes")
+            return True
+        else:
+            log.explain("Answer: No")
+            return False
+
     async def download(
         self,
         path: PurePath,
diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py
index 557150c..7351593 100644
--- a/PFERD/crawl/ilias/ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/ilias_web_crawler.py
@@ -723,20 +723,52 @@ instance's greatest bottleneck.
             else:
                 break
 
-        download_data = cast(IliasPage, page).get_download_forum_data()
-        if not download_data:
-            raise CrawlWarning("Failed to extract forum data")
-        if download_data.empty:
+        forum_threads: list[tuple[IliasPageElement, bool]] = []
+        for entry in cast(IliasPage, page).get_forum_entries():
+            path = cl.path / (_sanitize_path_name(entry.name) + ".html")
+            forum_threads.append((entry, self.should_try_download(path, mtime=entry.mtime)))
+
+        # Sort the ids. The forum download will *preserve* this ordering
+        forum_threads.sort(key=lambda elem: elem[0].id())
+
+        if not forum_threads:
             log.explain("Forum had no threads")
             return
 
-        html = await self._post_authenticated(download_data.url, download_data.form_data)
-        elements = parse_ilias_forum_export(soupify(html))
-        elements.sort(key=lambda elem: elem.title)
+        download_data = cast(IliasPage, page).get_download_forum_data(
+            [thread.id() for thread, download in forum_threads if download]
+        )
+        if not download_data:
+            raise CrawlWarning("Failed to extract forum data")
+
+        if not download_data.empty:
+            html = await self._post_authenticated(download_data.url, download_data.form_data)
+            elements = parse_ilias_forum_export(soupify(html))
+        else:
+            elements = []
+
+        # Verify that ILIAS does not change the order, as we depend on it later. Otherwise, we could not call
+        # download in the correct order, potentially messing up duplication handling.
+        expected_element_titles = [thread.name for thread, download in forum_threads if download]
+        actual_element_titles = [_sanitize_path_name(thread.name) for thread in elements]
+        if expected_element_titles != actual_element_titles:
+            raise CrawlWarning(
+                f"Forum thread order mismatch: {expected_element_titles} != {actual_element_titles}"
+            )
 
         tasks: List[Awaitable[None]] = []
-        for elem in elements:
-            tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
+        for thread, download in forum_threads:
+            if download:
+                # This only works because ILIAS keeps the order in the export
+                elem = elements.pop(0)
+                tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
+            else:
+                # We only downloaded the threads we "should_try_download"ed. This can be an
+                # over-approximation, in which case all is fine.
+                # We might also have selected too few, e.g. because there was a duplicate title and the
+                # mtime of the original is newer than the update of the duplicate. That leaves stale
+                # data locally, but I consider this problem acceptable right now.
+                tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread)))
 
         # And execute them
         await self.gather(tasks)
@@ -746,17 +778,17 @@ instance's greatest bottleneck.
     async def _download_forum_thread(
         self,
         parent_path: PurePath,
-        element: IliasForumThread,
+        element: Union[IliasForumThread, IliasPageElement]
     ) -> None:
-        path = parent_path / (_sanitize_path_name(element.title) + ".html")
+        path = parent_path / (_sanitize_path_name(element.name) + ".html")
         maybe_dl = await self.download(path, mtime=element.mtime)
-        if not maybe_dl:
+        if not maybe_dl or not isinstance(element, IliasForumThread):
             return
 
         async with maybe_dl as (bar, sink):
             content = "<!DOCTYPE html>\n"
-            content += cast(str, element.title_tag.prettify())
-            content += cast(str, element.content_tag.prettify())
+            content += cast(str, element.name_tag.prettify())
+            content += cast(str, await self.internalize_images(element.content_tag.prettify()))
             sink.file.write(content.encode("utf-8"))
             sink.done()
 
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index a194856..7956b00 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -22,6 +22,7 @@ class IliasElementType(Enum):
     FILE = "file"
     FOLDER = "folder"
     FORUM = "forum"
+    FORUM_THREAD = "forum_thread"
    INFO_TAB = "info_tab"
    LEARNING_MODULE = "learning_module"
    LINK = "link"
@@ -54,6 +55,7 @@ class IliasPageElement:
            r"fold_(?P<id>\d+)",
            r"frm_(?P<id>\d+)",
            r"exc_(?P<id>\d+)",
+            r"thr_pk=(?P<id>\d+)",  # forums
            r"ref_id=(?P<id>\d+)",
            r"target=[a-z]+_(?P<id>\d+)",
            r"mm_(?P<id>\d+)"
@@ -123,8 +125,8 @@ class IliasDownloadForumData:
 
 @dataclass
 class IliasForumThread:
-    title: str
-    title_tag: Tag
+    name: str
+    name_tag: Tag
     content_tag: Tag
     mtime: Optional[datetime]
 
@@ -242,7 +244,36 @@ class IliasPage:
                 return url
         return None
 
-    def get_download_forum_data(self) -> Optional[IliasDownloadForumData]:
+    def get_forum_entries(self) -> list[IliasPageElement]:
+        form = self._get_forum_form()
+        if not form:
+            return []
+        threads = []
+
+        for row in cast(list[Tag], form.select("table > tbody > tr")):
+            url_tag = cast(
+                Optional[Tag],
+                row.find(name="a", attrs={"href": lambda x: x is not None and "cmd=viewthread" in x.lower()})
+            )
+            if url_tag is None:
+                log.explain(f"Skipping row without URL: {row}")
+                continue
+            name = url_tag.get_text().strip()
+            columns = [td.get_text().strip() for td in cast(list[Tag], row.find_all(name="td"))]
+            potential_dates_opt = [IliasPage._find_date_in_text(column) for column in columns]
+            potential_dates = [x for x in potential_dates_opt if x is not None]
+            mtime = max(potential_dates) if potential_dates else None
+
+            threads.append(IliasPageElement.create_new(
+                IliasElementType.FORUM_THREAD,
+                self._abs_url_from_link(url_tag),
+                name,
+                mtime=mtime
+            ))
+
+        return threads
+
+    def get_download_forum_data(self, thread_ids: list[str]) -> Optional[IliasDownloadForumData]:
         form = cast(Optional[Tag], self._soup.find(
             "form",
             attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x}
@@ -251,7 +282,7 @@ class IliasPage:
             return None
         post_url = self._abs_url_from_relative(cast(str, form["action"]))
 
-        thread_ids = [f["value"] for f in cast(list[Tag], form.find_all(attrs={"name": "thread_ids[]"}))]
+        log.explain(f"Fetching forum threads {thread_ids}")
 
         form_data: Dict[str, Union[str, list[str]]] = {
             "thread_ids[]": cast(list[str], thread_ids),
@@ -262,6 +293,12 @@ class IliasPage:
 
         return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0)
 
+    def _get_forum_form(self) -> Optional[Tag]:
+        return cast(Optional[Tag], self._soup.find(
+            "form",
+            attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x}
+        ))
+
     def get_next_stage_element(self) -> Optional[IliasPageElement]:
         if self._is_forum_page():
             if "trows=" in self._page_url:
@@ -950,16 +987,9 @@ class IliasPage:
         # The rest does not have a stable order. Grab the whole text and reg-ex the date
         # out of it
         all_properties_text = properties_parent.get_text().strip()
-        modification_date_match = re.search(
-            r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)",
-            all_properties_text
-        )
-        if modification_date_match is None:
-            modification_date = None
+        modification_date = IliasPage._find_date_in_text(all_properties_text)
+        if modification_date is None:
             log.explain(f"Element {name} at {url} has no date.")
-        else:
-            modification_date_str = modification_date_match.group(1)
-            modification_date = demangle_date(modification_date_str)
 
         # Grab the name from the link text
         full_path = name + "." + file_type
@@ -1243,6 +1273,17 @@ class IliasPage:
                 return True
         return False
 
+    @staticmethod
+    def _find_date_in_text(text: str) -> Optional[datetime]:
+        modification_date_match = re.search(
+            r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)",
+            text
+        )
+        if modification_date_match is not None:
+            modification_date_str = modification_date_match.group(1)
+            return demangle_date(modification_date_str)
+        return None
+
     def get_permalink(self) -> Optional[str]:
         return IliasPage.get_soup_permalink(self._soup)
 
diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py
index 09cf133..94337b6 100644
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -371,6 +371,22 @@ class OutputDirectory:
 
         raise OutputDirError("Failed to create temporary file")
 
+    def should_try_download(
+        self,
+        path: PurePath,
+        *,
+        etag_differs: Optional[bool] = None,
+        mtime: Optional[datetime] = None,
+        redownload: Optional[Redownload] = None,
+        on_conflict: Optional[OnConflict] = None,
+    ) -> bool:
+        heuristics = Heuristics(etag_differs, mtime)
+        redownload = self._redownload if redownload is None else redownload
+        on_conflict = self._on_conflict if on_conflict is None else on_conflict
+        local_path = self.resolve(path)
+
+        return self._should_download(local_path, heuristics, redownload, on_conflict)
+
     async def download(
         self,
         remote_path: PurePath,