From be175f9347ea73160839de643e089db328cf78df Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Wed, 19 Feb 2025 14:18:43 +0100
Subject: [PATCH] Download only new/updated forum threads

---
 PFERD/crawl/crawler.py                 | 29 +++++++++++
 PFERD/crawl/ilias/ilias_web_crawler.py | 60 +++++++++++++++++------
 PFERD/crawl/ilias/kit_ilias_html.py    | 67 +++++++++++++++++++++-----
 PFERD/output_dir.py                    | 16 ++++++
 4 files changed, 145 insertions(+), 27 deletions(-)

diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py
index fda1307..74616e0 100644
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@@ -294,6 +294,35 @@ class Crawler(ABC):
         log.explain("Answer: Yes")
         return CrawlToken(self._limiter, path)
 
+    def should_try_download(
+        self,
+        path: PurePath,
+        *,
+        etag_differs: Optional[bool] = None,
+        mtime: Optional[datetime] = None,
+        redownload: Optional[Redownload] = None,
+        on_conflict: Optional[OnConflict] = None,
+    ) -> bool:
+        log.explain_topic(f"Decision: Should Download {fmt_path(path)}")
+
+        if self._transformer.transform(path) is None:
+            log.explain("Answer: No (ignored)")
+            return False
+
+        should_download = self._output_dir.should_try_download(
+            path,
+            etag_differs=etag_differs,
+            mtime=mtime,
+            redownload=redownload,
+            on_conflict=on_conflict
+        )
+        if should_download:
+            log.explain("Answer: Yes")
+            return True
+        else:
+            log.explain("Answer: No")
+            return False
+
     async def download(
         self,
         path: PurePath,
diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py
index 557150c..7351593 100644
--- a/PFERD/crawl/ilias/ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/ilias_web_crawler.py
@@ -723,20 +723,52 @@ instance's greatest bottleneck.
             else:
                 break
 
-        download_data = cast(IliasPage, page).get_download_forum_data()
-        if not download_data:
-            raise CrawlWarning("Failed to extract forum data")
-        if download_data.empty:
+        forum_threads: list[tuple[IliasPageElement, bool]] = []
+        for entry in cast(IliasPage, page).get_forum_entries():
+            path = cl.path / (_sanitize_path_name(entry.name) + ".html")
+            forum_threads.append((entry, self.should_try_download(path, mtime=entry.mtime)))
+
+        # Sort the ids. The forum download will *preserve* this ordering
+        forum_threads.sort(key=lambda elem: elem[0].id())
+
+        if not forum_threads:
             log.explain("Forum had no threads")
             return
 
-        html = await self._post_authenticated(download_data.url, download_data.form_data)
-        elements = parse_ilias_forum_export(soupify(html))
-        elements.sort(key=lambda elem: elem.title)
+        download_data = cast(IliasPage, page).get_download_forum_data(
+            [thread.id() for thread, download in forum_threads if download]
+        )
+        if not download_data:
+            raise CrawlWarning("Failed to extract forum data")
+
+        if not download_data.empty:
+            html = await self._post_authenticated(download_data.url, download_data.form_data)
+            elements = parse_ilias_forum_export(soupify(html))
+        else:
+            elements = []
+
+        # Verify that ILIAS does not change the order, as we depend on it later. Otherwise, we could not call
+        # download in the correct order, potentially messing up duplication handling.
+        expected_element_titles = [thread.name for thread, download in forum_threads if download]
+        actual_element_titles = [_sanitize_path_name(thread.name) for thread in elements]
+        if expected_element_titles != actual_element_titles:
+            raise CrawlWarning(
+                f"Forum thread order mismatch: {expected_element_titles} != {actual_element_titles}"
+            )
 
         tasks: List[Awaitable[None]] = []
-        for elem in elements:
-            tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
+        for thread, download in forum_threads:
+            if download:
+                # This only works because ILIAS keeps the order in the export
+                elem = elements.pop(0)
+                tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
+            else:
+                # We only downloaded the threads we "should_try_download"ed. This can be an
+                # over-approximation, in which case all is fine.
+                # We might also have selected too few, e.g. because there was a duplicate title and the
+                # mtime of the original is newer than the update of the duplicate. That leaves stale
+                # data locally, but I consider this problem acceptable right now.
+                tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread)))
 
         # And execute them
         await self.gather(tasks)
@@ -746,17 +778,17 @@ instance's greatest bottleneck.
     async def _download_forum_thread(
         self,
         parent_path: PurePath,
-        element: IliasForumThread,
+        element: Union[IliasForumThread, IliasPageElement]
     ) -> None:
-        path = parent_path / (_sanitize_path_name(element.title) + ".html")
+        path = parent_path / (_sanitize_path_name(element.name) + ".html")
         maybe_dl = await self.download(path, mtime=element.mtime)
-        if not maybe_dl:
+        if not maybe_dl or not isinstance(element, IliasForumThread):
             return
 
         async with maybe_dl as (bar, sink):
             content = "<!DOCTYPE html>\n"
-            content += cast(str, element.title_tag.prettify())
-            content += cast(str, element.content_tag.prettify())
+            content += cast(str, element.name_tag.prettify())
+            content += cast(str, await self.internalize_images(element.content_tag.prettify()))
             sink.file.write(content.encode("utf-8"))
             sink.done()
 
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index a194856..7956b00 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -22,6 +22,7 @@ class IliasElementType(Enum):
     FILE = "file"
     FOLDER = "folder"
     FORUM = "forum"
+    FORUM_THREAD = "forum_thread"
    INFO_TAB = "info_tab"
    LEARNING_MODULE = "learning_module"
    LINK = "link"
@@ -54,6 +55,7 @@ class IliasPageElement:
            r"fold_(?P<id>\d+)",
            r"frm_(?P<id>\d+)",
            r"exc_(?P<id>\d+)",
+            r"thr_pk=(?P<id>\d+)",  # forums
            r"ref_id=(?P<id>\d+)",
            r"target=[a-z]+_(?P<id>\d+)",
            r"mm_(?P<id>\d+)"
@@ -123,8 +125,8 @@ class IliasDownloadForumData:
 
 @dataclass
 class IliasForumThread:
-    title: str
-    title_tag: Tag
+    name: str
+    name_tag: Tag
     content_tag: Tag
     mtime: Optional[datetime]
 
@@ -242,7 +244,36 @@ class IliasPage:
                 return url
         return None
 
-    def get_download_forum_data(self) -> Optional[IliasDownloadForumData]:
+    def get_forum_entries(self) -> list[IliasPageElement]:
+        form = self._get_forum_form()
+        if not form:
+            return []
+        threads = []
+
+        for row in cast(list[Tag], form.select("table > tbody > tr")):
+            url_tag = cast(
+                Optional[Tag],
+                row.find(name="a", attrs={"href": lambda x: x is not None and "cmd=viewthread" in x.lower()})
+            )
+            if url_tag is None:
+                log.explain(f"Skipping row without URL: {row}")
+                continue
+            name = url_tag.get_text().strip()
+            columns = [td.get_text().strip() for td in cast(list[Tag], row.find_all(name="td"))]
+            potential_dates_opt = [IliasPage._find_date_in_text(column) for column in columns]
+            potential_dates = [x for x in potential_dates_opt if x is not None]
+            mtime = max(potential_dates) if potential_dates else None
+
+            threads.append(IliasPageElement.create_new(
+                IliasElementType.FORUM_THREAD,
+                self._abs_url_from_link(url_tag),
+                name,
+                mtime=mtime
+            ))
+
+        return threads
+
+    def get_download_forum_data(self, thread_ids: list[str]) -> Optional[IliasDownloadForumData]:
         form = cast(Optional[Tag], self._soup.find(
             "form",
             attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x}
@@ -251,7 +282,7 @@ class IliasPage:
             return None
         post_url = self._abs_url_from_relative(cast(str, form["action"]))
 
-        thread_ids = [f["value"] for f in cast(list[Tag], form.find_all(attrs={"name": "thread_ids[]"}))]
+        log.explain(f"Fetching forum threads {thread_ids}")
 
         form_data: Dict[str, Union[str, list[str]]] = {
             "thread_ids[]": cast(list[str], thread_ids),
@@ -262,6 +293,12 @@ class IliasPage:
 
         return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0)
 
+    def _get_forum_form(self) -> Optional[Tag]:
+        return cast(Optional[Tag], self._soup.find(
+            "form",
+            attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x}
+        ))
+
     def get_next_stage_element(self) -> Optional[IliasPageElement]:
         if self._is_forum_page():
             if "trows=" in self._page_url:
@@ -950,16 +987,9 @@ class IliasPage:
         # The rest does not have a stable order. Grab the whole text and reg-ex the date
         # out of it
         all_properties_text = properties_parent.get_text().strip()
-        modification_date_match = re.search(
-            r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)",
-            all_properties_text
-        )
-        if modification_date_match is None:
-            modification_date = None
+        modification_date = IliasPage._find_date_in_text(all_properties_text)
+        if modification_date is None:
             log.explain(f"Element {name} at {url} has no date.")
-        else:
-            modification_date_str = modification_date_match.group(1)
-            modification_date = demangle_date(modification_date_str)
 
         # Grab the name from the link text
         full_path = name + "." + file_type
@@ -1243,6 +1273,17 @@ class IliasPage:
                 return True
         return False
 
+    @staticmethod
+    def _find_date_in_text(text: str) -> Optional[datetime]:
+        modification_date_match = re.search(
+            r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)",
+            text
+        )
+        if modification_date_match is not None:
+            modification_date_str = modification_date_match.group(1)
+            return demangle_date(modification_date_str)
+        return None
+
     def get_permalink(self) -> Optional[str]:
         return IliasPage.get_soup_permalink(self._soup)
 
diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py
index 09cf133..94337b6 100644
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -371,6 +371,22 @@ class OutputDirectory:
 
         raise OutputDirError("Failed to create temporary file")
 
+    def should_try_download(
+        self,
+        path: PurePath,
+        *,
+        etag_differs: Optional[bool] = None,
+        mtime: Optional[datetime] = None,
+        redownload: Optional[Redownload] = None,
+        on_conflict: Optional[OnConflict] = None,
+    ) -> bool:
+        heuristics = Heuristics(etag_differs, mtime)
+        redownload = self._redownload if redownload is None else redownload
+        on_conflict = self._on_conflict if on_conflict is None else on_conflict
+        local_path = self.resolve(path)
+
+        return self._should_download(local_path, heuristics, redownload, on_conflict)
+
     async def download(
         self,
         remote_path: PurePath,