From d72fc2760b1dd8243ccf21876bb8cc6e027944bb Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 13:09:29 +0200 Subject: [PATCH] Handle empty forums --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 7 +++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 6 +++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70d2cd5..c7a9899 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Fixed - Forum crawling crashing when parsing empty (= 0 messages) threads +- Forum crawling crashing when a forum has no threads at all ## 3.4.1 - 2022-08-17 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 8795512..9ea6b9f 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -59,6 +59,7 @@ class IliasPageElement: class IliasDownloadForumData: url: str form_data: Dict[str, Union[str, List[str]]] + empty: bool @dataclass @@ -130,14 +131,16 @@ class IliasPage: return None post_url = self._abs_url_from_relative(form["action"]) + thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] + form_data: Dict[str, Union[str, List[str]]] = { - "thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})], + "thread_ids[]": thread_ids, "selected_cmd2": "html", "select_cmd2": "Ausführen", "selected_cmd": "", } - return IliasDownloadForumData(post_url, form_data) + return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0) def get_next_stage_element(self) -> Optional[IliasPageElement]: if self._is_forum_page(): diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 1852c5f..f2d5215 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -658,7 +658,7 @@ instance's greatest bottleneck. 
@_iorepeat(3, "crawling forum") @anoncritical async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None: - elements = [] + elements: List[IliasForumThread] = [] async with cl: next_stage_url = element.url @@ -677,6 +677,10 @@ instance's greatest bottleneck. download_data = page.get_download_forum_data() if not download_data: raise CrawlWarning("Failed to extract forum data") + if download_data.empty: + log.explain("Forum had no threads") + elements = [] + return html = await self._post_authenticated(download_data.url, download_data.form_data) elements = parse_ilias_forum_export(soupify(html))