Handle empty forums

This commit is contained in:
I-Al-Istannen 2022-10-24 13:09:29 +02:00
parent 4a51aaa4f5
commit d72fc2760b
3 changed files with 11 additions and 3 deletions

View File

@ -24,6 +24,7 @@ ambiguous situations.
### Fixed
- Forum crawling crashing when parsing empty (= 0 messages) threads
- Forum crawling crashing when a forum has no threads at all
## 3.4.1 - 2022-08-17

View File

@ -59,6 +59,7 @@ class IliasPageElement:
class IliasDownloadForumData:
url: str
form_data: Dict[str, Union[str, List[str]]]
empty: bool
@dataclass
@ -130,14 +131,16 @@ class IliasPage:
return None
post_url = self._abs_url_from_relative(form["action"])
thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]
form_data: Dict[str, Union[str, List[str]]] = {
"thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})],
"thread_ids[]": thread_ids,
"selected_cmd2": "html",
"select_cmd2": "Ausführen",
"selected_cmd": "",
}
return IliasDownloadForumData(post_url, form_data)
return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0)
def get_next_stage_element(self) -> Optional[IliasPageElement]:
if self._is_forum_page():

View File

@ -658,7 +658,7 @@ instance's greatest bottleneck.
@_iorepeat(3, "crawling forum")
@anoncritical
async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None:
elements = []
elements: List[IliasForumThread] = []
async with cl:
next_stage_url = element.url
@ -677,6 +677,10 @@ instance's greatest bottleneck.
download_data = page.get_download_forum_data()
if not download_data:
raise CrawlWarning("Failed to extract forum data")
if download_data.empty:
log.explain("Forum had no threads")
elements = []
return
html = await self._post_authenticated(download_data.url, download_data.form_data)
elements = parse_ilias_forum_export(soupify(html))