diff --git a/CHANGELOG.md b/CHANGELOG.md
index b7cad13..1d70c4a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ ambiguous situations.
 
 ### Added
 - Download of page descriptions
+- Forum download support
 
 ### Changed
 - Add `cpp` extension to default `link_regex` of IPD crawler
diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py
index 12803a6..de74fc3 100644
--- a/PFERD/cli/command_kit_ilias_web.py
+++ b/PFERD/cli/command_kit_ilias_web.py
@@ -62,6 +62,11 @@ GROUP.add_argument(
     action=BooleanOptionalAction,
     help="crawl and download videos"
 )
+GROUP.add_argument(
+    "--forums",
+    action=BooleanOptionalAction,
+    help="crawl and download forum posts"
+)
 GROUP.add_argument(
     "--http-timeout", "-t",
     type=float,
@@ -90,6 +95,8 @@ def load(
         section["link_redirect_delay"] = str(args.link_redirect_delay)
     if args.videos is not None:
         section["videos"] = "yes" if args.videos else "no"
+    if args.forums is not None:
+        section["forums"] = "yes" if args.forums else "no"
     if args.http_timeout is not None:
         section["http_timeout"] = str(args.http_timeout)
 
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index d58e5c8..7bab152 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -3,7 +3,7 @@ import re
 from dataclasses import dataclass
 from datetime import date, datetime, timedelta
 from enum import Enum
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union
 from urllib.parse import urljoin, urlparse
 
 from bs4 import BeautifulSoup, Tag
@@ -55,6 +55,20 @@ class IliasPageElement:
         return self.url
 
 
+@dataclass
+class IliasDownloadForumData:
+    url: str
+    form_data: Dict[str, Union[str, List[str]]]
+
+
+@dataclass
+class IliasForumThread:
+    title: str
+    title_tag: Tag
+    content_tag: Tag
+    mtime: Optional[datetime]
+
+
 class IliasPage:
 
     def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]):
@@ -110,13 +124,39 @@ class IliasPage:
 
         return BeautifulSoup(raw_html, "html.parser")
 
+    def get_download_forum_data(self) -> Optional[IliasDownloadForumData]:
+        form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x})
+        if not form:
+            return None
+        post_url = self._abs_url_from_relative(form["action"])
+
+        form_data: Dict[str, Union[str, List[str]]] = {
+            "thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})],
+            "selected_cmd2": "html",
+            "select_cmd2": "Ausführen",
+            "selected_cmd": "",
+        }
+
+        return IliasDownloadForumData(post_url, form_data)
+
     def get_next_stage_element(self) -> Optional[IliasPageElement]:
+        if self._is_forum_page():
+            if "trows=800" in self._page_url:
+                return None
+            return self._get_show_max_forum_entries_per_page_url()
         if self._is_ilias_opencast_embedding():
             return self.get_child_elements()[0]
         if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED:
             return self._find_video_entries_paginated()[0]
         return None
 
+    def _is_forum_page(self) -> bool:
+        read_more_btn = self._soup.find(
+            "button",
+            attrs={"onclick": lambda x: x and "cmdClass=ilobjforumgui&cmd=markAllRead" in x}
+        )
+        return read_more_btn is not None
+
     def _is_video_player(self) -> bool:
         return "paella_config_file" in str(self._soup)
 
@@ -194,6 +234,19 @@ class IliasPage:
 
         return items
 
+    def _get_show_max_forum_entries_per_page_url(self) -> Optional[IliasPageElement]:
+        correct_link = self._soup.find(
+            "a",
+            attrs={"href": lambda x: x and "trows=800" in x and "cmd=showThreads" in x}
+        )
+
+        if not correct_link:
+            return None
+
+        link = self._abs_url_from_link(correct_link)
+
+        return IliasPageElement(IliasElementType.FORUM, link, "show all forum threads")
+
     def _find_personal_desktop_entries(self) -> List[IliasPageElement]:
         items: List[IliasPageElement] = []
 
@@ -877,3 +930,38 @@ def _tomorrow() -> date:
 
 def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-").strip()
+
+
+def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThread]:
+    elements = []
+    for p in forum_export.select("body > p"):
+        title_tag = p
+        content_tag = p.find_next_sibling("ul")
+        title = p.find("b").text
+        if ":" in title:
+            title = title[title.find(":") + 1:]
+        title = title.strip()
+        mtime = _guess_timestamp_from_forum_post_content(content_tag)
+        elements.append(IliasForumThread(title, title_tag, content_tag, mtime))
+
+    return elements
+
+
+def _guess_timestamp_from_forum_post_content(content: Tag) -> Optional[datetime]:
+    posts: List[Tag] = content.select(".ilFrmPostHeader > span.small")
+    if not posts:
+        return None
+
+    newest_date: Optional[datetime] = None
+
+    for post in posts:
+        text = post.text.strip()
+        text = text[text.rfind("|") + 1:]
+        date = demangle_date(text, fail_silently=True)
+        if not date:
+            continue
+
+        if not newest_date or newest_date < date:
+            newest_date = date
+
+    return newest_date
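Note on the export format consumed above: parse_ilias_forum_export expects ILIAS's HTML thread export, where each thread is rendered as a <p><b>…</b></p> heading directly under <body>, followed by a <ul> of posts whose headers carry an "author | date" span. A minimal sketch of the round trip (the HTML snippet is a hand-written stand-in for a real export, not captured from a live ILIAS instance):

    from bs4 import BeautifulSoup

    from PFERD.crawl.ilias.kit_ilias_html import parse_ilias_forum_export

    export = BeautifulSoup("""
    <html><body>
    <p><b>Thema: Organisatorisches</b></p>
    <ul><li>
      <div class="ilFrmPostHeader"><span class="small">Alice | 14. Mai 2022, 13:37</span></div>
      <div class="ilFrmPostContent">First post</div>
    </li></ul>
    </body></html>
    """, "html.parser")

    for thread in parse_ilias_forum_export(export):
        # Everything up to the first ":" is stripped, leaving "Organisatorisches".
        # mtime is the newest post date demangle_date recognizes, or None.
        print(thread.title, thread.mtime)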
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index bbed986..156cd4c 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -18,7 +18,8 @@ from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadTo
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
 from .ilias_html_cleaner import clean, insert_base_markup
-from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
+from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement,
+                             _sanitize_path_name, parse_ilias_forum_export)
 
 TargetType = Union[str, int]
 
@@ -67,6 +68,9 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
     def videos(self) -> bool:
         return self.s.getboolean("videos", fallback=False)
 
+    def forums(self) -> bool:
+        return self.s.getboolean("forums", fallback=False)
+
 
 _DIRECTORY_PAGES: Set[IliasElementType] = set([
     IliasElementType.EXERCISE,
@@ -183,6 +187,7 @@ instance's greatest bottleneck.
         self._link_file_redirect_delay = section.link_redirect_delay()
         self._links = section.links()
         self._videos = section.videos()
+        self._forums = section.forums()
         self._visited_urls: Set[str] = set()
 
     async def _run(self) -> None:
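The new section option mirrors the existing videos switch, so forum crawling can also be enabled per crawler in the config file instead of on the command line; a minimal sketch (section name and target are placeholders):

    [crawl:my-course]
    type = kit-ilias-web
    target = 1234567
    forums = yes
    videos = no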
@@ -335,22 +340,27 @@ instance's greatest bottleneck.
         element_path = PurePath(parent_path, element.name)
 
         if element.type in _VIDEO_ELEMENTS:
-            log.explain_topic(f"Decision: Crawl video element {fmt_path(element_path)}")
             if not self._videos:
-                log.explain("Video crawling is disabled")
-                log.explain("Answer: no")
+                log.status(
+                    "[bold bright_black]",
+                    "Ignored",
+                    fmt_path(element_path),
+                    "[bright_black](enable with option 'videos')"
+                )
                 return None
-            else:
-                log.explain("Video crawling is enabled")
-                log.explain("Answer: yes")
 
         if element.type == IliasElementType.FILE:
             return await self._handle_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
-            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
-            log.explain("Forums are not supported")
-            log.explain("Answer: No")
-            return None
+            if not self._forums:
+                log.status(
+                    "[bold bright_black]",
+                    "Ignored",
+                    fmt_path(element_path),
+                    "[bright_black](enable with option 'forums')"
+                )
+                return None
+            return await self._handle_forum(element, element_path)
         elif element.type == IliasElementType.TEST:
             log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
             log.explain("Tests contain no relevant files")
@@ -635,6 +645,68 @@ instance's greatest bottleneck.
         if not await try_stream():
             raise CrawlError("File streaming failed after authenticate()")
 
+    async def _handle_forum(
+        self,
+        element: IliasPageElement,
+        element_path: PurePath,
+    ) -> Optional[Coroutine[Any, Any, None]]:
+        maybe_cl = await self.crawl(element_path)
+        if not maybe_cl:
+            return None
+        return self._crawl_forum(element, maybe_cl)
+
+    @_iorepeat(3, "crawling forum")
+    @anoncritical
+    async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None:
+        elements = []
+
+        async with cl:
+            next_stage_url = element.url
+            while next_stage_url:
+                log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
+                log.explain(f"URL: {next_stage_url}")
+
+                soup = await self._get_page(next_stage_url)
+                page = IliasPage(soup, next_stage_url, None)
+
+                if next := page.get_next_stage_element():
+                    next_stage_url = next.url
+                else:
+                    break
+
+            download_data = page.get_download_forum_data()
+            if not download_data:
+                raise CrawlWarning("Failed to extract forum data")
+            html = await self._post_authenticated(download_data.url, download_data.form_data)
+            elements = parse_ilias_forum_export(soupify(html))
+
+        elements.sort(key=lambda elem: elem.title)
+
+        tasks: List[Awaitable[None]] = []
+        for elem in elements:
+            tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
+
+        # And execute them
+        await self.gather(tasks)
+
+    @anoncritical
+    @_iorepeat(3, "saving forum thread")
+    async def _download_forum_thread(
+        self,
+        parent_path: PurePath,
+        element: IliasForumThread,
+    ) -> None:
+        path = parent_path / (_sanitize_path_name(element.title) + ".html")
+        maybe_dl = await self.download(path, mtime=element.mtime)
+        if not maybe_dl:
+            return
+
+        async with maybe_dl as (bar, sink):
+            content = element.title_tag.prettify()
+            content += element.content_tag.prettify()
+            sink.file.write(content.encode("utf-8"))
+            sink.done()
+
     async def _get_page(self, url: str) -> BeautifulSoup:
         auth_id = await self._current_auth_id()
         async with self.session.get(url) as request:
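_crawl_forum above fans every thread out as its own task and then waits on all of them; self.gather is the crawler's limiter-aware wrapper, but the underlying shape is plain asyncio. A self-contained sketch of the same pattern, without PFERD's limiter or progress bars:

    import asyncio

    async def save_thread(title: str) -> None:
        # Stand-in for _download_forum_thread: one download per thread.
        await asyncio.sleep(0)
        print(f"saved {title}.html")

    async def crawl_forum(titles: list[str]) -> None:
        tasks = [asyncio.create_task(save_thread(t)) for t in sorted(titles)]
        # One failing thread should not cancel its siblings. The crawler gets
        # this from @anoncritical; return_exceptions=True is the plain-asyncio
        # equivalent.
        await asyncio.gather(*tasks, return_exceptions=True)

    asyncio.run(crawl_forum(["Organisatorisches", "Klausur"]))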
@@ -652,13 +724,37 @@ instance's greatest bottleneck.
                 return soup
         raise CrawlError("get_page failed even after authenticating")
 
+    async def _post_authenticated(
+        self,
+        url: str,
+        data: dict[str, Union[str, List[str]]]
+    ) -> bytes:
+        auth_id = await self._current_auth_id()
+
+        form_data = aiohttp.FormData()
+        for key, val in data.items():
+            form_data.add_field(key, val)
+
+        async with self.session.post(url, data=form_data, allow_redirects=False) as request:
+            if request.status == 200:
+                return await request.read()
+
+        # We weren't authenticated, so try to do that
+        await self.authenticate(auth_id)
+
+        # Retry once after authenticating. If this fails, we will die.
+        async with self.session.post(url, data=form_data, allow_redirects=False) as request:
+            if request.status == 200:
+                return await request.read()
+        raise CrawlError("post_authenticated failed even after authenticating")
+
     # We repeat this as the login method in shibboleth doesn't handle I/O errors.
     # Shibboleth is quite reliable as well, the repeat is likely not critical here.
     @_iorepeat(3, "Login", failure_is_error=True)
     async def _authenticate(self) -> None:
         await self._shibboleth_login.login(self.session)
 
     @staticmethod
     def _is_logged_in(soup: BeautifulSoup) -> bool:
         # Normal ILIAS pages
         mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
diff --git a/PFERD/logging.py b/PFERD/logging.py
index e833716..340b21f 100644
--- a/PFERD/logging.py
+++ b/PFERD/logging.py
@@ -197,7 +197,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
         if self.output_explain:
             self.print(f"    {escape(text)}")
 
-    def status(self, style: str, action: str, text: str) -> None:
+    def status(self, style: str, action: str, text: str, suffix: str = "") -> None:
         """
         Print a status update while crawling. Allows markup in the "style"
        argument which will be applied to the "action" string.
@@ -205,7 +205,10 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
 
         if self.output_status:
             action = escape(f"{action:<{self.STATUS_WIDTH}}")
-            self.print(f"{style}{action}[/] {escape(text)}")
+            line = f"{style}{action}[/] {escape(text)}"
+            if suffix:
+                line += f" {suffix}"
+            self.print(line)
 
     def report(self, text: str) -> None:
         """
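Since suffix defaults to "", every existing three-argument status() call site keeps its old behavior; only the new ignore paths pass a fourth argument. A usage sketch (log is the module-level singleton from PFERD/logging.py, and output only appears when status printing is enabled):

    from PFERD.logging import log

    # Existing three-argument form, unchanged:
    log.status("[bold green]", "Crawled", "Course/Forum")

    # New form: the suffix is appended unescaped, so it may carry markup.
    # Callers must not feed it untrusted text.
    log.status(
        "[bold bright_black]",
        "Ignored",
        "Course/Forum",
        "[bright_black](enable with option 'forums')",
    )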