Add forum crawling

When the new option is enabled, this downloads all forum posts and saves each
thread in its own HTML file, named after the thread title.
I-Al-Istannen 2022-05-24 23:28:09 +02:00
parent 846c29aee1
commit 46fb782798
5 changed files with 208 additions and 16 deletions
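
Forum crawling is opt-in: the new option defaults to off and has to be enabled per crawler. A minimal configuration sketch that would switch it on (the section layout and the target key follow PFERD's usual config format and are assumed here; only the forums option itself comes from this commit):

[crawl:my-course]
type = kit-ilias-web
target = <course url or id>
forums = yes

On the command line the same switch is exposed as --forums (see the CLI diff below).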

View File

@ -24,6 +24,7 @@ ambiguous situations.
 ### Added
 - Download of page descriptions
+- Forum download support

 ### Changed
 - Add `cpp` extension to default `link_regex` of IPD crawler

View File

@ -62,6 +62,11 @@ GROUP.add_argument(
     action=BooleanOptionalAction,
     help="crawl and download videos"
 )
+GROUP.add_argument(
+    "--forums",
+    action=BooleanOptionalAction,
+    help="crawl and download forum posts"
+)
 GROUP.add_argument(
     "--http-timeout", "-t",
     type=float,
@ -90,6 +95,8 @@ def load(
section["link_redirect_delay"] = str(args.link_redirect_delay) section["link_redirect_delay"] = str(args.link_redirect_delay)
if args.videos is not None: if args.videos is not None:
section["videos"] = "yes" if args.videos else "no" section["videos"] = "yes" if args.videos else "no"
if args.forums is not None:
section["forums"] = "yes" if args.forums else "no"
if args.http_timeout is not None: if args.http_timeout is not None:
section["http_timeout"] = str(args.http_timeout) section["http_timeout"] = str(args.http_timeout)

View File

@ -3,7 +3,7 @@ import re
 from dataclasses import dataclass
 from datetime import date, datetime, timedelta
 from enum import Enum
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union
 from urllib.parse import urljoin, urlparse

 from bs4 import BeautifulSoup, Tag
@ -55,6 +55,20 @@ class IliasPageElement:
         return self.url


+@dataclass
+class IliasDownloadForumData:
+    url: str
+    form_data: Dict[str, Union[str, List[str]]]
+
+
+@dataclass
+class IliasForumThread:
+    title: str
+    title_tag: Tag
+    content_tag: Tag
+    mtime: Optional[datetime]
+
+
 class IliasPage:

     def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]):
@ -110,13 +124,39 @@ class IliasPage:
         return BeautifulSoup(raw_html, "html.parser")

+    def get_download_forum_data(self) -> Optional[IliasDownloadForumData]:
+        form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x})
+        if not form:
+            return None
+        post_url = self._abs_url_from_relative(form["action"])
+
+        form_data: Dict[str, Union[str, List[str]]] = {
+            "thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})],
+            "selected_cmd2": "html",
+            "select_cmd2": "Ausführen",
+            "selected_cmd": "",
+        }
+
+        return IliasDownloadForumData(post_url, form_data)
+
     def get_next_stage_element(self) -> Optional[IliasPageElement]:
+        if self._is_forum_page():
+            if "trows=800" in self._page_url:
+                return None
+            return self._get_show_max_forum_entries_per_page_url()
+
         if self._is_ilias_opencast_embedding():
             return self.get_child_elements()[0]
         if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED:
             return self._find_video_entries_paginated()[0]
         return None

+    def _is_forum_page(self) -> bool:
+        read_more_btn = self._soup.find(
+            "button",
+            attrs={"onclick": lambda x: x and "cmdClass=ilobjforumgui&cmd=markAllRead" in x}
+        )
+        return read_more_btn is not None
+
     def _is_video_player(self) -> bool:
         return "paella_config_file" in str(self._soup)
@ -194,6 +234,19 @@ class IliasPage:
         return items

+    def _get_show_max_forum_entries_per_page_url(self) -> Optional[IliasPageElement]:
+        correct_link = self._soup.find(
+            "a",
+            attrs={"href": lambda x: x and "trows=800" in x and "cmd=showThreads" in x}
+        )
+
+        if not correct_link:
+            return None
+
+        link = self._abs_url_from_link(correct_link)
+
+        return IliasPageElement(IliasElementType.FORUM, link, "show all forum threads")
+
     def _find_personal_desktop_entries(self) -> List[IliasPageElement]:
         items: List[IliasPageElement] = []
@ -877,3 +930,38 @@ def _tomorrow() -> date:
 def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-").strip()
+
+
+def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThread]:
+    elements = []
+    for p in forum_export.select("body > p"):
+        title_tag = p
+        content_tag = p.find_next_sibling("ul")
+        title = p.find("b").text
+        if ":" in title:
+            title = title[title.find(":") + 1:]
+        title = title.strip()
+        mtime = _guess_timestamp_from_forum_post_content(content_tag)
+        elements.append(IliasForumThread(title, title_tag, content_tag, mtime))
+
+    return elements
+
+
+def _guess_timestamp_from_forum_post_content(content: Tag) -> Optional[datetime]:
+    posts: Optional[Tag] = content.select(".ilFrmPostHeader > span.small")
+    if not posts:
+        return None
+
+    newest_date: Optional[datetime] = None
+
+    for post in posts:
+        text = post.text.strip()
+        text = text[text.rfind("|") + 1:]
+        date = demangle_date(text, fail_silently=True)
+        if not date:
+            continue
+
+        if not newest_date or newest_date < date:
+            newest_date = date
+
+    return newest_date
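
The parser above only relies on the structure of the HTML export ILIAS produces. A standalone sketch of how it can be driven, with a tiny hand-written export whose markup shape is inferred from the selectors above (the import path and the German date format are assumptions):

from bs4 import BeautifulSoup

from PFERD.crawl.ilias.kit_ilias_html import parse_ilias_forum_export

export = """
<html><body>
<p><b>Thema: Sheet 3 discussion</b></p>
<ul>
  <li>
    <div class="ilFrmPostHeader"><span class="small">Alice | 24. Mai 2022, 23:28</span></div>
    <div>First post body</div>
  </li>
</ul>
</body></html>
"""

# Each "body > p" becomes one IliasForumThread; the title is everything after the
# first colon, and mtime is the newest parsable date found in the post headers
# (None if demangle_date cannot parse the header text).
for thread in parse_ilias_forum_export(BeautifulSoup(export, "html.parser")):
    print(thread.title, thread.mtime)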

View File

@ -18,7 +18,8 @@ from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadTo
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
 from .ilias_html_cleaner import clean, insert_base_markup
-from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
+from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement,
+                             _sanitize_path_name, parse_ilias_forum_export)

 TargetType = Union[str, int]
@ -67,6 +68,9 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
     def videos(self) -> bool:
         return self.s.getboolean("videos", fallback=False)

+    def forums(self) -> bool:
+        return self.s.getboolean("forums", fallback=False)
+

 _DIRECTORY_PAGES: Set[IliasElementType] = set([
     IliasElementType.EXERCISE,
@ -183,6 +187,7 @@ instance's greatest bottleneck.
         self._link_file_redirect_delay = section.link_redirect_delay()
         self._links = section.links()
         self._videos = section.videos()
+        self._forums = section.forums()
         self._visited_urls: Set[str] = set()

     async def _run(self) -> None:
@ -335,22 +340,27 @@ instance's greatest bottleneck.
         element_path = PurePath(parent_path, element.name)

         if element.type in _VIDEO_ELEMENTS:
-            log.explain_topic(f"Decision: Crawl video element {fmt_path(element_path)}")
             if not self._videos:
-                log.explain("Video crawling is disabled")
-                log.explain("Answer: no")
+                log.status(
+                    "[bold bright_black]",
+                    "Ignored",
+                    fmt_path(element_path),
+                    "[bright_black](enable with option 'videos')"
+                )
                 return None
-            else:
-                log.explain("Video crawling is enabled")
-                log.explain("Answer: yes")

         if element.type == IliasElementType.FILE:
             return await self._handle_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
-            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
-            log.explain("Forums are not supported")
-            log.explain("Answer: No")
-            return None
+            if not self._forums:
+                log.status(
+                    "[bold bright_black]",
+                    "Ignored",
+                    fmt_path(element_path),
+                    "[bright_black](enable with option 'forums')"
+                )
+                return None
+            return await self._handle_forum(element, element_path)
         elif element.type == IliasElementType.TEST:
             log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
             log.explain("Tests contain no relevant files")
@ -635,6 +645,68 @@ instance's greatest bottleneck.
         if not await try_stream():
             raise CrawlError("File streaming failed after authenticate()")

+    async def _handle_forum(
+        self,
+        element: IliasPageElement,
+        element_path: PurePath,
+    ) -> Optional[Coroutine[Any, Any, None]]:
+        maybe_cl = await self.crawl(element_path)
+        if not maybe_cl:
+            return None
+        return self._crawl_forum(element, maybe_cl)
+
+    @_iorepeat(3, "crawling forum")
+    @anoncritical
+    async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None:
+        elements = []
+
+        async with cl:
+            next_stage_url = element.url
+            while next_stage_url:
+                log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
+                log.explain(f"URL: {next_stage_url}")
+
+                soup = await self._get_page(next_stage_url)
+                page = IliasPage(soup, next_stage_url, None)
+
+                if next := page.get_next_stage_element():
+                    next_stage_url = next.url
+                else:
+                    break
+
+            download_data = page.get_download_forum_data()
+            if not download_data:
+                raise CrawlWarning("Failed to extract forum data")
+            html = await self._post_authenticated(download_data.url, download_data.form_data)
+            elements = parse_ilias_forum_export(soupify(html))
+
+        elements.sort(key=lambda elem: elem.title)
+
+        tasks: List[Awaitable[None]] = []
+        for elem in elements:
+            tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
+
+        # And execute them
+        await self.gather(tasks)
+
+    @anoncritical
+    @_iorepeat(3, "saving forum thread")
+    async def _download_forum_thread(
+        self,
+        parent_path: PurePath,
+        element: IliasForumThread,
+    ) -> None:
+        path = parent_path / (_sanitize_path_name(element.title) + ".html")
+        maybe_dl = await self.download(path, mtime=element.mtime)
+        if not maybe_dl:
+            return
+
+        async with maybe_dl as (bar, sink):
+            content = element.title_tag.prettify()
+            content += element.content_tag.prettify()
+            sink.file.write(content.encode("utf-8"))
+            sink.done()
+
     async def _get_page(self, url: str) -> BeautifulSoup:
         auth_id = await self._current_auth_id()
         async with self.session.get(url) as request:
@ -652,13 +724,37 @@ instance's greatest bottleneck.
                 return soup
         raise CrawlError("get_page failed even after authenticating")

+    async def _post_authenticated(
+        self,
+        url: str,
+        data: dict[str, Union[str, List[str]]]
+    ) -> BeautifulSoup:
+        auth_id = await self._current_auth_id()
+
+        form_data = aiohttp.FormData()
+        for key, val in data.items():
+            form_data.add_field(key, val)
+
+        async with self.session.post(url, data=form_data(), allow_redirects=False) as request:
+            if request.status == 200:
+                return await request.read()
+
+        # We weren't authenticated, so try to do that
+        await self.authenticate(auth_id)
+
+        # Retry once after authenticating. If this fails, we will die.
+        async with self.session.post(url, data=data, allow_redirects=False) as request:
+            if request.status == 200:
+                return await request.read()
+        raise CrawlError("post_authenticated failed even after authenticating")
+
     # We repeat this as the login method in shibboleth doesn't handle I/O errors.
     # Shibboleth is quite reliable as well, the repeat is likely not critical here.
     @_iorepeat(3, "Login", failure_is_error=True)
     async def _authenticate(self) -> None:
         await self._shibboleth_login.login(self.session)

     @staticmethod
     def _is_logged_in(soup: BeautifulSoup) -> bool:
         # Normal ILIAS pages
         mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
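
For reference, the export request that _post_authenticated sends can also be reproduced in isolation. A sketch using plain aiohttp (the field names are taken from get_download_forum_data above; the URL is a placeholder and the session is assumed to already carry a valid ILIAS login):

import aiohttp


async def fetch_forum_export(session: aiohttp.ClientSession, post_url: str, thread_ids: list) -> bytes:
    form = aiohttp.FormData()
    # One "thread_ids[]" field per thread that should be part of the export
    for thread_id in thread_ids:
        form.add_field("thread_ids[]", thread_id)
    # "html" selects the HTML export format; "Ausführen" is the German "execute" button label
    form.add_field("selected_cmd2", "html")
    form.add_field("select_cmd2", "Ausführen")
    form.add_field("selected_cmd", "")
    async with session.post(post_url, data=form, allow_redirects=False) as response:
        response.raise_for_status()
        return await response.read()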

View File

@ -197,7 +197,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
         if self.output_explain:
             self.print(f" {escape(text)}")

-    def status(self, style: str, action: str, text: str) -> None:
+    def status(self, style: str, action: str, text: str, suffix: str = "") -> None:
         """
         Print a status update while crawling. Allows markup in the "style"
         argument which will be applied to the "action" string.
@ -205,7 +205,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
         if self.output_status:
             action = escape(f"{action:<{self.STATUS_WIDTH}}")
-            self.print(f"{style}{action}[/] {escape(text)}")
+            self.print(f"{style}{action}[/] {escape(text)} {suffix}")

     def report(self, text: str) -> None:
         """