Mirror of https://github.com/Garmelon/PFERD.git

Commit: Download only new/updated forum threads
@@ -294,6 +294,35 @@ class Crawler(ABC):
         log.explain("Answer: Yes")
         return CrawlToken(self._limiter, path)
 
+    def should_try_download(
+        self,
+        path: PurePath,
+        *,
+        etag_differs: Optional[bool] = None,
+        mtime: Optional[datetime] = None,
+        redownload: Optional[Redownload] = None,
+        on_conflict: Optional[OnConflict] = None,
+    ) -> bool:
+        log.explain_topic(f"Decision: Should Download {fmt_path(path)}")
+
+        if self._transformer.transform(path) is None:
+            log.explain("Answer: No (ignored)")
+            return False
+
+        should_download = self._output_dir.should_try_download(
+            path,
+            etag_differs=etag_differs,
+            mtime=mtime,
+            redownload=redownload,
+            on_conflict=on_conflict
+        )
+        if should_download:
+            log.explain("Answer: Yes")
+            return True
+        else:
+            log.explain("Answer: No")
+            return False
+
     async def download(
         self,
         path: PurePath,
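
The new Crawler.should_try_download mirrors the decision phase of download without performing any I/O, so callers can pre-filter work before touching the network. A minimal, self-contained sketch of the idea; the Entry type, the known-files dict, and the decision rule below are illustrative stand-ins, not PFERD's actual heuristics (the real method also consults transformer rules and the configured redownload/conflict policies):

    from dataclasses import dataclass
    from datetime import datetime
    from pathlib import PurePath
    from typing import Optional

    @dataclass
    class Entry:  # hypothetical stand-in for IliasPageElement
        name: str
        mtime: Optional[datetime]

    def should_try_download(path: PurePath, mtime: Optional[datetime],
                            known: dict[PurePath, datetime]) -> bool:
        # Fetch if we have no local copy, no usable heuristic,
        # or the remote claims to be newer.
        if path not in known or mtime is None:
            return True
        return mtime > known[path]

    known = {PurePath("forum/Old thread.html"): datetime(2023, 1, 1)}
    entries = [Entry("Old thread", datetime(2022, 12, 24)),
               Entry("New thread", datetime(2023, 2, 2))]
    wanted = [e.name for e in entries
              if should_try_download(PurePath("forum") / (e.name + ".html"), e.mtime, known)]
    print(wanted)  # -> ['New thread']
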
@@ -723,20 +723,52 @@ instance's greatest bottleneck.
                 else:
                     break
 
-            download_data = cast(IliasPage, page).get_download_forum_data()
-            if not download_data:
-                raise CrawlWarning("Failed to extract forum data")
-            if download_data.empty:
+            forum_threads: list[tuple[IliasPageElement, bool]] = []
+            for entry in cast(IliasPage, page).get_forum_entries():
+                path = cl.path / (_sanitize_path_name(entry.name) + ".html")
+                forum_threads.append((entry, self.should_try_download(path, mtime=entry.mtime)))
+
+            # Sort the ids. The forum download will *preserve* this ordering.
+            forum_threads.sort(key=lambda elem: elem[0].id())
+
+            if not forum_threads:
                 log.explain("Forum had no threads")
                 return
-            html = await self._post_authenticated(download_data.url, download_data.form_data)
-            elements = parse_ilias_forum_export(soupify(html))
 
-            elements.sort(key=lambda elem: elem.title)
+            download_data = cast(IliasPage, page).get_download_forum_data(
+                [thread.id() for thread, download in forum_threads if download]
+            )
+            if not download_data:
+                raise CrawlWarning("Failed to extract forum data")
+
+            if not download_data.empty:
+                html = await self._post_authenticated(download_data.url, download_data.form_data)
+                elements = parse_ilias_forum_export(soupify(html))
+            else:
+                elements = []
+
+            # Verify that ILIAS does not change the order, as we depend on it later. Otherwise, we could not call
+            # download in the correct order, potentially messing up duplication handling.
+            expected_element_titles = [thread.name for thread, download in forum_threads if download]
+            actual_element_titles = [_sanitize_path_name(thread.name) for thread in elements]
+            if expected_element_titles != actual_element_titles:
+                raise CrawlWarning(
+                    f"Forum thread order mismatch: {expected_element_titles} != {actual_element_titles}"
+                )
 
             tasks: List[Awaitable[None]] = []
-            for elem in elements:
-                tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
+            for thread, download in forum_threads:
+                if download:
+                    # This only works because ILIAS keeps the order in the export
+                    elem = elements.pop(0)
+                    tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
+                else:
+                    # We only downloaded the threads we "should_try_download"ed. That can be an
+                    # over-approximation, which is fine.
+                    # We might also have selected too few, e.g. when a duplicate title exists and
+                    # the mtime of the original is newer than the update of the duplicate.
+                    # That causes stale data locally, but I consider it acceptable right now.
+                    tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread)))
 
             # And execute them
             await self.gather(tasks)
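
The pop(0) pairing above is only valid if the export returns threads in exactly the requested order, which is why the code sorts by id and then cross-checks the titles. A self-contained demo of that invariant, with plain tuples standing in for the real element types:

    # Threads discovered on the page, tagged with the download decision.
    forum_threads = [("thr_3", True), ("thr_1", False), ("thr_2", True)]
    forum_threads.sort(key=lambda elem: elem[0])  # sort by id, as the commit does

    # Stand-in for the parsed export: exactly the selected ids, assumed
    # to come back in the same order, here ["thr_2", "thr_3"].
    elements = [tid for tid, download in forum_threads if download]

    paired = []
    for thread_id, download in forum_threads:
        if download:
            elem = elements.pop(0)  # works only because order is preserved
            paired.append((thread_id, elem))
    print(paired)  # -> [('thr_2', 'thr_2'), ('thr_3', 'thr_3')]
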
@@ -746,17 +778,17 @@ instance's greatest bottleneck.
     async def _download_forum_thread(
         self,
         parent_path: PurePath,
-        element: IliasForumThread,
+        element: Union[IliasForumThread, IliasPageElement]
     ) -> None:
-        path = parent_path / (_sanitize_path_name(element.title) + ".html")
+        path = parent_path / (_sanitize_path_name(element.name) + ".html")
         maybe_dl = await self.download(path, mtime=element.mtime)
-        if not maybe_dl:
+        if not maybe_dl or not isinstance(element, IliasForumThread):
             return
 
         async with maybe_dl as (bar, sink):
             content = "<!DOCTYPE html>\n"
-            content += cast(str, element.title_tag.prettify())
-            content += cast(str, element.content_tag.prettify())
+            content += cast(str, element.name_tag.prettify())
+            content += cast(str, await self.internalize_images(element.content_tag.prettify()))
             sink.file.write(content.encode("utf-8"))
             sink.done()
 
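
With the parameter widened to Union[IliasForumThread, IliasPageElement], skipped threads still pass through the download bookkeeping, but only full threads (which carry HTML tags) get rendered. A sketch of the same isinstance dispatch, using hypothetical stand-in types:

    from dataclasses import dataclass
    from typing import Optional, Union

    @dataclass
    class ForumThread:   # hypothetical stand-in for IliasForumThread
        name: str
        content_html: str

    @dataclass
    class ThreadStub:    # hypothetical stand-in for IliasPageElement
        name: str

    def render(element: Union[ForumThread, ThreadStub]) -> Optional[str]:
        # Bookkeeping (path registration, mtime check) would happen here for
        # both variants; only a full thread carries content to write out.
        if not isinstance(element, ForumThread):
            return None
        return "<!DOCTYPE html>\n" + element.content_html

    print(render(ForumThread("t", "<p>hi</p>")))  # full export: HTML
    print(render(ThreadStub("t")))                # skipped thread: None
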
@@ -22,6 +22,7 @@ class IliasElementType(Enum):
     FILE = "file"
     FOLDER = "folder"
     FORUM = "forum"
+    FORUM_THREAD = "forum_thread"
     INFO_TAB = "info_tab"
     LEARNING_MODULE = "learning_module"
     LINK = "link"
@@ -54,6 +55,7 @@ class IliasPageElement:
         r"fold_(?P<id>\d+)",
         r"frm_(?P<id>\d+)",
         r"exc_(?P<id>\d+)",
+        r"thr_pk=(?P<id>\d+)",  # forums
         r"ref_id=(?P<id>\d+)",
         r"target=[a-z]+_(?P<id>\d+)",
         r"mm_(?P<id>\d+)"
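
The new thr_pk pattern pulls the thread id out of forum thread URLs; presumably it is listed before ref_id= so a thread link yields the thread id rather than the enclosing forum's ref id. A quick check with an illustrative URL:

    import re

    # Illustrative ILIAS thread URL, not a real instance.
    url = "https://ilias.example/ilias.php?ref_id=42&cmd=viewThread&thr_pk=1337"
    match = re.search(r"thr_pk=(?P<id>\d+)", url)
    assert match is not None
    print(match.group("id"))  # -> 1337
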
@@ -123,8 +125,8 @@ class IliasDownloadForumData:
 
 @dataclass
 class IliasForumThread:
-    title: str
-    title_tag: Tag
+    name: str
+    name_tag: Tag
     content_tag: Tag
     mtime: Optional[datetime]
 
@@ -242,7 +244,36 @@ class IliasPage:
             return url
         return None
 
-    def get_download_forum_data(self) -> Optional[IliasDownloadForumData]:
+    def get_forum_entries(self) -> list[IliasPageElement]:
+        form = self._get_forum_form()
+        if not form:
+            return []
+        threads = []
+
+        for row in cast(list[Tag], form.select("table > tbody > tr")):
+            url_tag = cast(
+                Optional[Tag],
+                row.find(name="a", attrs={"href": lambda x: x is not None and "cmd=viewthread" in x.lower()})
+            )
+            if url_tag is None:
+                log.explain(f"Skipping row without URL: {row}")
+                continue
+            name = url_tag.get_text().strip()
+            columns = [td.get_text().strip() for td in cast(list[Tag], row.find_all(name="td"))]
+            potential_dates_opt = [IliasPage._find_date_in_text(column) for column in columns]
+            potential_dates = [x for x in potential_dates_opt if x is not None]
+            mtime = max(potential_dates) if potential_dates else None
+
+            threads.append(IliasPageElement.create_new(
+                IliasElementType.FORUM_THREAD,
+                self._abs_url_from_link(url_tag),
+                name,
+                mtime=mtime
+            ))
+
+        return threads
+
+    def get_download_forum_data(self, thread_ids: list[str]) -> Optional[IliasDownloadForumData]:
         form = cast(Optional[Tag], self._soup.find(
             "form",
             attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x}
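
get_forum_entries walks the thread table inside the showThreads form, takes each row's viewthread link as the thread, and scans all columns for a date. The same BeautifulSoup pattern, applied to a minimal made-up forum table:

    from bs4 import BeautifulSoup

    html = """
    <form action="ilias.php?fallbackCmd=showThreads&client_id=x">
      <table><tbody>
        <tr>
          <td><a href="ilias.php?cmd=viewThread&thr_pk=7">My thread</a></td>
          <td>Gestern, 13:37</td>
        </tr>
        <tr><td>header or pagination row without a link</td></tr>
      </tbody></table>
    </form>
    """
    soup = BeautifulSoup(html, "html.parser")
    form = soup.find("form", attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x})
    assert form is not None
    for row in form.select("table > tbody > tr"):
        link = row.find("a", attrs={"href": lambda x: x is not None and "cmd=viewthread" in x.lower()})
        if link is None:
            continue  # mirrors the crawler's "skip row without URL"
        print(link.get_text().strip(), "->", link["href"])
    # -> My thread -> ilias.php?cmd=viewThread&thr_pk=7
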
@@ -251,7 +282,7 @@ class IliasPage:
             return None
         post_url = self._abs_url_from_relative(cast(str, form["action"]))
 
-        thread_ids = [f["value"] for f in cast(list[Tag], form.find_all(attrs={"name": "thread_ids[]"}))]
+        log.explain(f"Fetching forum threads {thread_ids}")
 
         form_data: Dict[str, Union[str, list[str]]] = {
             "thread_ids[]": cast(list[str], thread_ids),
@@ -262,6 +293,12 @@ class IliasPage:
 
         return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0)
 
+    def _get_forum_form(self) -> Optional[Tag]:
+        return cast(Optional[Tag], self._soup.find(
+            "form",
+            attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x}
+        ))
+
     def get_next_stage_element(self) -> Optional[IliasPageElement]:
         if self._is_forum_page():
             if "trows=" in self._page_url:
@@ -950,16 +987,9 @@ class IliasPage:
         # The rest does not have a stable order. Grab the whole text and reg-ex the date
         # out of it
         all_properties_text = properties_parent.get_text().strip()
-        modification_date_match = re.search(
-            r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)",
-            all_properties_text
-        )
-        if modification_date_match is None:
-            modification_date = None
+        modification_date = IliasPage._find_date_in_text(all_properties_text)
+        if modification_date is None:
             log.explain(f"Element {name} at {url} has no date.")
-        else:
-            modification_date_str = modification_date_match.group(1)
-            modification_date = demangle_date(modification_date_str)
 
         # Grab the name from the link text
         full_path = name + "." + file_type
@@ -1243,6 +1273,17 @@ class IliasPage:
             return True
         return False
 
+    @staticmethod
+    def _find_date_in_text(text: str) -> Optional[datetime]:
+        modification_date_match = re.search(
+            r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)",
+            text
+        )
+        if modification_date_match is not None:
+            modification_date_str = modification_date_match.group(1)
+            return demangle_date(modification_date_str)
+        return None
+
     def get_permalink(self) -> Optional[str]:
         return IliasPage.get_soup_permalink(self._soup)
 
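
_find_date_in_text wraps the German/English date regex that was previously inlined. What the pattern recognizes, shown with re directly (demangle_date is PFERD-internal and omitted here):

    import re

    pattern = r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)"
    for text in ["Erstellt: Gestern, 13:37", "Last post: 12. Mar 2023, 14:30", "no date here"]:
        match = re.search(pattern, text)
        print(match.group(1) if match else None)
    # -> Gestern, 13:37 / 12. Mar 2023, 14:30 / None
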
@@ -371,6 +371,22 @@ class OutputDirectory:
 
             raise OutputDirError("Failed to create temporary file")
 
+    def should_try_download(
+        self,
+        path: PurePath,
+        *,
+        etag_differs: Optional[bool] = None,
+        mtime: Optional[datetime] = None,
+        redownload: Optional[Redownload] = None,
+        on_conflict: Optional[OnConflict] = None,
+    ) -> bool:
+        heuristics = Heuristics(etag_differs, mtime)
+        redownload = self._redownload if redownload is None else redownload
+        on_conflict = self._on_conflict if on_conflict is None else on_conflict
+        local_path = self.resolve(path)
+
+        return self._should_download(local_path, heuristics, redownload, on_conflict)
+
     async def download(
         self,
         remote_path: PurePath,
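
OutputDirectory.should_try_download reuses the same defaults-fallback and decision path as download, minus the file creation. A self-contained model of that structure; the class is a stand-in and the Redownload semantics below are simplified, not PFERD's exact rules:

    from datetime import datetime
    from enum import Enum
    from typing import Optional

    class Redownload(Enum):
        NEVER = "never"
        NEVER_SMART = "never-smart"
        ALWAYS = "always"
        ALWAYS_SMART = "always-smart"

    class OutputDir:  # stand-in for OutputDirectory
        def __init__(self, redownload: Redownload) -> None:
            self._redownload = redownload

        def should_try_download(self, local_mtime: Optional[datetime],
                                mtime: Optional[datetime],
                                redownload: Optional[Redownload] = None) -> bool:
            # Explicit argument wins, otherwise fall back to the configured default.
            redownload = self._redownload if redownload is None else redownload
            if local_mtime is None:
                return True  # no local file yet
            if redownload == Redownload.NEVER:
                return False
            if redownload == Redownload.ALWAYS:
                return True
            # "Smart" modes consult the mtime heuristic.
            if mtime is None:
                return redownload == Redownload.ALWAYS_SMART
            return mtime > local_mtime

    out = OutputDir(Redownload.NEVER_SMART)
    print(out.should_try_download(datetime(2023, 1, 1), datetime(2023, 2, 2)))  # -> True
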