Sanitize / in kit-ipd heading hierarchy

This commit is contained in:
I-Al-Istannen
2025-10-30 20:19:30 +01:00
parent 3453bbc991
commit 3f5637366e
6 changed files with 33 additions and 33 deletions

View File

@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
from PFERD.crawl import CrawlError
from PFERD.crawl.crawler import CrawlWarning
from PFERD.logging import log
from PFERD.utils import url_set_query_params
from PFERD.utils import sanitize_path_name, url_set_query_params
TargetType = str | int
@@ -297,7 +297,7 @@ class IliasPageElement:
name = normalized
if not skip_sanitize:
name = _sanitize_path_name(name)
name = sanitize_path_name(name)
return IliasPageElement(typ, url, name, mtime, description)
@@ -695,7 +695,7 @@ class IliasPage:
log.explain(f"Skipping offline item: {title.get_text().strip()!r}")
continue
name = _sanitize_path_name(link.text.strip())
name = sanitize_path_name(link.text.strip())
url = self._abs_url_from_link(link)
if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
@@ -723,7 +723,7 @@ class IliasPage:
for link in links:
url = self._abs_url_from_link(link)
name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.get_text()).strip().replace("\t", "")
name = _sanitize_path_name(name)
name = sanitize_path_name(name)
if "file_id" not in url:
_unexpected_html_warning()
@@ -745,7 +745,7 @@ class IliasPage:
continue
items.append(
IliasPageElement.create_new(
IliasElementType.FILE, self._abs_url_from_link(link), _sanitize_path_name(link.get_text())
IliasElementType.FILE, self._abs_url_from_link(link), sanitize_path_name(link.get_text())
)
)
@@ -837,7 +837,7 @@ class IliasPage:
title = cast(Tag, row.select_one("td.std:nth-child(3)")).get_text().strip()
title += ".mp4"
video_name: str = _sanitize_path_name(title)
video_name: str = sanitize_path_name(title)
video_url = self._abs_url_from_link(link)
@@ -893,7 +893,7 @@ class IliasPage:
_unexpected_html_warning()
continue
name = _sanitize_path_name(name_tag.get_text().strip())
name = sanitize_path_name(name_tag.get_text().strip())
log.explain(f"Found exercise detail entry {name!r}")
results.append(
@@ -920,7 +920,7 @@ class IliasPage:
parent_row: Tag = cast(Tag, link.find_parent("tr"))
children = cast(list[Tag], parent_row.find_all("td"))
name = _sanitize_path_name(children[1].get_text().strip())
name = sanitize_path_name(children[1].get_text().strip())
log.explain(f"Found exercise file entry {name!r}")
date = None
@@ -957,7 +957,7 @@ class IliasPage:
if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower():
continue
name = _sanitize_path_name(exercise.get_text().strip())
name = sanitize_path_name(exercise.get_text().strip())
results.append(
IliasPageElement.create_new(
IliasElementType.EXERCISE, self._abs_url_from_link(exercise), name
@@ -983,12 +983,12 @@ class IliasPage:
for link in links:
abs_url = self._abs_url_from_link(link)
# Sanitize every parent name: the names are joined with "/" below, so a stray "/" inside one would create an accidental extra parent folder
parents = [_sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
parents = [sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
if parents:
element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text())
element_name = "/".join(parents) + "/" + sanitize_path_name(link.get_text())
else:
element_name = _sanitize_path_name(link.get_text())
element_name = sanitize_path_name(link.get_text())
element_type = IliasPage._find_type_for_element(
element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(link)
@@ -1053,7 +1053,7 @@ class IliasPage:
IliasPageElement.create_new(
typ=IliasElementType.MEDIACAST_VIDEO,
url=self._abs_url_from_relative(cast(str, url)),
name=_sanitize_path_name(title),
name=sanitize_path_name(title),
)
)
@@ -1081,7 +1081,7 @@ class IliasPage:
videos.append(
IliasPageElement.create_new(
typ=IliasElementType.MOB_VIDEO, url=url, name=_sanitize_path_name(title), mtime=None
typ=IliasElementType.MOB_VIDEO, url=url, name=sanitize_path_name(title), mtime=None
)
)
@@ -1192,7 +1192,7 @@ class IliasPage:
)
found_titles.append(head_tag.get_text().strip())
return [_sanitize_path_name(x) for x in reversed(found_titles)]
return [sanitize_path_name(x) for x in reversed(found_titles)]
@staticmethod
def _find_link_description(link: Tag) -> Optional[str]:
@@ -1247,7 +1247,7 @@ class IliasPage:
for title in card_titles:
url = self._abs_url_from_link(title)
name = _sanitize_path_name(title.get_text().strip())
name = sanitize_path_name(title.get_text().strip())
typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(title))
if not typ:
@@ -1274,7 +1274,7 @@ class IliasPage:
log.warn_contd(f"Could not find click handler target for signal {signal} for {button}")
continue
url = self._abs_url_from_relative(open_match.group(1))
name = _sanitize_path_name(button.get_text().strip())
name = sanitize_path_name(button.get_text().strip())
typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(button))
caption_parent = cast(
Tag,
@@ -1532,10 +1532,6 @@ def _tomorrow() -> date:
return date.today() + timedelta(days=1)
def _sanitize_path_name(name: str) -> str:
return name.replace("/", "-").replace("\\", "-").strip()
def parse_ilias_forum_export(forum_export: BeautifulSoup) -> list[IliasForumThread]:
elements = []
for p in forum_export.select("body > p"):