From 3f5637366e3c33af864663e559f4051ccfb5eb16 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 30 Oct 2025 20:19:30 +0100 Subject: [PATCH] Sanitize `/` in kit-ipd heading hierarchy --- CHANGELOG.md | 1 + PFERD/crawl/http_crawler.py | 4 +-- PFERD/crawl/ilias/ilias_web_crawler.py | 13 ++++----- PFERD/crawl/ilias/kit_ilias_html.py | 38 ++++++++++++-------------- PFERD/crawl/kit_ipd_crawler.py | 6 ++-- PFERD/utils.py | 4 +++ 6 files changed, 33 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 729299e..e80f345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ ambiguous situations. ## Fixed - Event loop errors on Windows with Python 3.14 +- Sanitize `/` in headings in kit-ipd crawler ## 3.8.3 - 2025-07-01 diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index 70ec5c1..49d6013 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -13,7 +13,7 @@ from bs4 import Tag from ..auth import Authenticator from ..config import Config from ..logging import log -from ..utils import fmt_real_path +from ..utils import fmt_real_path, sanitize_path_name from ..version import NAME, VERSION from .crawler import Crawler, CrawlerSection @@ -192,7 +192,7 @@ class HttpCrawler(Crawler): if level_heading is None: return find_associated_headings(tag, level - 1) - folder_name = level_heading.get_text().strip() + folder_name = sanitize_path_name(level_heading.get_text().strip()) return find_associated_headings(level_heading, level - 1) / folder_name # start at level

because paragraph-level headings are usually too granular for folder names diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index 12d8700..fda9f6d 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -15,7 +15,7 @@ from ...auth import Authenticator from ...config import Config from ...logging import ProgressBar, log from ...output_dir import FileSink, Redownload -from ...utils import fmt_path, soupify, url_set_query_param +from ...utils import fmt_path, sanitize_path_name, soupify, url_set_query_param from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection from .async_helper import _iorepeat @@ -28,7 +28,6 @@ from .kit_ilias_html import ( IliasPage, IliasPageElement, IliasSoup, - _sanitize_path_name, parse_ilias_forum_export, ) from .shibboleth_login import ShibbolethLogin @@ -505,7 +504,7 @@ instance's greatest bottleneck. async def download_all() -> None: for link in links: - path = cl.path / (_sanitize_path_name(link.name) + extension) + path = cl.path / (sanitize_path_name(link.name) + extension) if dl := await self.download(path, mtime=element.mtime): await self._download_link(self._links, element.name, [link], dl) @@ -843,7 +842,7 @@ instance's greatest bottleneck. async def _download_forum_thread( self, parent_path: PurePath, thread: IliasForumThread | IliasPageElement, forum_url: str ) -> None: - path = parent_path / (_sanitize_path_name(thread.name) + ".html") + path = parent_path / (sanitize_path_name(thread.name) + ".html") maybe_dl = await self.download(path, mtime=thread.mtime) if not maybe_dl or not isinstance(thread, IliasForumThread): return @@ -936,7 +935,7 @@ instance's greatest bottleneck. prev: Optional[str], next: Optional[str], ) -> None: - path = parent_path / (_sanitize_path_name(element.title) + ".html") + path = parent_path / (sanitize_path_name(element.title) + ".html") maybe_dl = await self.download(path) if not maybe_dl: return @@ -945,10 +944,10 @@ instance's greatest bottleneck. return if prev: - prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html")) + prev_p = self._transformer.transform(parent_path / (sanitize_path_name(prev) + ".html")) prev = os.path.relpath(prev_p, my_path.parent) if prev_p else None if next: - next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html")) + next_p = self._transformer.transform(parent_path / (sanitize_path_name(next) + ".html")) next = os.path.relpath(next_p, my_path.parent) if next_p else None async with maybe_dl as (bar, sink): diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index db965b0..e23469c 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag from PFERD.crawl import CrawlError from PFERD.crawl.crawler import CrawlWarning from PFERD.logging import log -from PFERD.utils import url_set_query_params +from PFERD.utils import sanitize_path_name, url_set_query_params TargetType = str | int @@ -297,7 +297,7 @@ class IliasPageElement: name = normalized if not skip_sanitize: - name = _sanitize_path_name(name) + name = sanitize_path_name(name) return IliasPageElement(typ, url, name, mtime, description) @@ -695,7 +695,7 @@ class IliasPage: log.explain(f"Skipping offline item: {title.get_text().strip()!r}") continue - name = _sanitize_path_name(link.text.strip()) + name = sanitize_path_name(link.text.strip()) url = self._abs_url_from_link(link) if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url: @@ -723,7 +723,7 @@ class IliasPage: for link in links: url = self._abs_url_from_link(link) name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.get_text()).strip().replace("\t", "") - name = _sanitize_path_name(name) + name = sanitize_path_name(name) if "file_id" not in url: _unexpected_html_warning() @@ -745,7 +745,7 @@ class IliasPage: continue items.append( IliasPageElement.create_new( - IliasElementType.FILE, self._abs_url_from_link(link), _sanitize_path_name(link.get_text()) + IliasElementType.FILE, self._abs_url_from_link(link), sanitize_path_name(link.get_text()) ) ) @@ -837,7 +837,7 @@ class IliasPage: title = cast(Tag, row.select_one("td.std:nth-child(3)")).get_text().strip() title += ".mp4" - video_name: str = _sanitize_path_name(title) + video_name: str = sanitize_path_name(title) video_url = self._abs_url_from_link(link) @@ -893,7 +893,7 @@ class IliasPage: _unexpected_html_warning() continue - name = _sanitize_path_name(name_tag.get_text().strip()) + name = sanitize_path_name(name_tag.get_text().strip()) log.explain(f"Found exercise detail entry {name!r}") results.append( @@ -920,7 +920,7 @@ class IliasPage: parent_row: Tag = cast(Tag, link.find_parent("tr")) children = cast(list[Tag], parent_row.find_all("td")) - name = _sanitize_path_name(children[1].get_text().strip()) + name = sanitize_path_name(children[1].get_text().strip()) log.explain(f"Found exercise file entry {name!r}") date = None @@ -957,7 +957,7 @@ class IliasPage: if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower(): continue - name = _sanitize_path_name(exercise.get_text().strip()) + name = sanitize_path_name(exercise.get_text().strip()) results.append( IliasPageElement.create_new( IliasElementType.EXERCISE, self._abs_url_from_link(exercise), name @@ -983,12 +983,12 @@ class IliasPage: for link in links: abs_url = self._abs_url_from_link(link) # Make sure parents are sanitized. We do not want accidental parents - parents = [_sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)] + parents = [sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)] if parents: - element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text()) + element_name = "/".join(parents) + "/" + sanitize_path_name(link.get_text()) else: - element_name = _sanitize_path_name(link.get_text()) + element_name = sanitize_path_name(link.get_text()) element_type = IliasPage._find_type_for_element( element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(link) @@ -1053,7 +1053,7 @@ class IliasPage: IliasPageElement.create_new( typ=IliasElementType.MEDIACAST_VIDEO, url=self._abs_url_from_relative(cast(str, url)), - name=_sanitize_path_name(title), + name=sanitize_path_name(title), ) ) @@ -1081,7 +1081,7 @@ class IliasPage: videos.append( IliasPageElement.create_new( - typ=IliasElementType.MOB_VIDEO, url=url, name=_sanitize_path_name(title), mtime=None + typ=IliasElementType.MOB_VIDEO, url=url, name=sanitize_path_name(title), mtime=None ) ) @@ -1192,7 +1192,7 @@ class IliasPage: ) found_titles.append(head_tag.get_text().strip()) - return [_sanitize_path_name(x) for x in reversed(found_titles)] + return [sanitize_path_name(x) for x in reversed(found_titles)] @staticmethod def _find_link_description(link: Tag) -> Optional[str]: @@ -1247,7 +1247,7 @@ class IliasPage: for title in card_titles: url = self._abs_url_from_link(title) - name = _sanitize_path_name(title.get_text().strip()) + name = sanitize_path_name(title.get_text().strip()) typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(title)) if not typ: @@ -1274,7 +1274,7 @@ class IliasPage: log.warn_contd(f"Could not find click handler target for signal {signal} for {button}") continue url = self._abs_url_from_relative(open_match.group(1)) - name = _sanitize_path_name(button.get_text().strip()) + name = sanitize_path_name(button.get_text().strip()) typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(button)) caption_parent = cast( Tag, @@ -1532,10 +1532,6 @@ def _tomorrow() -> date: return date.today() + timedelta(days=1) -def _sanitize_path_name(name: str) -> str: - return name.replace("/", "-").replace("\\", "-").strip() - - def parse_ilias_forum_export(forum_export: BeautifulSoup) -> list[IliasForumThread]: elements = [] for p in forum_export.select("body > p"): diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 4dad8f0..7094b9c 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -15,7 +15,7 @@ from ..auth import Authenticator from ..config import Config from ..logging import ProgressBar, log from ..output_dir import FileSink -from ..utils import soupify +from ..utils import sanitize_path_name, soupify from .crawler import CrawlError from .http_crawler import HttpCrawler, HttpCrawlerSection @@ -106,7 +106,7 @@ class KitIpdCrawler(HttpCrawler): await self.gather(tasks) async def _crawl_folder(self, parent: PurePath, folder: KitIpdFolder) -> None: - path = parent / folder.name + path = parent / sanitize_path_name(folder.name) if not await self.crawl(path): return @@ -125,7 +125,7 @@ class KitIpdCrawler(HttpCrawler): async def _download_file( self, parent: PurePath, file: KitIpdFile, etag: Optional[str], mtime: Optional[datetime] ) -> None: - element_path = parent / file.name + element_path = parent / sanitize_path_name(file.name) prev_etag = self._get_previous_etag_from_report(element_path) etag_differs = None if prev_etag is None else prev_etag != etag diff --git a/PFERD/utils.py b/PFERD/utils.py index 918a9b6..1aa0585 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -106,6 +106,10 @@ def fmt_real_path(path: Path) -> str: return repr(str(path.absolute())) +def sanitize_path_name(name: str) -> str: + return name.replace("/", "-").replace("\\", "-").strip() + + class ReusableAsyncContextManager(ABC, Generic[T]): def __init__(self) -> None: self._active = False