From 3f5637366e3c33af864663e559f4051ccfb5eb16 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen <i-al-istannen@users.noreply.github.com>
Date: Thu, 30 Oct 2025 20:19:30 +0100
Subject: [PATCH] Sanitize `/` in kit-ipd heading hierarchy

---
 CHANGELOG.md                           |  1 +
 PFERD/crawl/http_crawler.py            |  4 +--
 PFERD/crawl/ilias/ilias_web_crawler.py | 13 ++++-----
 PFERD/crawl/ilias/kit_ilias_html.py    | 38 ++++++++++++--------------
 PFERD/crawl/kit_ipd_crawler.py         |  6 ++--
 PFERD/utils.py                         |  4 +++
 6 files changed, 33 insertions(+), 33 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 729299e..e80f345 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,7 @@ ambiguous situations.
 
 ## Fixed
 - Event loop errors on Windows with Python 3.14
+- Sanitize `/` in headings in kit-ipd crawler
 
 ## 3.8.3 - 2025-07-01
 
diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py
index 70ec5c1..49d6013 100644
--- a/PFERD/crawl/http_crawler.py
+++ b/PFERD/crawl/http_crawler.py
@@ -13,7 +13,7 @@ from bs4 import Tag
 from ..auth import Authenticator
 from ..config import Config
 from ..logging import log
-from ..utils import fmt_real_path
+from ..utils import fmt_real_path, sanitize_path_name
 from ..version import NAME, VERSION
 from .crawler import Crawler, CrawlerSection
 
@@ -192,7 +192,7 @@ class HttpCrawler(Crawler):
             if level_heading is None:
                 return find_associated_headings(tag, level - 1)
 
-            folder_name = level_heading.get_text().strip()
+            folder_name = sanitize_path_name(level_heading.get_text().strip())
             return find_associated_headings(level_heading, level - 1) / folder_name
 
         # start at level <h3> because paragraph-level headings are usually too granular for folder names
diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py
index 12d8700..fda9f6d 100644
--- a/PFERD/crawl/ilias/ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/ilias_web_crawler.py
@@ -15,7 +15,7 @@ from ...auth import Authenticator
 from ...config import Config
 from ...logging import ProgressBar, log
 from ...output_dir import FileSink, Redownload
-from ...utils import fmt_path, soupify, url_set_query_param
+from ...utils import fmt_path, sanitize_path_name, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .async_helper import _iorepeat
@@ -28,7 +28,6 @@ from .kit_ilias_html import (
     IliasPage,
     IliasPageElement,
     IliasSoup,
-    _sanitize_path_name,
     parse_ilias_forum_export,
 )
 from .shibboleth_login import ShibbolethLogin
@@ -505,7 +504,7 @@ instance's greatest bottleneck.
 
         async def download_all() -> None:
             for link in links:
-                path = cl.path / (_sanitize_path_name(link.name) + extension)
+                path = cl.path / (sanitize_path_name(link.name) + extension)
                 if dl := await self.download(path, mtime=element.mtime):
                     await self._download_link(self._links, element.name, [link], dl)
 
@@ -843,7 +842,7 @@ instance's greatest bottleneck.
     async def _download_forum_thread(
         self, parent_path: PurePath, thread: IliasForumThread | IliasPageElement, forum_url: str
     ) -> None:
-        path = parent_path / (_sanitize_path_name(thread.name) + ".html")
+        path = parent_path / (sanitize_path_name(thread.name) + ".html")
         maybe_dl = await self.download(path, mtime=thread.mtime)
         if not maybe_dl or not isinstance(thread, IliasForumThread):
             return
@@ -936,7 +935,7 @@ instance's greatest bottleneck.
         prev: Optional[str],
         next: Optional[str],
     ) -> None:
-        path = parent_path / (_sanitize_path_name(element.title) + ".html")
+        path = parent_path / (sanitize_path_name(element.title) + ".html")
         maybe_dl = await self.download(path)
         if not maybe_dl:
             return
@@ -945,10 +944,10 @@ instance's greatest bottleneck.
             return
 
         if prev:
-            prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
+            prev_p = self._transformer.transform(parent_path / (sanitize_path_name(prev) + ".html"))
             prev = os.path.relpath(prev_p, my_path.parent) if prev_p else None
         if next:
-            next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
+            next_p = self._transformer.transform(parent_path / (sanitize_path_name(next) + ".html"))
             next = os.path.relpath(next_p, my_path.parent) if next_p else None
 
         async with maybe_dl as (bar, sink):
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index db965b0..e23469c 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from PFERD.crawl import CrawlError
 from PFERD.crawl.crawler import CrawlWarning
 from PFERD.logging import log
-from PFERD.utils import url_set_query_params
+from PFERD.utils import sanitize_path_name, url_set_query_params
 
 TargetType = str | int
 
@@ -297,7 +297,7 @@ class IliasPageElement:
             name = normalized
 
         if not skip_sanitize:
-            name = _sanitize_path_name(name)
+            name = sanitize_path_name(name)
 
         return IliasPageElement(typ, url, name, mtime, description)
 
@@ -695,7 +695,7 @@ class IliasPage:
                 log.explain(f"Skipping offline item: {title.get_text().strip()!r}")
                 continue
 
-            name = _sanitize_path_name(link.text.strip())
+            name = sanitize_path_name(link.text.strip())
             url = self._abs_url_from_link(link)
 
             if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
@@ -723,7 +723,7 @@ class IliasPage:
         for link in links:
             url = self._abs_url_from_link(link)
             name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.get_text()).strip().replace("\t", "")
-            name = _sanitize_path_name(name)
+            name = sanitize_path_name(name)
 
             if "file_id" not in url:
                 _unexpected_html_warning()
@@ -745,7 +745,7 @@ class IliasPage:
                 continue
             items.append(
                 IliasPageElement.create_new(
-                    IliasElementType.FILE, self._abs_url_from_link(link), _sanitize_path_name(link.get_text())
+                    IliasElementType.FILE, self._abs_url_from_link(link), sanitize_path_name(link.get_text())
                 )
             )
 
@@ -837,7 +837,7 @@ class IliasPage:
         title = cast(Tag, row.select_one("td.std:nth-child(3)")).get_text().strip()
         title += ".mp4"
 
-        video_name: str = _sanitize_path_name(title)
+        video_name: str = sanitize_path_name(title)
 
         video_url = self._abs_url_from_link(link)
 
@@ -893,7 +893,7 @@ class IliasPage:
                 _unexpected_html_warning()
                 continue
 
-            name = _sanitize_path_name(name_tag.get_text().strip())
+            name = sanitize_path_name(name_tag.get_text().strip())
             log.explain(f"Found exercise detail entry {name!r}")
 
             results.append(
@@ -920,7 +920,7 @@ class IliasPage:
             parent_row: Tag = cast(Tag, link.find_parent("tr"))
             children = cast(list[Tag], parent_row.find_all("td"))
 
-            name = _sanitize_path_name(children[1].get_text().strip())
+            name = sanitize_path_name(children[1].get_text().strip())
             log.explain(f"Found exercise file entry {name!r}")
 
             date = None
@@ -957,7 +957,7 @@ class IliasPage:
             if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower():
                 continue
 
-            name = _sanitize_path_name(exercise.get_text().strip())
+            name = sanitize_path_name(exercise.get_text().strip())
             results.append(
                 IliasPageElement.create_new(
                     IliasElementType.EXERCISE, self._abs_url_from_link(exercise), name
@@ -983,12 +983,12 @@ class IliasPage:
         for link in links:
             abs_url = self._abs_url_from_link(link)
             # Make sure parents are sanitized. We do not want accidental parents
-            parents = [_sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
+            parents = [sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
 
             if parents:
-                element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text())
+                element_name = "/".join(parents) + "/" + sanitize_path_name(link.get_text())
             else:
-                element_name = _sanitize_path_name(link.get_text())
+                element_name = sanitize_path_name(link.get_text())
 
             element_type = IliasPage._find_type_for_element(
                 element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(link)
@@ -1053,7 +1053,7 @@ class IliasPage:
                         IliasPageElement.create_new(
                             typ=IliasElementType.MEDIACAST_VIDEO,
                             url=self._abs_url_from_relative(cast(str, url)),
-                            name=_sanitize_path_name(title),
+                            name=sanitize_path_name(title),
                         )
                     )
 
@@ -1081,7 +1081,7 @@ class IliasPage:
 
             videos.append(
                 IliasPageElement.create_new(
-                    typ=IliasElementType.MOB_VIDEO, url=url, name=_sanitize_path_name(title), mtime=None
+                    typ=IliasElementType.MOB_VIDEO, url=url, name=sanitize_path_name(title), mtime=None
                 )
             )
 
@@ -1192,7 +1192,7 @@ class IliasPage:
             )
             found_titles.append(head_tag.get_text().strip())
 
-        return [_sanitize_path_name(x) for x in reversed(found_titles)]
+        return [sanitize_path_name(x) for x in reversed(found_titles)]
 
     @staticmethod
     def _find_link_description(link: Tag) -> Optional[str]:
@@ -1247,7 +1247,7 @@ class IliasPage:
 
         for title in card_titles:
             url = self._abs_url_from_link(title)
-            name = _sanitize_path_name(title.get_text().strip())
+            name = sanitize_path_name(title.get_text().strip())
             typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(title))
 
             if not typ:
@@ -1274,7 +1274,7 @@ class IliasPage:
                 log.warn_contd(f"Could not find click handler target for signal {signal} for {button}")
                 continue
             url = self._abs_url_from_relative(open_match.group(1))
-            name = _sanitize_path_name(button.get_text().strip())
+            name = sanitize_path_name(button.get_text().strip())
             typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(button))
             caption_parent = cast(
                 Tag,
@@ -1532,10 +1532,6 @@ def _tomorrow() -> date:
     return date.today() + timedelta(days=1)
 
 
-def _sanitize_path_name(name: str) -> str:
-    return name.replace("/", "-").replace("\\", "-").strip()
-
-
 def parse_ilias_forum_export(forum_export: BeautifulSoup) -> list[IliasForumThread]:
     elements = []
     for p in forum_export.select("body > p"):
diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
index 4dad8f0..7094b9c 100644
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -15,7 +15,7 @@ from ..auth import Authenticator
 from ..config import Config
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink
-from ..utils import soupify
+from ..utils import sanitize_path_name, soupify
 from .crawler import CrawlError
 from .http_crawler import HttpCrawler, HttpCrawlerSection
 
@@ -106,7 +106,7 @@ class KitIpdCrawler(HttpCrawler):
         await self.gather(tasks)
 
     async def _crawl_folder(self, parent: PurePath, folder: KitIpdFolder) -> None:
-        path = parent / folder.name
+        path = parent / sanitize_path_name(folder.name)
         if not await self.crawl(path):
             return
 
@@ -125,7 +125,7 @@ class KitIpdCrawler(HttpCrawler):
     async def _download_file(
         self, parent: PurePath, file: KitIpdFile, etag: Optional[str], mtime: Optional[datetime]
     ) -> None:
-        element_path = parent / file.name
+        element_path = parent / sanitize_path_name(file.name)
 
         prev_etag = self._get_previous_etag_from_report(element_path)
         etag_differs = None if prev_etag is None else prev_etag != etag
diff --git a/PFERD/utils.py b/PFERD/utils.py
index 918a9b6..1aa0585 100644
--- a/PFERD/utils.py
+++ b/PFERD/utils.py
@@ -106,6 +106,10 @@ def fmt_real_path(path: Path) -> str:
     return repr(str(path.absolute()))
 
 
+def sanitize_path_name(name: str) -> str:  # "/" and "\" -> "-", then strip surrounding whitespace
+    return name.replace("/", "-").replace("\\", "-").strip()
+
+
 class ReusableAsyncContextManager(ABC, Generic[T]):
     def __init__(self) -> None:
         self._active = False