diff --git a/CHANGELOG.md b/CHANGELOG.md
index 729299e..e80f345 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,7 @@ ambiguous situations.
## Fixed
- Event loop errors on Windows with Python 3.14
+- Sanitize `/` in headings of the kit-ipd crawler
## 3.8.3 - 2025-07-01
diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py
index 70ec5c1..49d6013 100644
--- a/PFERD/crawl/http_crawler.py
+++ b/PFERD/crawl/http_crawler.py
@@ -13,7 +13,7 @@ from bs4 import Tag
from ..auth import Authenticator
from ..config import Config
from ..logging import log
-from ..utils import fmt_real_path
+from ..utils import fmt_real_path, sanitize_path_name
from ..version import NAME, VERSION
from .crawler import Crawler, CrawlerSection
@@ -192,7 +192,7 @@ class HttpCrawler(Crawler):
if level_heading is None:
return find_associated_headings(tag, level - 1)
- folder_name = level_heading.get_text().strip()
+ folder_name = sanitize_path_name(level_heading.get_text().strip())
return find_associated_headings(level_heading, level - 1) / folder_name
# start at level <h3> because paragraph-level headings are usually too granular for folder names
diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py
index 12d8700..fda9f6d 100644
--- a/PFERD/crawl/ilias/ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/ilias_web_crawler.py
@@ -15,7 +15,7 @@ from ...auth import Authenticator
from ...config import Config
from ...logging import ProgressBar, log
from ...output_dir import FileSink, Redownload
-from ...utils import fmt_path, soupify, url_set_query_param
+from ...utils import fmt_path, sanitize_path_name, soupify, url_set_query_param
from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection
from .async_helper import _iorepeat
@@ -28,7 +28,6 @@ from .kit_ilias_html import (
IliasPage,
IliasPageElement,
IliasSoup,
- _sanitize_path_name,
parse_ilias_forum_export,
)
from .shibboleth_login import ShibbolethLogin
@@ -505,7 +504,7 @@ instance's greatest bottleneck.
async def download_all() -> None:
for link in links:
- path = cl.path / (_sanitize_path_name(link.name) + extension)
+ path = cl.path / (sanitize_path_name(link.name) + extension)
if dl := await self.download(path, mtime=element.mtime):
await self._download_link(self._links, element.name, [link], dl)
@@ -843,7 +842,7 @@ instance's greatest bottleneck.
async def _download_forum_thread(
self, parent_path: PurePath, thread: IliasForumThread | IliasPageElement, forum_url: str
) -> None:
- path = parent_path / (_sanitize_path_name(thread.name) + ".html")
+ path = parent_path / (sanitize_path_name(thread.name) + ".html")
maybe_dl = await self.download(path, mtime=thread.mtime)
if not maybe_dl or not isinstance(thread, IliasForumThread):
return
@@ -936,7 +935,7 @@ instance's greatest bottleneck.
prev: Optional[str],
next: Optional[str],
) -> None:
- path = parent_path / (_sanitize_path_name(element.title) + ".html")
+ path = parent_path / (sanitize_path_name(element.title) + ".html")
maybe_dl = await self.download(path)
if not maybe_dl:
return
@@ -945,10 +944,10 @@ instance's greatest bottleneck.
return
if prev:
- prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
+ prev_p = self._transformer.transform(parent_path / (sanitize_path_name(prev) + ".html"))
prev = os.path.relpath(prev_p, my_path.parent) if prev_p else None
if next:
- next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
+ next_p = self._transformer.transform(parent_path / (sanitize_path_name(next) + ".html"))
next = os.path.relpath(next_p, my_path.parent) if next_p else None
async with maybe_dl as (bar, sink):
diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index db965b0..e23469c 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
from PFERD.crawl import CrawlError
from PFERD.crawl.crawler import CrawlWarning
from PFERD.logging import log
-from PFERD.utils import url_set_query_params
+from PFERD.utils import sanitize_path_name, url_set_query_params
TargetType = str | int
@@ -297,7 +297,7 @@ class IliasPageElement:
name = normalized
if not skip_sanitize:
- name = _sanitize_path_name(name)
+ name = sanitize_path_name(name)
return IliasPageElement(typ, url, name, mtime, description)
@@ -695,7 +695,7 @@ class IliasPage:
log.explain(f"Skipping offline item: {title.get_text().strip()!r}")
continue
- name = _sanitize_path_name(link.text.strip())
+ name = sanitize_path_name(link.text.strip())
url = self._abs_url_from_link(link)
if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
@@ -723,7 +723,7 @@ class IliasPage:
for link in links:
url = self._abs_url_from_link(link)
name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.get_text()).strip().replace("\t", "")
- name = _sanitize_path_name(name)
+ name = sanitize_path_name(name)
if "file_id" not in url:
_unexpected_html_warning()
@@ -745,7 +745,7 @@ class IliasPage:
continue
items.append(
IliasPageElement.create_new(
- IliasElementType.FILE, self._abs_url_from_link(link), _sanitize_path_name(link.get_text())
+ IliasElementType.FILE, self._abs_url_from_link(link), sanitize_path_name(link.get_text())
)
)
@@ -837,7 +837,7 @@ class IliasPage:
title = cast(Tag, row.select_one("td.std:nth-child(3)")).get_text().strip()
title += ".mp4"
- video_name: str = _sanitize_path_name(title)
+ video_name: str = sanitize_path_name(title)
video_url = self._abs_url_from_link(link)
@@ -893,7 +893,7 @@ class IliasPage:
_unexpected_html_warning()
continue
- name = _sanitize_path_name(name_tag.get_text().strip())
+ name = sanitize_path_name(name_tag.get_text().strip())
log.explain(f"Found exercise detail entry {name!r}")
results.append(
@@ -920,7 +920,7 @@ class IliasPage:
parent_row: Tag = cast(Tag, link.find_parent("tr"))
children = cast(list[Tag], parent_row.find_all("td"))
- name = _sanitize_path_name(children[1].get_text().strip())
+ name = sanitize_path_name(children[1].get_text().strip())
log.explain(f"Found exercise file entry {name!r}")
date = None
@@ -957,7 +957,7 @@ class IliasPage:
if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower():
continue
- name = _sanitize_path_name(exercise.get_text().strip())
+ name = sanitize_path_name(exercise.get_text().strip())
results.append(
IliasPageElement.create_new(
IliasElementType.EXERCISE, self._abs_url_from_link(exercise), name
@@ -983,12 +983,12 @@ class IliasPage:
for link in links:
abs_url = self._abs_url_from_link(link)
# Make sure parents are sanitized. We do not want accidental parents
- parents = [_sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
+ parents = [sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
if parents:
- element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text())
+ element_name = "/".join(parents) + "/" + sanitize_path_name(link.get_text())
else:
- element_name = _sanitize_path_name(link.get_text())
+ element_name = sanitize_path_name(link.get_text())
element_type = IliasPage._find_type_for_element(
element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(link)
@@ -1053,7 +1053,7 @@ class IliasPage:
IliasPageElement.create_new(
typ=IliasElementType.MEDIACAST_VIDEO,
url=self._abs_url_from_relative(cast(str, url)),
- name=_sanitize_path_name(title),
+ name=sanitize_path_name(title),
)
)
@@ -1081,7 +1081,7 @@ class IliasPage:
videos.append(
IliasPageElement.create_new(
- typ=IliasElementType.MOB_VIDEO, url=url, name=_sanitize_path_name(title), mtime=None
+ typ=IliasElementType.MOB_VIDEO, url=url, name=sanitize_path_name(title), mtime=None
)
)
@@ -1192,7 +1192,7 @@ class IliasPage:
)
found_titles.append(head_tag.get_text().strip())
- return [_sanitize_path_name(x) for x in reversed(found_titles)]
+ return [sanitize_path_name(x) for x in reversed(found_titles)]
@staticmethod
def _find_link_description(link: Tag) -> Optional[str]:
@@ -1247,7 +1247,7 @@ class IliasPage:
for title in card_titles:
url = self._abs_url_from_link(title)
- name = _sanitize_path_name(title.get_text().strip())
+ name = sanitize_path_name(title.get_text().strip())
typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(title))
if not typ:
@@ -1274,7 +1274,7 @@ class IliasPage:
log.warn_contd(f"Could not find click handler target for signal {signal} for {button}")
continue
url = self._abs_url_from_relative(open_match.group(1))
- name = _sanitize_path_name(button.get_text().strip())
+ name = sanitize_path_name(button.get_text().strip())
typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(button))
caption_parent = cast(
Tag,
@@ -1532,10 +1532,6 @@ def _tomorrow() -> date:
return date.today() + timedelta(days=1)
-def _sanitize_path_name(name: str) -> str:
- return name.replace("/", "-").replace("\\", "-").strip()
-
-
def parse_ilias_forum_export(forum_export: BeautifulSoup) -> list[IliasForumThread]:
elements = []
for p in forum_export.select("body > p"):
diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
index 4dad8f0..7094b9c 100644
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -15,7 +15,7 @@ from ..auth import Authenticator
from ..config import Config
from ..logging import ProgressBar, log
from ..output_dir import FileSink
-from ..utils import soupify
+from ..utils import sanitize_path_name, soupify
from .crawler import CrawlError
from .http_crawler import HttpCrawler, HttpCrawlerSection
@@ -106,7 +106,7 @@ class KitIpdCrawler(HttpCrawler):
await self.gather(tasks)
async def _crawl_folder(self, parent: PurePath, folder: KitIpdFolder) -> None:
- path = parent / folder.name
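+ # heading-derived folder names may contain path separators, so sanitize before building the path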
+ path = parent / sanitize_path_name(folder.name)
if not await self.crawl(path):
return
@@ -125,7 +125,7 @@ class KitIpdCrawler(HttpCrawler):
async def _download_file(
self, parent: PurePath, file: KitIpdFile, etag: Optional[str], mtime: Optional[datetime]
) -> None:
- element_path = parent / file.name
+ element_path = parent / sanitize_path_name(file.name)
prev_etag = self._get_previous_etag_from_report(element_path)
etag_differs = None if prev_etag is None else prev_etag != etag
diff --git a/PFERD/utils.py b/PFERD/utils.py
index 918a9b6..1aa0585 100644
--- a/PFERD/utils.py
+++ b/PFERD/utils.py
@@ -106,6 +106,10 @@ def fmt_real_path(path: Path) -> str:
return repr(str(path.absolute()))
+def sanitize_path_name(name: str) -> str:
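+ """Replace "/" and "\\" with "-" and strip surrounding whitespace so the name is safe as a single path segment."""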
+ return name.replace("/", "-").replace("\\", "-").strip()
+
+
class ReusableAsyncContextManager(ABC, Generic[T]):
def __init__(self) -> None:
self._active = False