Sanitize / in kit-ipd heading hierarchy

This commit is contained in:
I-Al-Istannen
2025-10-30 20:19:30 +01:00
parent 3453bbc991
commit 3f5637366e
6 changed files with 33 additions and 33 deletions

View File

@@ -28,6 +28,7 @@ ambiguous situations.
## Fixed ## Fixed
- Event loop errors on Windows with Python 3.14 - Event loop errors on Windows with Python 3.14
- Sanitize `/` in headings in kit-ipd crawler
## 3.8.3 - 2025-07-01 ## 3.8.3 - 2025-07-01

View File

@@ -13,7 +13,7 @@ from bs4 import Tag
from ..auth import Authenticator from ..auth import Authenticator
from ..config import Config from ..config import Config
from ..logging import log from ..logging import log
from ..utils import fmt_real_path from ..utils import fmt_real_path, sanitize_path_name
from ..version import NAME, VERSION from ..version import NAME, VERSION
from .crawler import Crawler, CrawlerSection from .crawler import Crawler, CrawlerSection
@@ -192,7 +192,7 @@ class HttpCrawler(Crawler):
if level_heading is None: if level_heading is None:
return find_associated_headings(tag, level - 1) return find_associated_headings(tag, level - 1)
folder_name = level_heading.get_text().strip() folder_name = sanitize_path_name(level_heading.get_text().strip())
return find_associated_headings(level_heading, level - 1) / folder_name return find_associated_headings(level_heading, level - 1) / folder_name
# start at level <h3> because paragraph-level headings are usually too granular for folder names # start at level <h3> because paragraph-level headings are usually too granular for folder names

View File

@@ -15,7 +15,7 @@ from ...auth import Authenticator
from ...config import Config from ...config import Config
from ...logging import ProgressBar, log from ...logging import ProgressBar, log
from ...output_dir import FileSink, Redownload from ...output_dir import FileSink, Redownload
from ...utils import fmt_path, soupify, url_set_query_param from ...utils import fmt_path, sanitize_path_name, soupify, url_set_query_param
from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection from ..http_crawler import HttpCrawler, HttpCrawlerSection
from .async_helper import _iorepeat from .async_helper import _iorepeat
@@ -28,7 +28,6 @@ from .kit_ilias_html import (
IliasPage, IliasPage,
IliasPageElement, IliasPageElement,
IliasSoup, IliasSoup,
_sanitize_path_name,
parse_ilias_forum_export, parse_ilias_forum_export,
) )
from .shibboleth_login import ShibbolethLogin from .shibboleth_login import ShibbolethLogin
@@ -505,7 +504,7 @@ instance's greatest bottleneck.
async def download_all() -> None: async def download_all() -> None:
for link in links: for link in links:
path = cl.path / (_sanitize_path_name(link.name) + extension) path = cl.path / (sanitize_path_name(link.name) + extension)
if dl := await self.download(path, mtime=element.mtime): if dl := await self.download(path, mtime=element.mtime):
await self._download_link(self._links, element.name, [link], dl) await self._download_link(self._links, element.name, [link], dl)
@@ -843,7 +842,7 @@ instance's greatest bottleneck.
async def _download_forum_thread( async def _download_forum_thread(
self, parent_path: PurePath, thread: IliasForumThread | IliasPageElement, forum_url: str self, parent_path: PurePath, thread: IliasForumThread | IliasPageElement, forum_url: str
) -> None: ) -> None:
path = parent_path / (_sanitize_path_name(thread.name) + ".html") path = parent_path / (sanitize_path_name(thread.name) + ".html")
maybe_dl = await self.download(path, mtime=thread.mtime) maybe_dl = await self.download(path, mtime=thread.mtime)
if not maybe_dl or not isinstance(thread, IliasForumThread): if not maybe_dl or not isinstance(thread, IliasForumThread):
return return
@@ -936,7 +935,7 @@ instance's greatest bottleneck.
prev: Optional[str], prev: Optional[str],
next: Optional[str], next: Optional[str],
) -> None: ) -> None:
path = parent_path / (_sanitize_path_name(element.title) + ".html") path = parent_path / (sanitize_path_name(element.title) + ".html")
maybe_dl = await self.download(path) maybe_dl = await self.download(path)
if not maybe_dl: if not maybe_dl:
return return
@@ -945,10 +944,10 @@ instance's greatest bottleneck.
return return
if prev: if prev:
prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html")) prev_p = self._transformer.transform(parent_path / (sanitize_path_name(prev) + ".html"))
prev = os.path.relpath(prev_p, my_path.parent) if prev_p else None prev = os.path.relpath(prev_p, my_path.parent) if prev_p else None
if next: if next:
next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html")) next_p = self._transformer.transform(parent_path / (sanitize_path_name(next) + ".html"))
next = os.path.relpath(next_p, my_path.parent) if next_p else None next = os.path.relpath(next_p, my_path.parent) if next_p else None
async with maybe_dl as (bar, sink): async with maybe_dl as (bar, sink):

View File

@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
from PFERD.crawl import CrawlError from PFERD.crawl import CrawlError
from PFERD.crawl.crawler import CrawlWarning from PFERD.crawl.crawler import CrawlWarning
from PFERD.logging import log from PFERD.logging import log
from PFERD.utils import url_set_query_params from PFERD.utils import sanitize_path_name, url_set_query_params
TargetType = str | int TargetType = str | int
@@ -297,7 +297,7 @@ class IliasPageElement:
name = normalized name = normalized
if not skip_sanitize: if not skip_sanitize:
name = _sanitize_path_name(name) name = sanitize_path_name(name)
return IliasPageElement(typ, url, name, mtime, description) return IliasPageElement(typ, url, name, mtime, description)
@@ -695,7 +695,7 @@ class IliasPage:
log.explain(f"Skipping offline item: {title.get_text().strip()!r}") log.explain(f"Skipping offline item: {title.get_text().strip()!r}")
continue continue
name = _sanitize_path_name(link.text.strip()) name = sanitize_path_name(link.text.strip())
url = self._abs_url_from_link(link) url = self._abs_url_from_link(link)
if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url: if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
@@ -723,7 +723,7 @@ class IliasPage:
for link in links: for link in links:
url = self._abs_url_from_link(link) url = self._abs_url_from_link(link)
name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.get_text()).strip().replace("\t", "") name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.get_text()).strip().replace("\t", "")
name = _sanitize_path_name(name) name = sanitize_path_name(name)
if "file_id" not in url: if "file_id" not in url:
_unexpected_html_warning() _unexpected_html_warning()
@@ -745,7 +745,7 @@ class IliasPage:
continue continue
items.append( items.append(
IliasPageElement.create_new( IliasPageElement.create_new(
IliasElementType.FILE, self._abs_url_from_link(link), _sanitize_path_name(link.get_text()) IliasElementType.FILE, self._abs_url_from_link(link), sanitize_path_name(link.get_text())
) )
) )
@@ -837,7 +837,7 @@ class IliasPage:
title = cast(Tag, row.select_one("td.std:nth-child(3)")).get_text().strip() title = cast(Tag, row.select_one("td.std:nth-child(3)")).get_text().strip()
title += ".mp4" title += ".mp4"
video_name: str = _sanitize_path_name(title) video_name: str = sanitize_path_name(title)
video_url = self._abs_url_from_link(link) video_url = self._abs_url_from_link(link)
@@ -893,7 +893,7 @@ class IliasPage:
_unexpected_html_warning() _unexpected_html_warning()
continue continue
name = _sanitize_path_name(name_tag.get_text().strip()) name = sanitize_path_name(name_tag.get_text().strip())
log.explain(f"Found exercise detail entry {name!r}") log.explain(f"Found exercise detail entry {name!r}")
results.append( results.append(
@@ -920,7 +920,7 @@ class IliasPage:
parent_row: Tag = cast(Tag, link.find_parent("tr")) parent_row: Tag = cast(Tag, link.find_parent("tr"))
children = cast(list[Tag], parent_row.find_all("td")) children = cast(list[Tag], parent_row.find_all("td"))
name = _sanitize_path_name(children[1].get_text().strip()) name = sanitize_path_name(children[1].get_text().strip())
log.explain(f"Found exercise file entry {name!r}") log.explain(f"Found exercise file entry {name!r}")
date = None date = None
@@ -957,7 +957,7 @@ class IliasPage:
if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower(): if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower():
continue continue
name = _sanitize_path_name(exercise.get_text().strip()) name = sanitize_path_name(exercise.get_text().strip())
results.append( results.append(
IliasPageElement.create_new( IliasPageElement.create_new(
IliasElementType.EXERCISE, self._abs_url_from_link(exercise), name IliasElementType.EXERCISE, self._abs_url_from_link(exercise), name
@@ -983,12 +983,12 @@ class IliasPage:
for link in links: for link in links:
abs_url = self._abs_url_from_link(link) abs_url = self._abs_url_from_link(link)
# Make sure parents are sanitized. We do not want accidental parents # Make sure parents are sanitized. We do not want accidental parents
parents = [_sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)] parents = [sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
if parents: if parents:
element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text()) element_name = "/".join(parents) + "/" + sanitize_path_name(link.get_text())
else: else:
element_name = _sanitize_path_name(link.get_text()) element_name = sanitize_path_name(link.get_text())
element_type = IliasPage._find_type_for_element( element_type = IliasPage._find_type_for_element(
element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(link) element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(link)
@@ -1053,7 +1053,7 @@ class IliasPage:
IliasPageElement.create_new( IliasPageElement.create_new(
typ=IliasElementType.MEDIACAST_VIDEO, typ=IliasElementType.MEDIACAST_VIDEO,
url=self._abs_url_from_relative(cast(str, url)), url=self._abs_url_from_relative(cast(str, url)),
name=_sanitize_path_name(title), name=sanitize_path_name(title),
) )
) )
@@ -1081,7 +1081,7 @@ class IliasPage:
videos.append( videos.append(
IliasPageElement.create_new( IliasPageElement.create_new(
typ=IliasElementType.MOB_VIDEO, url=url, name=_sanitize_path_name(title), mtime=None typ=IliasElementType.MOB_VIDEO, url=url, name=sanitize_path_name(title), mtime=None
) )
) )
@@ -1192,7 +1192,7 @@ class IliasPage:
) )
found_titles.append(head_tag.get_text().strip()) found_titles.append(head_tag.get_text().strip())
return [_sanitize_path_name(x) for x in reversed(found_titles)] return [sanitize_path_name(x) for x in reversed(found_titles)]
@staticmethod @staticmethod
def _find_link_description(link: Tag) -> Optional[str]: def _find_link_description(link: Tag) -> Optional[str]:
@@ -1247,7 +1247,7 @@ class IliasPage:
for title in card_titles: for title in card_titles:
url = self._abs_url_from_link(title) url = self._abs_url_from_link(title)
name = _sanitize_path_name(title.get_text().strip()) name = sanitize_path_name(title.get_text().strip())
typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(title)) typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(title))
if not typ: if not typ:
@@ -1274,7 +1274,7 @@ class IliasPage:
log.warn_contd(f"Could not find click handler target for signal {signal} for {button}") log.warn_contd(f"Could not find click handler target for signal {signal} for {button}")
continue continue
url = self._abs_url_from_relative(open_match.group(1)) url = self._abs_url_from_relative(open_match.group(1))
name = _sanitize_path_name(button.get_text().strip()) name = sanitize_path_name(button.get_text().strip())
typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(button)) typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(button))
caption_parent = cast( caption_parent = cast(
Tag, Tag,
@@ -1532,10 +1532,6 @@ def _tomorrow() -> date:
return date.today() + timedelta(days=1) return date.today() + timedelta(days=1)
def _sanitize_path_name(name: str) -> str:
return name.replace("/", "-").replace("\\", "-").strip()
def parse_ilias_forum_export(forum_export: BeautifulSoup) -> list[IliasForumThread]: def parse_ilias_forum_export(forum_export: BeautifulSoup) -> list[IliasForumThread]:
elements = [] elements = []
for p in forum_export.select("body > p"): for p in forum_export.select("body > p"):

View File

@@ -15,7 +15,7 @@ from ..auth import Authenticator
from ..config import Config from ..config import Config
from ..logging import ProgressBar, log from ..logging import ProgressBar, log
from ..output_dir import FileSink from ..output_dir import FileSink
from ..utils import soupify from ..utils import sanitize_path_name, soupify
from .crawler import CrawlError from .crawler import CrawlError
from .http_crawler import HttpCrawler, HttpCrawlerSection from .http_crawler import HttpCrawler, HttpCrawlerSection
@@ -106,7 +106,7 @@ class KitIpdCrawler(HttpCrawler):
await self.gather(tasks) await self.gather(tasks)
async def _crawl_folder(self, parent: PurePath, folder: KitIpdFolder) -> None: async def _crawl_folder(self, parent: PurePath, folder: KitIpdFolder) -> None:
path = parent / folder.name path = parent / sanitize_path_name(folder.name)
if not await self.crawl(path): if not await self.crawl(path):
return return
@@ -125,7 +125,7 @@ class KitIpdCrawler(HttpCrawler):
async def _download_file( async def _download_file(
self, parent: PurePath, file: KitIpdFile, etag: Optional[str], mtime: Optional[datetime] self, parent: PurePath, file: KitIpdFile, etag: Optional[str], mtime: Optional[datetime]
) -> None: ) -> None:
element_path = parent / file.name element_path = parent / sanitize_path_name(file.name)
prev_etag = self._get_previous_etag_from_report(element_path) prev_etag = self._get_previous_etag_from_report(element_path)
etag_differs = None if prev_etag is None else prev_etag != etag etag_differs = None if prev_etag is None else prev_etag != etag

View File

@@ -106,6 +106,10 @@ def fmt_real_path(path: Path) -> str:
return repr(str(path.absolute())) return repr(str(path.absolute()))
def sanitize_path_name(name: str) -> str:
    """Make *name* usable as a single path component.

    Forward slashes and backslashes are each replaced with a hyphen, then
    surrounding whitespace is stripped. Other characters are left untouched.
    """
    # One translation pass handles both separator characters.
    separator_map = str.maketrans({"/": "-", "\\": "-"})
    return name.translate(separator_map).strip()
class ReusableAsyncContextManager(ABC, Generic[T]): class ReusableAsyncContextManager(ABC, Generic[T]):
def __init__(self) -> None: def __init__(self) -> None:
self._active = False self._active = False