Sanitize / in kit-ipd heading hierarchy
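Moves the private _sanitize_path_name helper out of the ILIAS page parser into the shared utils module as sanitize_path_name, updates all ILIAS call sites, and applies it to folder and file names in the kit-ipd crawler, whose heading-derived names could previously contain "/" and create unintended subdirectories.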
@@ -28,6 +28,7 @@ ambiguous situations.
 
 ## Fixed
 - Event loop errors on Windows with Python 3.14
+- Sanitize `/` in headings in kit-ipd crawler
 
 ## 3.8.3 - 2025-07-01
 
@@ -13,7 +13,7 @@ from bs4 import Tag
 from ..auth import Authenticator
 from ..config import Config
 from ..logging import log
-from ..utils import fmt_real_path
+from ..utils import fmt_real_path, sanitize_path_name
 from ..version import NAME, VERSION
 from .crawler import Crawler, CrawlerSection
 
@@ -192,7 +192,7 @@ class HttpCrawler(Crawler):
         if level_heading is None:
             return find_associated_headings(tag, level - 1)
 
-        folder_name = level_heading.get_text().strip()
+        folder_name = sanitize_path_name(level_heading.get_text().strip())
         return find_associated_headings(level_heading, level - 1) / folder_name
 
     # start at level <h3> because paragraph-level headings are usually too granular for folder names
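Note: every heading level becomes one component of the resulting PurePath here, so each heading text must be sanitized before it is joined in. A simplified, self-contained sketch of the idea (the real find_associated_headings walks the parsed HTML; the helper is copied from this commit, the heading strings are invented):

    from pathlib import PurePath

    def sanitize_path_name(name: str) -> str:
        return name.replace("/", "-").replace("\\", "-").strip()

    def folder_from_headings(headings: list[str]) -> PurePath:
        # One folder level per heading level; sanitize each component
        # so a "/" inside a heading cannot open an extra level.
        path = PurePath()
        for heading in headings:
            path = path / sanitize_path_name(heading)
        return path

    print(folder_from_headings(["Lectures", "Week 1/2"]))
    # Lectures/Week 1-2  (POSIX rendering)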
@@ -15,7 +15,7 @@ from ...auth import Authenticator
 from ...config import Config
 from ...logging import ProgressBar, log
 from ...output_dir import FileSink, Redownload
-from ...utils import fmt_path, soupify, url_set_query_param
+from ...utils import fmt_path, sanitize_path_name, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .async_helper import _iorepeat
@@ -28,7 +28,6 @@ from .kit_ilias_html import (
     IliasPage,
     IliasPageElement,
     IliasSoup,
-    _sanitize_path_name,
     parse_ilias_forum_export,
 )
 from .shibboleth_login import ShibbolethLogin
@@ -505,7 +504,7 @@ instance's greatest bottleneck.
 
         async def download_all() -> None:
             for link in links:
-                path = cl.path / (_sanitize_path_name(link.name) + extension)
+                path = cl.path / (sanitize_path_name(link.name) + extension)
                 if dl := await self.download(path, mtime=element.mtime):
                     await self._download_link(self._links, element.name, [link], dl)
 
@@ -843,7 +842,7 @@ instance's greatest bottleneck.
     async def _download_forum_thread(
         self, parent_path: PurePath, thread: IliasForumThread | IliasPageElement, forum_url: str
     ) -> None:
-        path = parent_path / (_sanitize_path_name(thread.name) + ".html")
+        path = parent_path / (sanitize_path_name(thread.name) + ".html")
         maybe_dl = await self.download(path, mtime=thread.mtime)
         if not maybe_dl or not isinstance(thread, IliasForumThread):
             return
@@ -936,7 +935,7 @@ instance's greatest bottleneck.
         prev: Optional[str],
         next: Optional[str],
     ) -> None:
-        path = parent_path / (_sanitize_path_name(element.title) + ".html")
+        path = parent_path / (sanitize_path_name(element.title) + ".html")
         maybe_dl = await self.download(path)
         if not maybe_dl:
             return
@@ -945,10 +944,10 @@ instance's greatest bottleneck.
             return
 
         if prev:
-            prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
+            prev_p = self._transformer.transform(parent_path / (sanitize_path_name(prev) + ".html"))
             prev = os.path.relpath(prev_p, my_path.parent) if prev_p else None
         if next:
-            next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
+            next_p = self._transformer.transform(parent_path / (sanitize_path_name(next) + ".html"))
             next = os.path.relpath(next_p, my_path.parent) if next_p else None
 
         async with maybe_dl as (bar, sink):
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from PFERD.crawl import CrawlError
 from PFERD.crawl.crawler import CrawlWarning
 from PFERD.logging import log
-from PFERD.utils import url_set_query_params
+from PFERD.utils import sanitize_path_name, url_set_query_params
 
 TargetType = str | int
 
@@ -297,7 +297,7 @@ class IliasPageElement:
         name = normalized
 
         if not skip_sanitize:
-            name = _sanitize_path_name(name)
+            name = sanitize_path_name(name)
 
         return IliasPageElement(typ, url, name, mtime, description)
 
@@ -695,7 +695,7 @@ class IliasPage:
                 log.explain(f"Skipping offline item: {title.get_text().strip()!r}")
                 continue
 
-            name = _sanitize_path_name(link.text.strip())
+            name = sanitize_path_name(link.text.strip())
             url = self._abs_url_from_link(link)
 
             if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
@@ -723,7 +723,7 @@ class IliasPage:
         for link in links:
             url = self._abs_url_from_link(link)
             name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.get_text()).strip().replace("\t", "")
-            name = _sanitize_path_name(name)
+            name = sanitize_path_name(name)
 
             if "file_id" not in url:
                 _unexpected_html_warning()
@@ -745,7 +745,7 @@ class IliasPage:
                 continue
             items.append(
                 IliasPageElement.create_new(
-                    IliasElementType.FILE, self._abs_url_from_link(link), _sanitize_path_name(link.get_text())
+                    IliasElementType.FILE, self._abs_url_from_link(link), sanitize_path_name(link.get_text())
                 )
             )
 
@@ -837,7 +837,7 @@ class IliasPage:
             title = cast(Tag, row.select_one("td.std:nth-child(3)")).get_text().strip()
             title += ".mp4"
 
-            video_name: str = _sanitize_path_name(title)
+            video_name: str = sanitize_path_name(title)
 
             video_url = self._abs_url_from_link(link)
 
@@ -893,7 +893,7 @@ class IliasPage:
                 _unexpected_html_warning()
                 continue
 
-            name = _sanitize_path_name(name_tag.get_text().strip())
+            name = sanitize_path_name(name_tag.get_text().strip())
             log.explain(f"Found exercise detail entry {name!r}")
 
             results.append(
@@ -920,7 +920,7 @@ class IliasPage:
            parent_row: Tag = cast(Tag, link.find_parent("tr"))
            children = cast(list[Tag], parent_row.find_all("td"))
 
-            name = _sanitize_path_name(children[1].get_text().strip())
+            name = sanitize_path_name(children[1].get_text().strip())
             log.explain(f"Found exercise file entry {name!r}")
 
             date = None
@@ -957,7 +957,7 @@ class IliasPage:
             if "ass_id=" not in href or "cmdclass=ilassignmentpresentationgui" not in href.lower():
                 continue
 
-            name = _sanitize_path_name(exercise.get_text().strip())
+            name = sanitize_path_name(exercise.get_text().strip())
             results.append(
                 IliasPageElement.create_new(
                     IliasElementType.EXERCISE, self._abs_url_from_link(exercise), name
@@ -983,12 +983,12 @@ class IliasPage:
         for link in links:
             abs_url = self._abs_url_from_link(link)
             # Make sure parents are sanitized. We do not want accidental parents
-            parents = [_sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
+            parents = [sanitize_path_name(x) for x in IliasPage._find_upwards_folder_hierarchy(link)]
 
             if parents:
-                element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text())
+                element_name = "/".join(parents) + "/" + sanitize_path_name(link.get_text())
             else:
-                element_name = _sanitize_path_name(link.get_text())
+                element_name = sanitize_path_name(link.get_text())
 
             element_type = IliasPage._find_type_for_element(
                 element_name, abs_url, lambda: IliasPage._find_icon_for_folder_entry(link)
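Note: the order matters in this hunk. Each component is sanitized first, so the "/" separators inserted by the join are exactly the intended hierarchy, while any "/" inside a single title has already been replaced. Sketch with invented titles:

    parents = [sanitize_path_name(x) for x in ["Week 1/2", "Slides"]]
    element_name = "/".join(parents) + "/" + sanitize_path_name("Intro A/B")
    print(element_name)
    # Week 1-2/Slides/Intro A-B -- exactly three path components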
@@ -1053,7 +1053,7 @@ class IliasPage:
                 IliasPageElement.create_new(
                     typ=IliasElementType.MEDIACAST_VIDEO,
                     url=self._abs_url_from_relative(cast(str, url)),
-                    name=_sanitize_path_name(title),
+                    name=sanitize_path_name(title),
                 )
             )
 
@@ -1081,7 +1081,7 @@ class IliasPage:
 
             videos.append(
                 IliasPageElement.create_new(
-                    typ=IliasElementType.MOB_VIDEO, url=url, name=_sanitize_path_name(title), mtime=None
+                    typ=IliasElementType.MOB_VIDEO, url=url, name=sanitize_path_name(title), mtime=None
                 )
             )
 
@@ -1192,7 +1192,7 @@ class IliasPage:
             )
             found_titles.append(head_tag.get_text().strip())
 
-        return [_sanitize_path_name(x) for x in reversed(found_titles)]
+        return [sanitize_path_name(x) for x in reversed(found_titles)]
 
     @staticmethod
     def _find_link_description(link: Tag) -> Optional[str]:
@@ -1247,7 +1247,7 @@ class IliasPage:
 
         for title in card_titles:
             url = self._abs_url_from_link(title)
-            name = _sanitize_path_name(title.get_text().strip())
+            name = sanitize_path_name(title.get_text().strip())
             typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(title))
 
             if not typ:
@@ -1274,7 +1274,7 @@ class IliasPage:
                 log.warn_contd(f"Could not find click handler target for signal {signal} for {button}")
                 continue
             url = self._abs_url_from_relative(open_match.group(1))
-            name = _sanitize_path_name(button.get_text().strip())
+            name = sanitize_path_name(button.get_text().strip())
             typ = IliasPage._find_type_for_element(name, url, lambda: IliasPage._find_icon_from_card(button))
             caption_parent = cast(
                 Tag,
@@ -1532,10 +1532,6 @@ def _tomorrow() -> date:
     return date.today() + timedelta(days=1)
 
 
-def _sanitize_path_name(name: str) -> str:
-    return name.replace("/", "-").replace("\\", "-").strip()
-
-
 def parse_ilias_forum_export(forum_export: BeautifulSoup) -> list[IliasForumThread]:
     elements = []
     for p in forum_export.select("body > p"):
@@ -15,7 +15,7 @@ from ..auth import Authenticator
 from ..config import Config
 from ..logging import ProgressBar, log
 from ..output_dir import FileSink
-from ..utils import soupify
+from ..utils import sanitize_path_name, soupify
 from .crawler import CrawlError
 from .http_crawler import HttpCrawler, HttpCrawlerSection
 
@@ -106,7 +106,7 @@ class KitIpdCrawler(HttpCrawler):
         await self.gather(tasks)
 
     async def _crawl_folder(self, parent: PurePath, folder: KitIpdFolder) -> None:
-        path = parent / folder.name
+        path = parent / sanitize_path_name(folder.name)
         if not await self.crawl(path):
             return
 
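Note: folder.name is scraped heading text and may contain "/"; joined onto a PurePath unsanitized, it silently becomes an extra directory level. A minimal demonstration (the folder name is invented):

    from pathlib import PurePath

    parent = PurePath("kit-ipd")
    print((parent / "Exam 2024/25").parts)
    # ('kit-ipd', 'Exam 2024', '25') -- "/" split one folder into two
    print((parent / sanitize_path_name("Exam 2024/25")).parts)
    # ('kit-ipd', 'Exam 2024-25')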
@@ -125,7 +125,7 @@ class KitIpdCrawler(HttpCrawler):
     async def _download_file(
         self, parent: PurePath, file: KitIpdFile, etag: Optional[str], mtime: Optional[datetime]
     ) -> None:
-        element_path = parent / file.name
+        element_path = parent / sanitize_path_name(file.name)
 
         prev_etag = self._get_previous_etag_from_report(element_path)
         etag_differs = None if prev_etag is None else prev_etag != etag
@@ -106,6 +106,10 @@ def fmt_real_path(path: Path) -> str:
     return repr(str(path.absolute()))
 
 
+def sanitize_path_name(name: str) -> str:
+    return name.replace("/", "-").replace("\\", "-").strip()
+
+
 class ReusableAsyncContextManager(ABC, Generic[T]):
     def __init__(self) -> None:
         self._active = False
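The helper is unchanged in behavior, only promoted from a private ILIAS-parser function to the shared utils module. A quick check of what it does (inputs invented):

    assert sanitize_path_name("a/b") == "a-b"            # forward slashes
    assert sanitize_path_name("a\\b") == "a-b"           # backslashes too
    assert sanitize_path_name("  padded  ") == "padded"  # outer whitespace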