De-duplicate element names in ILIAS crawler

This prevents conflicts caused by multiple files with the same name.
Conflicts may still arise due to transforms, but that is outside our
control and counts as user error.
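For illustration, a transform that strips the generated suffix would merge the de-duplicated names right back together (hypothetical Python, not PFERD's actual transform syntax):

import re

def strip_dedup_suffix(name: str) -> str:
    # Drops a trailing "_<n>" before the extension, e.g. "test_1.pdf" -> "test.pdf",
    # re-creating exactly the collision this commit resolves.
    return re.sub(r"_\d+(?=\.[^.]+$)", "", name)

print(strip_dedup_suffix("test.pdf"))    # test.pdf
print(strip_dedup_suffix("test_1.pdf"))  # test.pdf - collision is back, by user choice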
I-Al-Istannen 2021-05-24 00:24:31 +02:00
parent 3ab3581f84
commit fca62541ca
2 changed files with 55 additions and 1 deletion

View File

@@ -461,3 +461,55 @@ def _tomorrow() -> date:
 def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-").strip()
+
+
+def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]:
+    """
+    De-duplicates element names by appending an incrementing number to later elements:
+      test.pdf
+      test.pdf
+    would result in
+      test.pdf
+      test_1.pdf
+
+    It is also space-aware:
+      "te st.pdf"
+      "te st.pdf"
+    would result in
+      "te st.pdf"
+      "te st 1.pdf"
+    """
+    known_names = dict()
+    result_elements = []
+
+    for element in elements:
+        # This file is new - add it and mark its name as used
+        if element.name not in known_names:
+            known_names[element.name] = 1
+            result_elements.append(element)
+            continue
+
+        # This file is a duplicate. Find a suitable suffix
+        current_counter = known_names[element.name]
+        adjusted_element = _append_number(element, current_counter)
+
+        # Increment the counter so the next duplicate does not conflict
+        known_names[element.name] += 1
+        # Also block the new name, so another file with the *renamed* name gets renamed as well
+        known_names[adjusted_element.name] = 1
+
+        result_elements.append(adjusted_element)
+
+    return result_elements
+
+
+def _append_number(element: IliasPageElement, number: int) -> IliasPageElement:
+    extension_index = element.name.rfind(".")
+    suffix = f" {number}" if " " in element.name else f"_{number}"
+
+    if extension_index < 0:
+        new_name = element.name + suffix
+    else:
+        new_name = element.name[:extension_index] + suffix + element.name[extension_index:]
+
+    return IliasPageElement(
+        element.type, element.url, new_name, element.mtime, element.description
+    )
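For reference, the same renaming logic on plain strings (an illustrative sketch; deduplicate_names is a made-up helper that mirrors deduplicate_element_names and _append_number above without constructing IliasPageElement objects):

from typing import Dict, List

def deduplicate_names(names: List[str]) -> List[str]:
    known: Dict[str, int] = {}
    result: List[str] = []
    for name in names:
        if name not in known:
            known[name] = 1
            result.append(name)
            continue
        counter = known[name]
        suffix = f" {counter}" if " " in name else f"_{counter}"
        dot = name.rfind(".")
        new_name = name + suffix if dot < 0 else name[:dot] + suffix + name[dot:]
        known[name] += 1      # the next duplicate of the original gets the next number
        known[new_name] = 1   # block the renamed name as well
        result.append(new_name)
    return result

print(deduplicate_names(["test.pdf", "test.pdf", "test.pdf"]))
# ['test.pdf', 'test_1.pdf', 'test_2.pdf']
print(deduplicate_names(["te st.pdf", "te st.pdf"]))
# ['te st.pdf', 'te st 1.pdf']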

View File

@@ -15,7 +15,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlWarning, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import link_template_plain, link_template_rich
-from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
+from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names

 TargetType = Union[str, int]
@@ -214,6 +214,7 @@ class KitIliasWebCrawler(HttpCrawler):
         # Fill up our task list with the found elements
         await gather_elements()
+        elements = deduplicate_element_names(elements)

         tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]

         # And execute them
@@ -240,6 +241,7 @@ class KitIliasWebCrawler(HttpCrawler):
         # Fill up our task list with the found elements
         await gather_elements()
+        elements = deduplicate_element_names(elements)

         tasks = [self._handle_ilias_element(path, element) for element in elements]

         # And execute them
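Both call sites follow the same pattern: gather the complete listing first, de-duplicate once, and only then spawn the per-element tasks, so no two concurrent tasks can target the same output name. A toy sketch of that ordering (gather_elements and handle_element are stand-ins for the crawler's real methods; deduplicate_names is the string-only sketch from above):

import asyncio
from pathlib import PurePath
from typing import List

async def gather_elements(elements: List[str]) -> None:
    # Stand-in for scraping an ILIAS page listing
    elements.extend(["slides.pdf", "slides.pdf"])

async def handle_element(path: PurePath, name: str) -> None:
    # Stand-in for self._handle_ilias_element
    print(f"downloading {path / name}")

async def crawl() -> None:
    elements: List[str] = []
    await gather_elements(elements)
    # Rename collisions before any download task starts
    elements = deduplicate_names(elements)
    tasks = [handle_element(PurePath("."), name) for name in elements]
    await asyncio.gather(*tasks)

asyncio.run(crawl())
# downloading slides.pdf
# downloading slides_1.pdf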