Use cl/dl deduplication mechanism for ILIAS crawler

I-Al-Istannen 2021-05-25 12:15:38 +02:00
parent bce3dc384d
commit 651b087932
2 changed files with 2 additions and 56 deletions


@@ -567,55 +567,3 @@ def _tomorrow() -> date:
 def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-").strip()
-
-
-def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]:
-    """
-    De-duplicates element names by appending an incrementing number to later elements:
-      test.pdf
-      test.pdf
-    would result in
-      test.pdf
-      test_1.pdf
-
-    It is also space-aware:
-      "te st.pdf"
-      "te st.pdf"
-    would result in
-      "te st.pdf"
-      "te st 1.pdf"
-    """
-    known_names = dict()
-    result_elements = []
-
-    for element in elements:
-        # This file is new - add it and mark its name as used
-        if element.name not in known_names:
-            known_names[element.name] = 1
-            result_elements.append(element)
-            continue
-
-        # This file is a duplicate. Find a suitable suffix
-        current_counter = known_names[element.name]
-        adjusted_element = _append_number(element, current_counter)
-
-        # increment the counter so the next duplicate does not conflict
-        known_names[element.name] += 1
-        # also block the new name, so another file with the *renamed* name gets renamed as well
-        known_names[adjusted_element.name] = 1
-
-        result_elements.append(adjusted_element)
-
-    return result_elements
-
-
-def _append_number(element: IliasPageElement, number: int) -> IliasPageElement:
-    extension_index = element.name.rfind(".")
-    suffix = f" {number}" if " " in element.name else f"_{number}"
-
-    if extension_index < 0:
-        new_name = element.name + suffix
-    else:
-        new_name = element.name[:extension_index] + suffix + element.name[extension_index:]
-
-    return IliasPageElement(
-        element.type, element.url, new_name, element.mtime, element.description
-    )


@@ -15,7 +15,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlWarning, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
-from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names
+from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement

 TargetType = Union[str, int]
@@ -226,7 +226,6 @@ class KitIliasWebCrawler(HttpCrawler):

         # Fill up our task list with the found elements
         await gather_elements()
-        elements = deduplicate_element_names(elements)
         tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]

         # And execute them
@@ -253,8 +253,7 @@ class KitIliasWebCrawler(HttpCrawler):

         # Fill up our task list with the found elements
         await gather_elements()
-        elements = deduplicate_element_names(elements)
-        tasks = [self._handle_ilias_element(path, element) for element in elements]
+        tasks = [self._handle_ilias_element(cl.path, element) for element in elements]

         # And execute them
         await self.gather(tasks)
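With this change, deduplication lives in the crawler's generic cl/dl (crawl/download token) layer instead of the ILIAS page parser, and elements are resolved relative to cl.path, presumably the path the surrounding crawl token was issued for. As a rough illustration of the idea only, not PFERD's actual API: such a token layer can deduplicate centrally by handing out each output path at most once.

from pathlib import PurePath
from typing import Optional, Set

class TokenIssuer:
    # Hypothetical stand-in for a cl/dl-style layer: tracks claimed output paths
    def __init__(self) -> None:
        self._claimed: Set[PurePath] = set()

    def claim(self, path: PurePath) -> Optional[PurePath]:
        # First claim wins; a later claim for the same path is rejected,
        # so the caller skips (or warns about) the duplicate element
        if path in self._claimed:
            return None
        self._claimed.add(path)
        return path

issuer = TokenIssuer()
print(issuer.claim(PurePath("course/test.pdf")))  # course/test.pdf
print(issuer.claim(PurePath("course/test.pdf")))  # None - duplicate, skipped

One benefit of this design is that collision handling exists in a single place shared by all crawlers, rather than being reimplemented per page type.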