diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index 636fa68..61df57a 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -461,3 +461,55 @@ def _tomorrow() -> date:
 
 def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-").strip()
+
+
+def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]:
+    """
+    De-duplicates element names by appending an incrementing number to later elements:
+      test.pdf
+      test.pdf
+    would result in
+      test.pdf
+      test_1.pdf
+
+    It is also space-aware:
+      "te st.pdf"
+      "te st.pdf"
+    would result in
+      "te st.pdf"
+      "te st 1.pdf"
+    """
+    known_names = dict()
+    result_elements = []
+
+    for element in elements:
+        # This file is new - add it and mark its name as used
+        if element.name not in known_names:
+            known_names[element.name] = 1
+            result_elements.append(element)
+            continue
+
+        # This file is a duplicate. Find a suitable suffix
+        current_counter = known_names[element.name]
+        adjusted_element = _append_number(element, current_counter)
+        # Increment the counter so the next duplicate does not conflict
+        known_names[element.name] += 1
+        # Also block the new name, so another file with the *renamed* name gets renamed as well
+        known_names[adjusted_element.name] = 1
+
+        result_elements.append(adjusted_element)
+
+    return result_elements
+
+
+def _append_number(element: IliasPageElement, number: int) -> IliasPageElement:
+    extension_index = element.name.rfind(".")
+    suffix = f" {number}" if " " in element.name else f"_{number}"
+    if extension_index < 0:
+        new_name = element.name + suffix
+    else:
+        new_name = element.name[:extension_index] + suffix + element.name[extension_index:]
+
+    return IliasPageElement(
+        element.type, element.url, new_name, element.mtime, element.description
+    )
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index 445997f..222e1d6 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -15,7 +15,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlWarning, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import link_template_plain, link_template_rich
-from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
+from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names
 
 TargetType = Union[str, int]
 
@@ -214,6 +214,7 @@ class KitIliasWebCrawler(HttpCrawler):
 
         # Fill up our task list with the found elements
         await gather_elements()
+        elements = deduplicate_element_names(elements)
         tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
 
         # And execute them
@@ -240,6 +241,7 @@ class KitIliasWebCrawler(HttpCrawler):
 
         # Fill up our task list with the found elements
         await gather_elements()
+        elements = deduplicate_element_names(elements)
         tasks = [self._handle_ilias_element(path, element) for element in elements]
 
         # And execute them
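
For reviewers who want to try the renaming behaviour in isolation, below is a minimal, self-contained sketch. The real IliasPageElement has more to it; the stand-in dataclass here is hypothetical and only mirrors the five fields the patch touches, in constructor order. The two functions are copied from the diff above, so the example runs without PFERD installed.

# Stand-in for PFERD's IliasPageElement (assumption: only the five fields
# the patch uses, in the positional order the patch constructs them with).
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional


@dataclass
class IliasPageElement:
    type: str
    url: str
    name: str
    mtime: Optional[datetime] = None
    description: Optional[str] = None


def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]:
    known_names: Dict[str, int] = dict()
    result_elements: List[IliasPageElement] = []

    for element in elements:
        # New name - remember it and keep the element unchanged
        if element.name not in known_names:
            known_names[element.name] = 1
            result_elements.append(element)
            continue

        # Duplicate - rename it, bump the counter, and block the renamed name too
        current_counter = known_names[element.name]
        adjusted_element = _append_number(element, current_counter)
        known_names[element.name] += 1
        known_names[adjusted_element.name] = 1
        result_elements.append(adjusted_element)

    return result_elements


def _append_number(element: IliasPageElement, number: int) -> IliasPageElement:
    extension_index = element.name.rfind(".")
    # Space-aware suffix: " 1" for names already containing spaces, "_1" otherwise
    suffix = f" {number}" if " " in element.name else f"_{number}"
    if extension_index < 0:
        new_name = element.name + suffix
    else:
        new_name = element.name[:extension_index] + suffix + element.name[extension_index:]
    return IliasPageElement(element.type, element.url, new_name, element.mtime, element.description)


elements = [
    IliasPageElement("file", "https://example.com/1", "test.pdf"),
    IliasPageElement("file", "https://example.com/2", "test.pdf"),
    IliasPageElement("file", "https://example.com/3", "test_1.pdf"),  # clashes with the renamed duplicate
    IliasPageElement("file", "https://example.com/4", "te st.pdf"),
    IliasPageElement("file", "https://example.com/5", "te st.pdf"),
]

for element in deduplicate_element_names(elements):
    print(element.name)
# Output:
#   test.pdf
#   test_1.pdf
#   test_1_1.pdf   (test_1.pdf was already blocked by the renamed duplicate)
#   te st.pdf
#   te st 1.pdf

Blocking the renamed name in known_names is what makes the third case above work: without it, a file literally named test_1.pdf would silently collide with the element that was just renamed to test_1.pdf.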