De-duplicate element names in ILIAS crawler

This prevents any conflicts caused by multiple files with the same name.
Conflicts may still arise due to transforms, but that is out of our
control and a user error.
This commit is contained in:
I-Al-Istannen
2021-05-24 00:24:31 +02:00
parent 3ab3581f84
commit fca62541ca
2 changed files with 55 additions and 1 deletions

View File

@ -461,3 +461,55 @@ def _tomorrow() -> date:
def _sanitize_path_name(name: str) -> str:
return name.replace("/", "-").replace("\\", "-").strip()
def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]:
    """
    De-duplicates element names by appending an incrementing number to later elements:
      test.pdf
      test.pdf
    would result in
      test.pdf
      test_1.pdf

    It is also space-aware:
      "te st.pdf"
      "te st.pdf"
    would result in
      "te st.pdf"
      "te st 1.pdf"

    A generated name is never allowed to collide with a name that was already
    handed out, so
      test.pdf
      test_1.pdf
      test.pdf
    would result in
      test.pdf
      test_1.pdf
      test_2.pdf

    The input list is not modified. Elements that keep their name are passed
    through as-is, renamed elements are new objects. The order of the input is
    preserved.
    """
    # Maps every name handed out so far to the next counter to try for a duplicate of it
    known_names = {}
    result_elements = []

    for element in elements:
        # This file is new - add it and mark its name as used
        if element.name not in known_names:
            known_names[element.name] = 1
            result_elements.append(element)
            continue

        # This file is a duplicate. Find a suffix that yields a name nobody uses yet.
        # The first candidate might already be taken by an element that carried that
        # exact name from the start (or by an earlier rename), so keep counting up.
        current_counter = known_names[element.name]
        adjusted_element = _append_number(element, current_counter)
        while adjusted_element.name in known_names:
            current_counter += 1
            adjusted_element = _append_number(element, current_counter)

        # Remember where to continue so the next duplicate does not retry used numbers
        known_names[element.name] = current_counter + 1
        # Also block the new name, so another file with the *renamed* name gets renamed as well
        known_names[adjusted_element.name] = 1
        result_elements.append(adjusted_element)

    return result_elements
def _append_number(element: IliasPageElement, number: int) -> IliasPageElement:
    """
    Build a copy of the element whose name carries the given number.

    The number is inserted directly in front of the last "." of the name, or
    appended at the end if the name contains no ".". It is joined with a space
    if the name contains a space anywhere, and with an underscore otherwise.
    All other fields are taken over from the given element unchanged.

    Note: a name whose only "." is its first character (e.g. ".hidden") gets
    the number placed in front of that dot.
    """
    original_name = element.name
    joiner = " " if " " in original_name else "_"

    # rpartition yields an empty separator exactly when the name has no "."
    stem, dot, extension = original_name.rpartition(".")
    if dot:
        renamed = f"{stem}{joiner}{number}{dot}{extension}"
    else:
        renamed = f"{original_name}{joiner}{number}"

    return IliasPageElement(
        element.type, element.url, renamed, element.mtime, element.description
    )