mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
De-duplicate element names in ILIAS crawler
This prevents any conflicts caused by multiple files with the same name. Conflicts may still arise due to transforms, but that is out of our control and a user error.
This commit is contained in:
parent
3ab3581f84
commit
fca62541ca
@ -461,3 +461,55 @@ def _tomorrow() -> date:
|
||||
|
||||
def _sanitize_path_name(name: str) -> str:
|
||||
return name.replace("/", "-").replace("\\", "-").strip()
|
||||
|
||||
|
||||
def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]:
    """
    De-duplicates element names by appending an incrementing number to later elements:
      test.pdf
      test.pdf
    would result in
      test.pdf
      test_1.pdf

    It is also space-aware:
      "te st.pdf"
      "te st.pdf"
    would result in
      "te st.pdf"
      "te st 1.pdf"

    A generated name is only used if no earlier element already carries it:
      test_1.pdf
      test.pdf
      test.pdf
    would result in
      test_1.pdf
      test.pdf
      test_2.pdf

    The input list is not modified. Elements keep their relative order; the first
    element with a given name is returned unchanged and later ones are replaced by
    renamed copies created via _append_number.
    """
    # Maps every name handed out so far to the next suffix number to try for it
    known_names = {}
    result_elements = []

    for element in elements:
        # This file is new - add it and mark its name as used
        if element.name not in known_names:
            known_names[element.name] = 1
            result_elements.append(element)
            continue

        # This file is a duplicate. Find a suitable suffix, skipping every candidate
        # that is already taken (by an original element or by an earlier rename).
        # Relies on _append_number yielding a different name for each number.
        current_counter = known_names[element.name]
        adjusted_element = _append_number(element, current_counter)
        while adjusted_element.name in known_names:
            current_counter += 1
            adjusted_element = _append_number(element, current_counter)

        # continue after the number we just used so the next duplicate does not conflict
        known_names[element.name] = current_counter + 1
        # also block the new name, so another file with the *renamed* name gets renamed as well
        known_names[adjusted_element.name] = 1

        result_elements.append(adjusted_element)

    return result_elements
|
||||
|
||||
|
||||
def _append_number(element: IliasPageElement, number: int) -> IliasPageElement:
    """
    Returns a copy of the element whose name carries the given number.

    The number is inserted directly before the last "." in the name, or appended
    at the end if the name contains no ".". It is separated from the rest of the
    name by a space if the name contains a space anywhere, and by an underscore
    otherwise. All other fields are taken over from the passed element.
    """
    original_name = element.name
    separator = " " if " " in original_name else "_"

    # rpartition yields an empty "dot" part when the name contains no "." at all
    stem, dot, extension = original_name.rpartition(".")
    if dot:
        renamed = f"{stem}{separator}{number}.{extension}"
    else:
        renamed = f"{original_name}{separator}{number}"

    return IliasPageElement(
        element.type, element.url, renamed, element.mtime, element.description
    )
|
||||
|
@ -15,7 +15,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
|
||||
from ..crawler import CrawlError, CrawlWarning, anoncritical
|
||||
from ..http_crawler import HttpCrawler, HttpCrawlerSection
|
||||
from .file_templates import link_template_plain, link_template_rich
|
||||
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
|
||||
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names
|
||||
|
||||
TargetType = Union[str, int]
|
||||
|
||||
@ -214,6 +214,7 @@ class KitIliasWebCrawler(HttpCrawler):
|
||||
|
||||
# Fill up our task list with the found elements
|
||||
await gather_elements()
|
||||
elements = deduplicate_element_names(elements)
|
||||
tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
|
||||
|
||||
# And execute them
|
||||
@ -240,6 +241,7 @@ class KitIliasWebCrawler(HttpCrawler):
|
||||
|
||||
# Fill up our task list with the found elements
|
||||
await gather_elements()
|
||||
elements = deduplicate_element_names(elements)
|
||||
tasks = [self._handle_ilias_element(path, element) for element in elements]
|
||||
|
||||
# And execute them
|
||||
|
Loading…
x
Reference in New Issue
Block a user