Use cl/dl deduplication mechanism for ILIAS crawler

I-Al-Istannen 2021-05-25 12:15:38 +02:00
parent bce3dc384d
commit 651b087932
2 changed files with 2 additions and 56 deletions

View File

@@ -567,55 +567,3 @@ def _tomorrow() -> date:
 def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-").strip()
-
-
-def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]:
-    """
-    De-duplicates element names by appending an incrementing number to later elements:
-      test.pdf
-      test.pdf
-    would result in
-      test.pdf
-      test_1.pdf
-
-    It is also space-aware:
-      "te st.pdf"
-      "te st.pdf"
-    would result in
-      "te st.pdf"
-      "te st 1.pdf"
-    """
-    known_names = dict()
-    result_elements = []
-
-    for element in elements:
-        # This file is new - add it and mark its name as used
-        if element.name not in known_names:
-            known_names[element.name] = 1
-            result_elements.append(element)
-            continue
-
-        # This file is a duplicate. Find a suitable suffix
-        current_counter = known_names[element.name]
-        adjusted_element = _append_number(element, current_counter)
-
-        # increment the counter so the next duplicate does not conflict
-        known_names[element.name] += 1
-
-        # also block the new name, so another file with the *renamed* name gets renamed as well
-        known_names[adjusted_element.name] = 1
-
-        result_elements.append(adjusted_element)
-
-    return result_elements
-
-
-def _append_number(element: IliasPageElement, number: int) -> IliasPageElement:
-    extension_index = element.name.rfind(".")
-    suffix = f" {number}" if " " in element.name else f"_{number}"
-
-    if extension_index < 0:
-        new_name = element.name + suffix
-    else:
-        new_name = element.name[:extension_index] + suffix + element.name[extension_index:]
-
-    return IliasPageElement(
-        element.type, element.url, new_name, element.mtime, element.description
-    )
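For reference, here is a runnable sketch (not part of the commit) of the renaming behaviour the removed helpers implemented. "Element" is a minimal stand-in for IliasPageElement, which in the real code also carries type, url, mtime and description.

# Standalone sketch reproducing the behaviour of the removed
# deduplicate_element_names/_append_number helpers.
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class Element:
    name: str


def deduplicate_names(elements: List[Element]) -> List[Element]:
    known: Dict[str, int] = {}
    result: List[Element] = []
    for element in elements:
        if element.name not in known:
            # First occurrence keeps its name.
            known[element.name] = 1
            result.append(element)
            continue
        # Duplicate: append "_<n>" (or " <n>" for names containing spaces)
        # before the file extension.
        counter = known[element.name]
        suffix = f" {counter}" if " " in element.name else f"_{counter}"
        dot = element.name.rfind(".")
        if dot < 0:
            new_name = element.name + suffix
        else:
            new_name = element.name[:dot] + suffix + element.name[dot:]
        known[element.name] += 1
        known[new_name] = 1  # block the renamed name as well
        result.append(Element(new_name))
    return result


print([e.name for e in deduplicate_names([Element("test.pdf"), Element("test.pdf")])])
# ['test.pdf', 'test_1.pdf']
print([e.name for e in deduplicate_names([Element("te st.pdf"), Element("te st.pdf")])])
# ['te st.pdf', 'te st 1.pdf']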

View File

@@ -15,7 +15,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlWarning, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
-from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names
+from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
 
 TargetType = Union[str, int]
@@ -226,7 +226,6 @@ class KitIliasWebCrawler(HttpCrawler):
 
         # Fill up our task list with the found elements
         await gather_elements()
-        elements = deduplicate_element_names(elements)
         tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
 
         # And execute them
@@ -253,8 +252,7 @@ class KitIliasWebCrawler(HttpCrawler):
 
         # Fill up our task list with the found elements
         await gather_elements()
-        elements = deduplicate_element_names(elements)
-        tasks = [self._handle_ilias_element(path, element) for element in elements]
+        tasks = [self._handle_ilias_element(cl.path, element) for element in elements]
 
         # And execute them
         await self.gather(tasks)
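
The commit title refers to deduplication now being handled by the cl/dl (crawl/download) layer, which makes the ILIAS-specific pre-pass redundant. The sketch below is only a hypothetical illustration of that general idea: output paths are reserved first come, first served, and collisions are renamed at reservation time. OutputDir and reserve are made-up names for illustration, not PFERD's actual API.

# Hypothetical sketch of deduplicating where output paths are reserved,
# instead of renaming elements during page parsing. Not PFERD's real code.
from pathlib import PurePath
from typing import Set


class OutputDir:
    def __init__(self) -> None:
        self._reserved: Set[PurePath] = set()

    def reserve(self, path: PurePath) -> PurePath:
        # First come, first served; later collisions get a numeric suffix.
        candidate = path
        counter = 1
        while candidate in self._reserved:
            candidate = path.with_name(f"{path.stem}_{counter}{path.suffix}")
            counter += 1
        self._reserved.add(candidate)
        return candidate


out = OutputDir()
print(out.reserve(PurePath("folder/test.pdf")))  # folder/test.pdf
print(out.reserve(PurePath("folder/test.pdf")))  # folder/test_1.pdf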