From 651b0879320500927a13f732b0bc070afbfa3ac2 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Tue, 25 May 2021 12:15:38 +0200
Subject: [PATCH] Use cl/dl deduplication mechanism for ILIAS crawler

---
 PFERD/crawl/ilias/kit_ilias_html.py        | 52 ----------------------
 PFERD/crawl/ilias/kit_ilias_web_crawler.py |  6 +--
 2 files changed, 2 insertions(+), 56 deletions(-)

diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index 43d66b5..032bb27 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -567,55 +567,3 @@ def _tomorrow() -> date:
 
 def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-").strip()
-
-
-def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]:
-    """
-    De-duplicates element names by appending an incrementing number to later elements:
-    test.pdf
-    test.pdf
-    would result in
-    test.pdf
-    test_1.pdf
-
-    It is also space-aware:
-    "te st.pdf"
-    "te st.pdf"
-    would result in
-    "te st.pdf"
-    "te st 1.pdf"
-    """
-    known_names = dict()
-    result_elements = []
-
-    for element in elements:
-        # This file is new - add it and mark its name as used
-        if element.name not in known_names:
-            known_names[element.name] = 1
-            result_elements.append(element)
-            continue
-
-        # This file is a duplicate. Find a suitable suffix
-        current_counter = known_names[element.name]
-        adjusted_element = _append_number(element, current_counter)
-        # increment the counter so the next duplicate does not conflict
-        known_names[element.name] += 1
-        # also block the new name, so another file with the *renamed* name gets renamed as well
-        known_names[adjusted_element.name] = 1
-
-        result_elements.append(adjusted_element)
-
-    return result_elements
-
-
-def _append_number(element: IliasPageElement, number: int) -> IliasPageElement:
-    extension_index = element.name.rfind(".")
-    suffix = f" {number}" if " " in element.name else f"_{number}"
-    if extension_index < 0:
-        new_name = element.name + suffix
-    else:
-        new_name = element.name[:extension_index] + suffix + element.name[extension_index:]
-
-    return IliasPageElement(
-        element.type, element.url, new_name, element.mtime, element.description
-    )
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index 318fa5e..daafc12 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -15,7 +15,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlWarning, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
-from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names
+from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
 
 TargetType = Union[str, int]
 
@@ -226,7 +226,6 @@ class KitIliasWebCrawler(HttpCrawler):
 
         # Fill up our task list with the found elements
         await gather_elements()
-        elements = deduplicate_element_names(elements)
         tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
 
         # And execute them
@@ -253,8 +252,7 @@ class KitIliasWebCrawler(HttpCrawler):
 
         # Fill up our task list with the found elements
         await gather_elements()
-        elements = deduplicate_element_names(elements)
-        tasks = [self._handle_ilias_element(path, element) for element in elements]
+        tasks = [self._handle_ilias_element(cl.path, element) for element in elements]
 
         # And execute them
         await self.gather(tasks)
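
Note: "cl/dl" in the subject appears to refer to the crawl (cl) and download
(dl) tokens handed out by PFERD's generic crawler layer, which the last hunk
already touches via cl.path. Each token claims an output path centrally, so
colliding names can be renamed once, for every backend, rather than per ILIAS
page. That layer is not part of this patch, so the following is only a minimal
standalone sketch of such path claiming, reusing the suffix convention of the
removed helper; PathDeduplicator and claim() are illustrative names, not
PFERD's actual API.

    # Hypothetical sketch, NOT PFERD's real implementation: central path
    # claiming of the kind a cl/dl token layer is assumed to perform. It
    # mirrors the suffix convention of the removed deduplicate_element_names
    # helper: "_N" before the extension, or " N" if the name contains spaces.
    from pathlib import PurePath
    from typing import Set


    class PathDeduplicator:
        def __init__(self) -> None:
            self._claimed: Set[PurePath] = set()

        def claim(self, path: PurePath) -> PurePath:
            """Return path, renamed with a numbered suffix if already taken."""
            candidate = path
            counter = 1
            while candidate in self._claimed:
                stem, dot, ext = path.name.rpartition(".")
                sep = " " if " " in path.name else "_"
                if dot:
                    new_name = f"{stem}{sep}{counter}.{ext}"
                else:
                    new_name = f"{path.name}{sep}{counter}"
                candidate = path.with_name(new_name)
                counter += 1
            # Claimed names are blocked too, so a file that is *already* named
            # like a renamed duplicate also gets deduplicated.
            self._claimed.add(candidate)
            return candidate


    dedup = PathDeduplicator()
    print(dedup.claim(PurePath("course/test.pdf")))    # course/test.pdf
    print(dedup.claim(PurePath("course/test.pdf")))    # course/test_1.pdf
    print(dedup.claim(PurePath("course/te st.pdf")))   # course/te st.pdf
    print(dedup.claim(PurePath("course/te st.pdf")))   # course/te st 1.pdf

Claiming paths at the point where the tokens are created gives every crawler
backend deduplication for free, which is presumably what lets this patch
delete the ILIAS-specific helper outright instead of porting it.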