Mirror of https://github.com/Garmelon/PFERD.git, synced 2025-11-04 14:42:49 +01:00
	Use cl/dl deduplication mechanism for ILIAS crawler
@@ -567,55 +567,3 @@ def _tomorrow() -> date:
 
 def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-").strip()
-
-
-def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]:
-    """
-    De-duplicates element names by appending an incrementing number to later elements:
-      test.pdf
-      test.pdf
-    would result in
-      test.pdf
-      test_1.pdf
-
-    It is also space-aware:
-      "te st.pdf"
-      "te st.pdf"
-    would result in
-      "tes st.pdf"
-      "tes st 1.pdf"
-    """
-    known_names = dict()
-    result_elements = []
-
-    for element in elements:
-        # This file is new - add it and mark its name as used
-        if element.name not in known_names:
-            known_names[element.name] = 1
-            result_elements.append(element)
-            continue
-
-        # This file is a duplicate. Find a suitable suffix
-        current_counter = known_names[element.name]
-        adjusted_element = _append_number(element, current_counter)
-        # increment the counter so the next duplicate does not conflict
-        known_names[element.name] += 1
-        # also block the new name, so another file with the *renamed* name gets renamed as well
-        known_names[adjusted_element.name] = 1
-
-        result_elements.append(adjusted_element)
-
-    return result_elements
-
-
-def _append_number(element: IliasPageElement, number: int) -> IliasPageElement:
-    extension_index = element.name.rfind(".")
-    suffix = f" {number}" if " " in element.name else f"_{number}"
-    if extension_index < 0:
-        new_name = element.name + suffix
-    else:
-        new_name = element.name[:extension_index] + suffix + element.name[extension_index:]
-
-    return IliasPageElement(
-        element.type, element.url, new_name, element.mtime, element.description
-    )
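For reference, the hunk above removes the manual name deduplication helpers from the ILIAS HTML module. A minimal stand-alone sketch of the same naming rule, operating on plain strings instead of IliasPageElement objects (the names below are illustrative, not PFERD's API), behaves like this:

from typing import Dict, List

def append_suffix(name: str, number: int) -> str:
    # Insert the suffix before the extension; use a space if the name already contains one.
    extension_index = name.rfind(".")
    suffix = f" {number}" if " " in name else f"_{number}"
    if extension_index < 0:
        return name + suffix
    return name[:extension_index] + suffix + name[extension_index:]

def deduplicate_names(names: List[str]) -> List[str]:
    # First occurrence keeps its name; later duplicates get an incrementing suffix,
    # and the renamed result is also blocked so it cannot collide either.
    known: Dict[str, int] = {}
    result: List[str] = []
    for name in names:
        if name not in known:
            known[name] = 1
            result.append(name)
            continue
        adjusted = append_suffix(name, known[name])
        known[name] += 1
        known[adjusted] = 1
        result.append(adjusted)
    return result

print(deduplicate_names(["test.pdf", "test.pdf", "test.pdf"]))  # ['test.pdf', 'test_1.pdf', 'test_2.pdf']
print(deduplicate_names(["te st.pdf", "te st.pdf"]))            # ['te st.pdf', 'te st 1.pdf']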
@@ -15,7 +15,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import CrawlError, CrawlWarning, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
-from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names
+from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
 
 TargetType = Union[str, int]
 
@@ -226,7 +226,6 @@ class KitIliasWebCrawler(HttpCrawler):
 
         # Fill up our task list with the found elements
         await gather_elements()
-        elements = deduplicate_element_names(elements)
         tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
 
         # And execute them
@@ -253,8 +252,7 @@ class KitIliasWebCrawler(HttpCrawler):
 
         # Fill up our task list with the found elements
         await gather_elements()
-        elements = deduplicate_element_names(elements)
-        tasks = [self._handle_ilias_element(path, element) for element in elements]
+        tasks = [self._handle_ilias_element(cl.path, element) for element in elements]
 
         # And execute them
         await self.gather(tasks)
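The remaining hunks drop the call sites in KitIliasWebCrawler: elements are handed to _handle_ilias_element under their crawl path, and duplicate names are left to the crawler's cl/dl (crawl/download) bookkeeping named in the commit title rather than being renamed per page listing. As a rough illustration of that idea only, assuming nothing about PFERD's actual classes, path-level deduplication at the moment a path is claimed could look like this:

from pathlib import PurePath
from typing import Set

class PathDeduplicator:
    # Hypothetical helper for illustration; not a PFERD class.
    def __init__(self) -> None:
        self._claimed: Set[PurePath] = set()

    def claim(self, path: PurePath) -> PurePath:
        # Return the path unchanged if unused, otherwise a renamed variant like "name_1.pdf".
        candidate = path
        counter = 1
        while candidate in self._claimed:
            candidate = path.with_name(f"{path.stem}_{counter}{path.suffix}")
            counter += 1
        self._claimed.add(candidate)
        return candidate

dedup = PathDeduplicator()
print(dedup.claim(PurePath("course/test.pdf")))  # course/test.pdf
print(dedup.claim(PurePath("course/test.pdf")))  # course/test_1.pdf

Deduplicating where a path is claimed covers duplicates from any source, not just a single page's listing, which is presumably why the per-page helper became redundant.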