mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Use cl/dl deduplication mechanism for ILIAS crawler
This commit is contained in:
parent
bce3dc384d
commit
651b087932
@ -567,55 +567,3 @@ def _tomorrow() -> date:
|
|||||||
|
|
||||||
def _sanitize_path_name(name: str) -> str:
|
def _sanitize_path_name(name: str) -> str:
|
||||||
return name.replace("/", "-").replace("\\", "-").strip()
|
return name.replace("/", "-").replace("\\", "-").strip()
|
||||||
|
|
||||||
|
|
||||||
def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]:
|
|
||||||
"""
|
|
||||||
De-duplicates element names by appending an incrementing number to later elements:
|
|
||||||
test.pdf
|
|
||||||
test.pdf
|
|
||||||
would result in
|
|
||||||
test.pdf
|
|
||||||
test_1.pdf
|
|
||||||
|
|
||||||
It is also space-aware:
|
|
||||||
"te st.pdf"
|
|
||||||
"te st.pdf"
|
|
||||||
would result in
|
|
||||||
"te st.pdf"
|
|
||||||
"te st 1.pdf"
|
|
||||||
"""
|
|
||||||
known_names = dict()
|
|
||||||
result_elements = []
|
|
||||||
|
|
||||||
for element in elements:
|
|
||||||
# This file is new - add it and mark its name as used
|
|
||||||
if element.name not in known_names:
|
|
||||||
known_names[element.name] = 1
|
|
||||||
result_elements.append(element)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# This file is a duplicate. Find a suitable suffix
|
|
||||||
current_counter = known_names[element.name]
|
|
||||||
adjusted_element = _append_number(element, current_counter)
|
|
||||||
# increment the counter so the next duplicate does not conflict
|
|
||||||
known_names[element.name] += 1
|
|
||||||
# also block the new name, so another file with the *renamed* name gets renamed as well
|
|
||||||
known_names[adjusted_element.name] = 1
|
|
||||||
|
|
||||||
result_elements.append(adjusted_element)
|
|
||||||
|
|
||||||
return result_elements
|
|
||||||
|
|
||||||
|
|
||||||
def _append_number(element: IliasPageElement, number: int) -> IliasPageElement:
|
|
||||||
extension_index = element.name.rfind(".")
|
|
||||||
suffix = f" {number}" if " " in element.name else f"_{number}"
|
|
||||||
if extension_index < 0:
|
|
||||||
new_name = element.name + suffix
|
|
||||||
else:
|
|
||||||
new_name = element.name[:extension_index] + suffix + element.name[extension_index:]
|
|
||||||
|
|
||||||
return IliasPageElement(
|
|
||||||
element.type, element.url, new_name, element.mtime, element.description
|
|
||||||
)
|
|
||||||
|
@ -15,7 +15,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
|
|||||||
from ..crawler import CrawlError, CrawlWarning, anoncritical
|
from ..crawler import CrawlError, CrawlWarning, anoncritical
|
||||||
from ..http_crawler import HttpCrawler, HttpCrawlerSection
|
from ..http_crawler import HttpCrawler, HttpCrawlerSection
|
||||||
from .file_templates import Links
|
from .file_templates import Links
|
||||||
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names
|
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
|
||||||
|
|
||||||
TargetType = Union[str, int]
|
TargetType = Union[str, int]
|
||||||
|
|
||||||
@ -226,7 +226,6 @@ class KitIliasWebCrawler(HttpCrawler):
|
|||||||
|
|
||||||
# Fill up our task list with the found elements
|
# Fill up our task list with the found elements
|
||||||
await gather_elements()
|
await gather_elements()
|
||||||
elements = deduplicate_element_names(elements)
|
|
||||||
tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
|
tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
|
||||||
|
|
||||||
# And execute them
|
# And execute them
|
||||||
@ -253,8 +252,7 @@ class KitIliasWebCrawler(HttpCrawler):
|
|||||||
|
|
||||||
# Fill up our task list with the found elements
|
# Fill up our task list with the found elements
|
||||||
await gather_elements()
|
await gather_elements()
|
||||||
elements = deduplicate_element_names(elements)
|
tasks = [self._handle_ilias_element(cl.path, element) for element in elements]
|
||||||
tasks = [self._handle_ilias_element(path, element) for element in elements]
|
|
||||||
|
|
||||||
# And execute them
|
# And execute them
|
||||||
await self.gather(tasks)
|
await self.gather(tasks)
|
||||||
|
Loading…
Reference in New Issue
Block a user