mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
De-duplicate element names in ILIAS crawler
This prevents conflicts caused by multiple files sharing the same name. Conflicts may still arise from transforms, but those are outside our control and count as user error.
This commit is contained in:
@ -15,7 +15,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
|
||||
from ..crawler import CrawlError, CrawlWarning, anoncritical
|
||||
from ..http_crawler import HttpCrawler, HttpCrawlerSection
|
||||
from .file_templates import link_template_plain, link_template_rich
|
||||
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
|
||||
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names
|
||||
|
||||
TargetType = Union[str, int]
|
||||
|
||||
@ -214,6 +214,7 @@ class KitIliasWebCrawler(HttpCrawler):
|
||||
|
||||
# Fill up our task list with the found elements
|
||||
await gather_elements()
|
||||
elements = deduplicate_element_names(elements)
|
||||
tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
|
||||
|
||||
# And execute them
|
||||
@ -240,6 +241,7 @@ class KitIliasWebCrawler(HttpCrawler):
|
||||
|
||||
# Fill up our task list with the found elements
|
||||
await gather_elements()
|
||||
elements = deduplicate_element_names(elements)
|
||||
tasks = [self._handle_ilias_element(path, element) for element in elements]
|
||||
|
||||
# And execute them
|
||||
|
Reference in New Issue
Block a user