From ee67f9f4725be9f418d66b85bb8a749de8e5d713 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Tue, 6 Jul 2021 17:45:12 +0200
Subject: [PATCH] Sort elements by ILIAS id to ensure deterministic ordering

---
 PFERD/crawl/ilias/kit_ilias_html.py        | 11 +++++++++++
 PFERD/crawl/ilias/kit_ilias_web_crawler.py |  4 ++++
 2 files changed, 15 insertions(+)

diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py
index 247002b..7e91926 100644
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -38,6 +38,17 @@ class IliasPageElement:
     mtime: Optional[datetime] = None
     description: Optional[str] = None
 
+    def id(self) -> str:
+        regexes = [r"eid=(?P<id>[0-9a-z\-]+)", r"file_(?P<id>\d+)", r"ref_id=(?P<id>\d+)"]
+
+        for regex in regexes:
+            if match := re.search(regex, self.url):
+                return match.groupdict()["id"]
+
+        # Fall back to URL
+        log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.")
+        return self.url
+
 
 class IliasPage:
 
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index a0e323b..cca6987 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -230,6 +230,8 @@ instance's greatest bottleneck.
 
         # Fill up our task list with the found elements
         await gather_elements()
+        elements.sort(key=lambda e: e.id())
+
         tasks: List[Awaitable[None]] = []
         for element in elements:
             if handle := await self._handle_ilias_element(PurePath("."), element):
@@ -280,6 +282,8 @@ instance's greatest bottleneck.
 
         # Fill up our task list with the found elements
         await gather_elements()
+        elements.sort(key=lambda e: e.id())
+
         tasks: List[Awaitable[None]] = []
         for element in elements:
             if handle := await self._handle_ilias_element(cl.path, element):