# PFERD/crawl/ilias/kit_ilias_html.py -- method belonging to IliasPageElement
# (shown here at top level; it reads ``self.url`` and ``self.name``).
def id(self) -> str:
    """Return an identifier for this element derived from its URL.

    The URL is searched with each pattern below, in order, and the first
    hit wins: an ``eid=`` value, then a ``file_<digits>`` token, then a
    ``ref_id=<digits>`` query parameter. If none of them match, a warning
    is logged and the full URL is returned as the identifier instead.
    """
    # Every pattern must expose its capture as the named group "id";
    # the lookup below depends on that name.
    id_patterns = (
        r"eid=(?P<id>[0-9a-z\-]+)",
        r"file_(?P<id>\d+)",
        r"ref_id=(?P<id>\d+)",
    )

    for pattern in id_patterns:
        if match := re.search(pattern, self.url):
            return match.group("id")

    # Fall back to URL
    log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.")
    return self.url


# PFERD/crawl/ilias/kit_ilias_web_crawler.py -- both places that fill the
# element list via ``await gather_elements()`` sort it by this identifier
# before building their task lists:
#
#     elements.sort(key=lambda e: e.id())