From 8cfa818f04e97713ffd15f9a39e07728211042d8 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Wed, 19 May 2021 21:57:55 +0200
Subject: [PATCH] Only call should_crawl once

---
 PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
index 9c7793c..82ca8d7 100644
--- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
@@ -130,6 +130,12 @@ class KitIliasWebCrawler(HttpCrawler):
     @arepeat(3)
     @anoncritical
     async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
+        # We might not want to crawl this directory-ish page.
+        # This is not in #handle_element, as the download methods check it themselves and therefore
+        # would perform this check twice - messing with the explain output
+        if not self.should_crawl(path):
+            return
+
         tasks = []
         async with self.crawl_bar(path):
             soup = await self._get_page(url)