From 8cfa818f04e97713ffd15f9a39e07728211042d8 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen <i-al-istannen@users.noreply.github.com>
Date: Wed, 19 May 2021 21:57:55 +0200
Subject: [PATCH] Only call should_crawl once

---
 PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
index 9c7793c..82ca8d7 100644
--- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py
@@ -130,6 +130,12 @@ class KitIliasWebCrawler(HttpCrawler):
     @arepeat(3)
     @anoncritical
     async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
+        # We might not want to crawl this directory-like page.
+        # This check is not in #handle_element because the download methods already perform it
+        # themselves; checking there too would run it twice and mess up the explain output.
+        if not self.should_crawl(path):
+            return
+
         tasks = []
         async with self.crawl_bar(path):
             soup = await self._get_page(url)