From 13bc78c889860ecaf46a68f5e274cb44c9c37082 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 23 Apr 2020 13:54:58 +0200 Subject: [PATCH] Display reason for ignoring an element in ilias crawler --- PFERD/ilias/crawler.py | 8 +++++++- PFERD/utils.py | 6 ++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index ad95d35..216be6e 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -162,7 +162,7 @@ class IliasCrawler: element_path = Path(parent_path, link_element.getText().strip()) if not self.dir_filter(element_path): - PRETTY.filtered_path(element_path) + PRETTY.filtered_path(element_path, "user filter") return [] LOGGER.info("Searching %r", str(element_path)) @@ -190,6 +190,7 @@ class IliasCrawler: # A forum if str(img_tag["src"]).endswith("frm.svg"): LOGGER.debug("Skipping forum at %r", url) + PRETTY.filtered_path(element_path, "forum") return [] # An exercise @@ -197,6 +198,11 @@ class IliasCrawler: LOGGER.debug("Crawling exercises at %r", url) return self._crawl_exercises(element_path, url) + if str(img_tag["src"]).endswith("icon_webr.svg"): + LOGGER.debug("Skipping external link at %r", url) + PRETTY.filtered_path(element_path, "external link") + return [] + # Match the opencast video plugin if "opencast" in str(img_tag["alt"]).lower(): LOGGER.debug("Found video site: %r", url) diff --git a/PFERD/utils.py b/PFERD/utils.py index 730c21c..7d32213 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -145,13 +145,15 @@ class PrettyLogger: self.logger.info(f"{Style.DIM}Ignored {str(file_name)!r}.{Style.RESET_ALL}") - def filtered_path(self, path: Path) -> None: + def filtered_path(self, path: Path, reason: str) -> None: """ A crawler filter rejected the given path. """ self.logger.info( - f"{Style.DIM}Not considering {str(path)!r} due to filter rules.{Style.RESET_ALL}" + f"{Style.DIM}Not considering {str(path)!r} due to filter rules" + f" ({Style.NORMAL}{reason}{Style.DIM})." + f"{Style.RESET_ALL}" ) def starting_synchronizer(