From bef210ae773d1031affa08599d1fa383f552ab66 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Thu, 23 Apr 2020 12:33:38 +0200
Subject: [PATCH] Rename and implement IliasDirectoryFilter

---
 PFERD/ilias/__init__.py |  2 +-
 PFERD/ilias/crawler.py  | 37 +++++++++++++++++++++++--------------
 PFERD/pferd.py          | 12 ++++++------
 PFERD/utils.py          | 20 +++++++++++++++-----
 4 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/PFERD/ilias/__init__.py b/PFERD/ilias/__init__.py
index 3a983d0..9bda6f8 100644
--- a/PFERD/ilias/__init__.py
+++ b/PFERD/ilias/__init__.py
@@ -3,5 +3,5 @@ Synchronizing files from ILIAS instances (https://www.ilias.de/).
 """
 
 from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator
-from .crawler import IliasCrawler, IliasFilter
+from .crawler import IliasCrawler, IliasDirectoryFilter
 from .downloader import IliasDownloader
diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py
index f6b98c9..dc657f1 100644
--- a/PFERD/ilias/crawler.py
+++ b/PFERD/ilias/crawler.py
@@ -15,15 +15,15 @@
 import bs4
 import requests
 
 from ..cookie_jar import CookieJar
-from ..utils import soupify
+from ..utils import PrettyLogger, soupify
 from .authenticators import IliasAuthenticator
 from .date_demangler import demangle_date
 from .downloader import IliasDownloadInfo
 
 LOGGER = logging.getLogger(__name__)
+PRETTY = PrettyLogger(LOGGER)
-
-IliasFilter = Callable[[Path], bool]
+IliasDirectoryFilter = Callable[[Path], bool]
 
 
 class IliasCrawler:
@@ -36,13 +36,14 @@ class IliasCrawler:
     A crawler for ILIAS.
     """
 
+    # pylint: disable=too-many-arguments
     def __init__(
             self,
             base_url: str,
             course_id: str,
             session: requests.Session,
             authenticator: IliasAuthenticator,
-            filter_: IliasFilter
+            dir_filter: IliasDirectoryFilter
     ):
         """
         Create a new ILIAS crawler.
@@ -52,7 +53,7 @@
         self._course_id = course_id
         self._session = session
         self._authenticator = authenticator
-        self._filter = filter_
+        self.dir_filter = dir_filter
 
     def _abs_url_from_link(self, link_tag: bs4.Tag) -> str:
         """
@@ -153,13 +154,23 @@
     def _switch_on_folder_like(
             self,
-            path: Path,
+            parent_path: Path,
             link_element: bs4.Tag,
             url: str
     ) -> List[IliasDownloadInfo]:
         """
         Try crawling something that looks like a folder.
         """
+        # pylint: disable=too-many-return-statements
+
+        element_path = Path(parent_path, link_element.getText().strip())
+
+        if not self.dir_filter(element_path):
+            PRETTY.filtered_path(element_path)
+            return []
+
+        LOGGER.info("Searching %r", str(element_path))
+
         found_parent: Optional[bs4.Tag] = None
 
         # We look for the outer div of our inner link, to find information around it
@@ -185,8 +196,6 @@
             LOGGER.debug("Skipping forum at %r", url)
             return []
 
-        element_path = Path(path, link_element.getText().strip())
-
         # An exercise
         if str(img_tag["src"]).endswith("icon_exc.svg"):
             LOGGER.debug("Crawling exercises at %r", url)
@@ -200,7 +209,7 @@
         # Assume it is a folder
         return self._crawl_folder(element_path, self._abs_url_from_link(link_element))
 
-    def _crawl_video_directory(self, path: Path, url: str) -> List[IliasDownloadInfo]:
+    def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasDownloadInfo]:
         """
         Crawl the video overview site.
         """
@@ -224,11 +233,11 @@
         results: List[IliasDownloadInfo] = []
 
         for link in video_links:
-            results += self._crawl_single_video(path, link)
+            results += self._crawl_single_video(video_dir_path, link)
 
         return results
 
-    def _crawl_single_video(self, path: Path, link: bs4.Tag) -> List[IliasDownloadInfo]:
+    def _crawl_single_video(self, parent_path: Path, link: bs4.Tag) -> List[IliasDownloadInfo]:
         """
         Crawl a single video based on its "Abspielen" link from the video listing.
         """
@@ -267,7 +276,7 @@
         # and fetch the video url!
         video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"]
 
-        return [IliasDownloadInfo(Path(path, title), video_url, modification_time)]
+        return [IliasDownloadInfo(Path(parent_path, title), video_url, modification_time)]
 
     def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasDownloadInfo]:
         """
@@ -318,7 +327,7 @@
         return results
 
-    def _crawl_folder(self, path: Path, url: str) -> List[IliasDownloadInfo]:
+    def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasDownloadInfo]:
         """
         Crawl all files in a folder-like element.
         """
 
@@ -330,7 +339,7 @@
         links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle")
         for link in links:
             abs_url = self._abs_url_from_link(link)
-            result += self._switch_on_crawled_type(path, link, abs_url)
+            result += self._switch_on_crawled_type(folder_path, link, abs_url)
 
         return result
 
diff --git a/PFERD/pferd.py b/PFERD/pferd.py
index 5fd39ef..adbc9bf 100644
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -2,8 +2,8 @@
 from pathlib import Path
 from typing import Optional
 from .cookie_jar import CookieJar
-from .ilias import (IliasAuthenticator, IliasCrawler, IliasDownloader,
-                    IliasFilter, KitShibbolethAuthenticator)
+from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter,
+                    IliasDownloader, KitShibbolethAuthenticator)
 from .organizer import Organizer
 from .tmp_dir import TmpDir
 from .transform import Transform, apply_transform
@@ -25,7 +25,7 @@
             course_id: str,
             authenticator: IliasAuthenticator,
             cookies: Optional[Path],
-            filter_: IliasFilter,
+            dir_filter: IliasDirectoryFilter,
             transform: Transform,
     ) -> None:
         cookie_jar = CookieJar(cookies)
@@ -33,7 +33,7 @@
         tmp_dir = self._tmp_dir.new_subdir()
         organizer = Organizer(self.resolve(target))
 
-        crawler = IliasCrawler(base_url, course_id, session, authenticator, filter_)
+        crawler = IliasCrawler(base_url, course_id, session, authenticator, dir_filter)
         downloader = IliasDownloader(tmp_dir, organizer, session, authenticator)
 
         cookie_jar.load_cookies()
@@ -46,7 +46,7 @@
             self,
             target: Path,
             course_id: str,
-            filter_: IliasFilter = lambda x: True,
+            dir_filter: IliasDirectoryFilter = lambda x: True,
             transform: Transform = lambda x: x,
             cookies: Optional[Path] = None,
             username: Optional[str] = None,
@@ -60,6 +60,6 @@
             course_id=course_id,
             authenticator=authenticator,
             cookies=cookies,
-            filter_=filter_,
+            dir_filter=dir_filter,
             transform=transform,
         )
diff --git a/PFERD/utils.py b/PFERD/utils.py
index 7b8e86c..730c21c 100644
--- a/PFERD/utils.py
+++ b/PFERD/utils.py
@@ -127,7 +127,8 @@ class PrettyLogger:
         """
 
         self.logger.info(
-            f"{Fore.MAGENTA}{Style.BRIGHT}Modified {file_name}.{Style.RESET_ALL}")
+            f"{Fore.MAGENTA}{Style.BRIGHT}Modified {str(file_name)!r}.{Style.RESET_ALL}"
+        )
 
     def new_file(self, file_name: Path) -> None:
         """
@@ -135,14 +136,23 @@
         """
 
         self.logger.info(
-            f"{Fore.GREEN}{Style.BRIGHT}Created {file_name}.{Style.RESET_ALL}")
+            f"{Fore.GREEN}{Style.BRIGHT}Created {str(file_name)!r}.{Style.RESET_ALL}")
 
     def ignored_file(self, file_name: Path) -> None:
         """
-        Nothing in particular happened to this file.
+        Nothing in particular happened to this file or directory.
         """
 
-        self.logger.info(f"{Style.DIM}Ignored {file_name}.{Style.RESET_ALL}")
+        self.logger.info(f"{Style.DIM}Ignored {str(file_name)!r}.{Style.RESET_ALL}")
+
+    def filtered_path(self, path: Path) -> None:
+        """
+        A crawler filter rejected the given path.
+        """
+
+        self.logger.info(
+            f"{Style.DIM}Not considering {str(path)!r} due to filter rules.{Style.RESET_ALL}"
+        )
 
     def starting_synchronizer(
             self,
@@ -157,6 +167,6 @@
         subject_str = f"{subject} " if subject else ""
         self.logger.info("")
         self.logger.info((
-            f"{Fore.CYAN}{Style.BRIGHT}Synchronizing {subject_str}to {target_directory}"
+            f"{Fore.CYAN}{Style.BRIGHT}Synchronizing {subject_str}to {str(target_directory)!r}"
             f" using the {synchronizer_name} synchronizer.{Style.RESET_ALL}"
        ))