From 9850ab1d73f3de722c1d5af0b3736333c31b5b72 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 10 May 2020 12:16:42 +0200 Subject: [PATCH] Allow crawling the ILIAS Personal Desktop --- PFERD/ilias/crawler.py | 31 +++++++++++++++------ PFERD/pferd.py | 63 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 79 insertions(+), 15 deletions(-) diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index 0f7d4f6..8f48973 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -38,7 +38,6 @@ class IliasCrawler: def __init__( self, base_url: str, - course_id: str, session: requests.Session, authenticator: IliasAuthenticator, dir_filter: IliasDirectoryFilter @@ -48,7 +47,6 @@ class IliasCrawler: """ self._base_url = base_url - self._course_id = course_id self._session = session self._authenticator = authenticator self.dir_filter = dir_filter @@ -71,17 +69,23 @@ class IliasCrawler: return urlunsplit((scheme, netloc, path, new_query_string, fragment)) - def crawl(self) -> List[IliasDownloadInfo]: - """ - Starts the crawl process, yielding a list of elements to (potentially) download. + def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]: """ + Starts the crawl process for a course, yielding a list of elements to (potentially) + download. + Arguments: + course_id {str} -- the course id + + Raises: + FatalException: if an unrecoverable error occurs or the course id is not valid + """ # Start crawling at the given course root_url = self._url_set_query_param( - self._base_url + "/goto.php", "target", f"crs_{self._course_id}" + self._base_url + "/goto.php", "target", f"crs_{course_id}" ) - if not self._is_course_id_valid(root_url): + if not self._is_course_id_valid(root_url, course_id): raise FatalException( "Invalid course id? The URL the server returned did not contain my id." 
) @@ -89,9 +93,18 @@ class IliasCrawler: # And treat it as a folder return self._crawl_folder(Path(""), root_url) - def _is_course_id_valid(self, root_url: str) -> bool: + def _is_course_id_valid(self, root_url: str, course_id: str) -> bool: response: requests.Response = self._session.get(root_url) - return self._course_id in response.url + return course_id in response.url + + def crawl_personal_desktop(self) -> List[IliasDownloadInfo]: + """ + Crawls the ILIAS personal desktop (and every subelement that can be reached from there). + + Raises: + FatalException: if an unrecoverable error occurs + """ + return self._crawl_folder(Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI") def _switch_on_crawled_type( self, diff --git a/PFERD/pferd.py b/PFERD/pferd.py index b2e1cb4..79d6e4f 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -4,14 +4,14 @@ Convenience functions for using PFERD. import logging from pathlib import Path -from typing import List, Optional, Union +from typing import Callable, List, Optional, Union from .cookie_jar import CookieJar from .diva import (DivaDownloader, DivaDownloadStrategy, DivaPlaylistCrawler, diva_download_new) from .errors import FatalException, swallow_and_print_errors from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter, - IliasDownloader, IliasDownloadStrategy, + IliasDownloader, IliasDownloadInfo, IliasDownloadStrategy, KitShibbolethAuthenticator, download_modified_or_new) from .location import Location from .logging import PrettyLogger @@ -56,7 +56,7 @@ class Pferd(Location): self, target: PathLike, base_url: str, - course_id: str, + crawl_function: Callable[[IliasCrawler], List[IliasDownloadInfo]], authenticator: IliasAuthenticator, cookies: Optional[PathLike], dir_filter: IliasDirectoryFilter, @@ -70,11 +70,11 @@ class Pferd(Location): tmp_dir = self._tmp_dir.new_subdir() organizer = Organizer(self.resolve(to_path(target))) - crawler = IliasCrawler(base_url, course_id, session, authenticator, 
dir_filter) + crawler = IliasCrawler(base_url, session, authenticator, dir_filter) downloader = IliasDownloader(tmp_dir, organizer, session, authenticator, download_strategy) cookie_jar.load_cookies() - info = crawler.crawl() + info = crawl_function(crawler) cookie_jar.save_cookies() transformed = apply_transform(transform, info) @@ -134,7 +134,58 @@ class Pferd(Location): return self._ilias( target=target, base_url="https://ilias.studium.kit.edu/", - course_id=course_id, + crawl_function=lambda crawler: crawler.crawl_course(course_id), + authenticator=authenticator, + cookies=cookies, + dir_filter=dir_filter, + transform=transform, + download_strategy=download_strategy, + clean=clean, + ) + + @swallow_and_print_errors + def ilias_kit_personal_desktop( + self, + target: PathLike, + dir_filter: IliasDirectoryFilter = lambda x: True, + transform: Transform = lambda x: x, + cookies: Optional[PathLike] = None, + username: Optional[str] = None, + password: Optional[str] = None, + download_strategy: IliasDownloadStrategy = download_modified_or_new, + clean: bool = True, + ) -> Organizer: + """ + Synchronizes a folder with the ILIAS instance of the KIT. This method will crawl the ILIAS + "personal desktop" instead of a single course. + + Arguments: + target {Path} -- the target path to write the data to + + Keyword Arguments: + dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the + crawler level, these directories and all of their content is skipped. + (default: {lambda x: True}) + transform {Transform} -- A transformation function for the output paths. Return None + to ignore a file. (default: {lambda x: x}) + cookies {Optional[Path]} -- The path to store and load cookies from. + (default: {None}) + username {Optional[str]} -- The SCC username. If none is given, it will prompt + the user. (default: {None}) + password {Optional[str]} -- The SCC password. If none is given, it will prompt + the user. 
(default: {None}) + download_strategy {DownloadStrategy} -- A function to determine which files need to + be downloaded. Can save bandwidth and reduce the number of requests. + (default: {download_modified_or_new}) + clean {bool} -- Whether to clean up when the method finishes. + """ + # This authenticator only works with the KIT ilias instance. + authenticator = KitShibbolethAuthenticator(username=username, password=password) + PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop") + return self._ilias( + target=target, + base_url="https://ilias.studium.kit.edu/", + crawl_function=lambda crawler: crawler.crawl_personal_desktop(), authenticator=authenticator, cookies=cookies, dir_filter=dir_filter,