Allow crawling the ILIAS Personal Desktop

This commit is contained in:
I-Al-Istannen 2020-05-10 12:16:42 +02:00
parent 9950144e97
commit 9850ab1d73
2 changed files with 79 additions and 15 deletions

View File

@ -38,7 +38,6 @@ class IliasCrawler:
def __init__( def __init__(
self, self,
base_url: str, base_url: str,
course_id: str,
session: requests.Session, session: requests.Session,
authenticator: IliasAuthenticator, authenticator: IliasAuthenticator,
dir_filter: IliasDirectoryFilter dir_filter: IliasDirectoryFilter
@ -48,7 +47,6 @@ class IliasCrawler:
""" """
self._base_url = base_url self._base_url = base_url
self._course_id = course_id
self._session = session self._session = session
self._authenticator = authenticator self._authenticator = authenticator
self.dir_filter = dir_filter self.dir_filter = dir_filter
@ -71,17 +69,23 @@ class IliasCrawler:
return urlunsplit((scheme, netloc, path, new_query_string, fragment)) return urlunsplit((scheme, netloc, path, new_query_string, fragment))
def crawl(self) -> List[IliasDownloadInfo]: def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]:
"""
Starts the crawl process, yielding a list of elements to (potentially) download.
""" """
Starts the crawl process for a course, yielding a list of elements to (potentially)
download.
Arguments:
course_id {str} -- the course id
Raises:
FatalException: if an unrecoverable error occurs or the course id is not valid
"""
# Start crawling at the given course # Start crawling at the given course
root_url = self._url_set_query_param( root_url = self._url_set_query_param(
self._base_url + "/goto.php", "target", f"crs_{self._course_id}" self._base_url + "/goto.php", "target", f"crs_{course_id}"
) )
if not self._is_course_id_valid(root_url): if not self._is_course_id_valid(root_url, course_id):
raise FatalException( raise FatalException(
"Invalid course id? The URL the server returned did not contain my id." "Invalid course id? The URL the server returned did not contain my id."
) )
@ -89,9 +93,18 @@ class IliasCrawler:
# And treat it as a folder # And treat it as a folder
return self._crawl_folder(Path(""), root_url) return self._crawl_folder(Path(""), root_url)
def _is_course_id_valid(self, root_url: str) -> bool: def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
response: requests.Response = self._session.get(root_url) response: requests.Response = self._session.get(root_url)
return self._course_id in response.url return course_id in response.url
def crawl_personal_desktop(self) -> List[IliasDownloadInfo]:
"""
Crawls the ILIAS personal desktop (and every subelement that can be reached from there).
Raises:
FatalException: if an unrecoverable error occurs
"""
return self._crawl_folder(Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI")
def _switch_on_crawled_type( def _switch_on_crawled_type(
self, self,

View File

@ -4,14 +4,14 @@ Convenience functions for using PFERD.
import logging import logging
from pathlib import Path from pathlib import Path
from typing import List, Optional, Union from typing import Callable, List, Optional, Union
from .cookie_jar import CookieJar from .cookie_jar import CookieJar
from .diva import (DivaDownloader, DivaDownloadStrategy, DivaPlaylistCrawler, from .diva import (DivaDownloader, DivaDownloadStrategy, DivaPlaylistCrawler,
diva_download_new) diva_download_new)
from .errors import FatalException, swallow_and_print_errors from .errors import FatalException, swallow_and_print_errors
from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter, from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter,
IliasDownloader, IliasDownloadStrategy, IliasDownloader, IliasDownloadInfo, IliasDownloadStrategy,
KitShibbolethAuthenticator, download_modified_or_new) KitShibbolethAuthenticator, download_modified_or_new)
from .location import Location from .location import Location
from .logging import PrettyLogger from .logging import PrettyLogger
@ -56,7 +56,7 @@ class Pferd(Location):
self, self,
target: PathLike, target: PathLike,
base_url: str, base_url: str,
course_id: str, crawl_function: Callable[[IliasCrawler], List[IliasDownloadInfo]],
authenticator: IliasAuthenticator, authenticator: IliasAuthenticator,
cookies: Optional[PathLike], cookies: Optional[PathLike],
dir_filter: IliasDirectoryFilter, dir_filter: IliasDirectoryFilter,
@ -70,11 +70,11 @@ class Pferd(Location):
tmp_dir = self._tmp_dir.new_subdir() tmp_dir = self._tmp_dir.new_subdir()
organizer = Organizer(self.resolve(to_path(target))) organizer = Organizer(self.resolve(to_path(target)))
crawler = IliasCrawler(base_url, course_id, session, authenticator, dir_filter) crawler = IliasCrawler(base_url, session, authenticator, dir_filter)
downloader = IliasDownloader(tmp_dir, organizer, session, authenticator, download_strategy) downloader = IliasDownloader(tmp_dir, organizer, session, authenticator, download_strategy)
cookie_jar.load_cookies() cookie_jar.load_cookies()
info = crawler.crawl() info = crawl_function(crawler)
cookie_jar.save_cookies() cookie_jar.save_cookies()
transformed = apply_transform(transform, info) transformed = apply_transform(transform, info)
@ -134,7 +134,58 @@ class Pferd(Location):
return self._ilias( return self._ilias(
target=target, target=target,
base_url="https://ilias.studium.kit.edu/", base_url="https://ilias.studium.kit.edu/",
course_id=course_id, crawl_function=lambda crawler: crawler.crawl_course(course_id),
authenticator=authenticator,
cookies=cookies,
dir_filter=dir_filter,
transform=transform,
download_strategy=download_strategy,
clean=clean,
)
@swallow_and_print_errors
def ilias_kit_personal_desktop(
self,
target: PathLike,
dir_filter: IliasDirectoryFilter = lambda x: True,
transform: Transform = lambda x: x,
cookies: Optional[PathLike] = None,
username: Optional[str] = None,
password: Optional[str] = None,
download_strategy: IliasDownloadStrategy = download_modified_or_new,
clean: bool = True,
) -> Organizer:
"""
Synchronizes a folder with the ILIAS instance of the KIT. This method will crawl the ILIAS
"personal desktop" instead of a single course.
Arguments:
target {Path} -- the target path to write the data to
Keyword Arguments:
dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the
crawler level, these directories and all of their content is skipped.
(default: {lambda x: True})
transform {Transform} -- A transformation function for the output paths. Return None
to ignore a file. (default: {lambda x: x})
cookies {Optional[Path]} -- The path to store and load cookies from.
(default: {None})
username {Optional[str]} -- The SCC username. If none is given, it will prompt
the user. (default: {None})
password {Optional[str]} -- The SCC password. If none is given, it will prompt
the user. (default: {None})
download_strategy {DownloadStrategy} -- A function to determine which files need to
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
"""
# This authenticator only works with the KIT ilias instance.
authenticator = KitShibbolethAuthenticator(username=username, password=password)
PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop")
return self._ilias(
target=target,
base_url="https://ilias.studium.kit.edu/",
crawl_function=lambda crawler: crawler.crawl_personal_desktop(),
authenticator=authenticator, authenticator=authenticator,
cookies=cookies, cookies=cookies,
dir_filter=dir_filter, dir_filter=dir_filter,