Allow crawling courses or folders with sync_url

Video folders do not work if they are passed directly; their containing folder must be specified instead.
2026-02-20 23:52:23 +01:00 · 2020-09-28 20:00:01 +02:00
parent 74ea039458
commit 51a713fa04
3 changed files with 83 additions and 11 deletions
--- a/PFERD/ilias/crawler.py
+++ b/PFERD/ilias/crawler.py
@@ -116,6 +116,16 @@ class IliasCrawler:

        return urlunsplit((scheme, netloc, path, new_query_string, fragment))

+    def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]:
+        """
+        Crawls a given url as a folder *and all reachable elements in it*.
+
+        Args:
+            url {str} -- the *full* url to crawl (for videos, pass the containing folder's url)
+        """
+        start_entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), url)
+        return self._iterate_entries_to_download_infos(start_entries)
+
    def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]:
        """
        Starts the crawl process for a course, yielding a list of elements to (potentially)
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -230,6 +230,70 @@ class Pferd(Location):

        return organizer

+    @swallow_and_print_errors
+    def ilias_kit_folder(
+            self,
+            target: PathLike,
+            full_url: str,
+            dir_filter: IliasDirectoryFilter = lambda x, y: True,
+            transform: Transform = lambda x: x,
+            cookies: Optional[PathLike] = None,
+            username: Optional[str] = None,
+            password: Optional[str] = None,
+            download_strategy: IliasDownloadStrategy = download_modified_or_new,
+            clean: bool = True,
+            timeout: int = 5,
+    ) -> Organizer:
+        """
+        Synchronizes a folder with the element at a given url on the ILIAS instance of the KIT.
+
+        Arguments:
+            target {Path}  -- the target path to write the data to
+            full_url {str} -- the full url of the KIT ILIAS folder/videos/course to crawl
+
+        Keyword Arguments:
+            dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the
+                crawler level; these directories and all of their content are skipped.
+                (default: {lambda x, y: True})
+            transform {Transform} -- A transformation function for the output paths. Return None
+                to ignore a file. (default: {lambda x: x})
+            cookies {Optional[Path]} -- The path to store and load cookies from.
+                (default: {None})
+            username {Optional[str]} -- The SCC username. If none is given, it will prompt
+                the user. (default: {None})
+            password {Optional[str]} -- The SCC password. If none is given, it will prompt
+                the user. (default: {None})
+            download_strategy {IliasDownloadStrategy} -- A function to determine which files need
+                to be downloaded. Can save bandwidth and reduce the number of requests.
+                (default: {download_modified_or_new})
+            clean {bool} -- Whether to clean up when the method finishes. (default: {True})
+            timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
+                requests bug. (default: {5})
+        """
+        # This authenticator only works with the KIT ilias instance.
+        authenticator = KitShibbolethAuthenticator(username=username, password=password)
+        PRETTY.starting_synchronizer(target, "ILIAS", "An ILIAS element by url")
+
+        if not full_url.startswith("https://ilias.studium.kit.edu"):
+            raise FatalException("Not a valid KIT ILIAS URL")
+
+        organizer = self._ilias(
+            target=target,
+            base_url="https://ilias.studium.kit.edu/",
+            crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url),
+            authenticator=authenticator,
+            cookies=cookies,
+            dir_filter=dir_filter,
+            transform=transform,
+            download_strategy=download_strategy,
+            clean=clean,
+            timeout=timeout
+        )
+
+        self._download_summary.merge(organizer.download_summary)
+
+        return organizer
+
    @swallow_and_print_errors
    def diva_kit(
            self,
--- a/sync_url.py
+++ b/sync_url.py
@@ -23,10 +23,15 @@ def main() -> None:
    parser.add_argument('folder', nargs='?', default=None, help="Folder to put stuff into")
    args = parser.parse_args()

-    # parse provided course URL
    url = urlparse(args.url)
-    query = parse_qs(url.query)
-    course_id = query['ref_id'][0]
+
+    cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None)
+    session = cookie_jar.create_session()
+    authenticator = KitShibbolethAuthenticator()
+    crawler = IliasCrawler(url.scheme + '://' + url.netloc, session,
+                           authenticator, lambda x, y: True)
+
+    cookie_jar.load_cookies()

    if args.folder is not None:
        folder = args.folder
@@ -36,13 +41,6 @@ def main() -> None:
        pferd = Pferd(Path(Path(__file__).parent, folder).parent, test_run=args.test_run)
    else:
        # fetch course name from ilias
-        cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None)
-        session = cookie_jar.create_session()
-        authenticator = KitShibbolethAuthenticator()
-        crawler = IliasCrawler(url.scheme + '://' + url.netloc, session,
-                               authenticator, lambda x, y: True)
-
-        cookie_jar.load_cookies()
        folder = crawler.find_element_name(args.url)
        cookie_jar.save_cookies()

@@ -51,7 +49,7 @@ def main() -> None:

    pferd.enable_logging()
    # fetch
-    pferd.ilias_kit(target=folder, course_id=course_id, cookies=args.cookies)
+    pferd.ilias_kit_folder(target=folder, full_url=args.url, cookies=args.cookies)


 if __name__ == "__main__":