Compare commits

..

20 Commits

Author SHA1 Message Date
3f7c73df80 Release new minor version 2020-10-07 09:32:17 +02:00
43100f69d5 Merge pull request #10 from Garmelon/sync-url
Add "Sync url" script from Christophe and release it automatically
2020-10-07 09:29:48 +02:00
d73c778b0a Add sync_url instructions to README 2020-10-06 17:50:28 +02:00
73c3eb0984 Add option to skip videos in sync_url 2020-10-06 17:20:47 +02:00
a519cbe05d Add sync_url workflow 2020-10-06 12:42:20 +02:00
b3ad9783c4 Ignore pyinstaller files 2020-10-06 11:43:20 +02:00
c1ccb6c53e Allow crawling videos with sync_url 2020-10-06 10:46:06 +02:00
51a713fa04 Allow crawling courses or folders with sync_url
Video folders do not work, if they are passed directly. Their containing
folder must be specified instead.
2020-09-28 20:00:01 +02:00
74ea039458 Fix a few lint errors and pferd quirks in sync_url 2020-09-28 19:42:59 +02:00
aaa6a2b6a4 Merge pull request #9 from TheChristophe/master
Add simple course-download-by-url script
2020-09-28 19:25:45 +02:00
e32a49480b Expose methods to look up course/element names by id / url 2020-09-28 19:16:52 +02:00
be65051f9d Support downloading folders in get-by-url script 2020-09-28 18:16:33 +02:00
3387bc5f20 Add simple course-download-by-url script 2020-09-28 17:49:36 +02:00
3f0ae729d6 Expand "is course" check to not download magazines or other weird things 2020-09-28 16:43:58 +02:00
8e8c1c031a Version 2.3.0 2020-09-03 21:47:10 +02:00
55678d7fee Pass string down to FileCookieJar
Some python versions just can't handle it *despite the documentation
stating they should*.
2020-08-12 09:09:14 +02:00
a57ee8b96b Add timeout to video downloads to work around requests IPv6 bug 2020-08-11 14:40:30 +02:00
e367da925e Bump version to 2.2.1 2020-07-28 19:55:32 +00:00
77a109bb7e Fix ilias shibboleth authenticator
The shibboleth site got a visual overhaul that slightly changed the classes of a
form we need.
2020-07-28 19:13:51 +00:00
a3e1864a26 Allow long paths on windows
If you start PFERD a few folders deep in your home directory, it is
quite easy to reach the maximum path length limit on Windows (260
chars). This patch opts in to long paths ("\\?\" prefix) which lift that
restriction at the cost of ugly path names.
2020-07-25 13:44:49 +02:00
11 changed files with 301 additions and 13 deletions

67
.github/workflows/package.yml vendored Normal file
View File

@ -0,0 +1,67 @@
name: Package Application with Pyinstaller

on:
  push:
    branches:
      - "*"
    tags:
      - "v*"

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # Build one binary per platform so users get a native executable.
        os: [ubuntu-latest, windows-latest]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: '3.x'
      - name: "Install dependencies"
        # BUGFIX: the previous command ended in "-f --upgrade"; pip's
        # -f/--find-links takes an argument, so it swallowed "--upgrade"
        # as a (bogus) links location and no upgrade ever happened.
        run: "pip install --upgrade setuptools pyinstaller rich requests beautifulsoup4"
      - name: "Install sync_url.py"
        # -F bundles everything into a single self-contained executable.
        run: "pyinstaller sync_url.py -F"
      - uses: actions/upload-artifact@v2
        with:
          name: "Pferd Sync URL"
          path: "dist/sync_url*"

  release:
    name: Release
    needs: [build]
    runs-on: ubuntu-latest
    # Only publish a release for version tags, not for branch pushes.
    if: startsWith(github.ref, 'refs/tags/')
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: "Checkout"
        uses: actions/checkout@v2
      - name: "Download artifacts"
        uses: actions/download-artifact@v2
        with:
          name: "Pferd Sync URL"
      - name: "look at folder structure"
        run: "ls -lah"
      # NOTE(review): this step invokes action-gh-release with no inputs and
      # the next step invokes it again with files — the first one looks
      # redundant; confirm and consider removing it.
      - name: "Create release"
        uses: softprops/action-gh-release@v1
      - name: "Upload release artifacts"
        uses: softprops/action-gh-release@v1
        with:
          body: "Download sync_url (or sync_url.exe on Windows) and run it in the terminal or CMD."
          files: |
            sync_url
            sync_url.exe

4
.gitignore vendored
View File

@ -8,3 +8,7 @@ build/
.env
.vscode
ilias_cookies.txt
# PyInstaller
sync_url.spec
dist/

View File

@ -22,7 +22,7 @@ class CookieJar:
if cookie_file is None:
self._cookies = LWPCookieJar()
else:
self._cookies = LWPCookieJar(cookie_file)
self._cookies = LWPCookieJar(str(cookie_file.resolve()))
@property
def cookies(self) -> LWPCookieJar:

View File

@ -67,7 +67,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
while not self._login_successful(soup):
# Searching the form here so that this fails before asking for
# credentials rather than after asking.
form = soup.find("form", {"class": "form2", "method": "post"})
form = soup.find("form", {"class": "full content", "method": "post"})
action = form["action"]
# Equivalent: Enter credentials in

View File

@ -116,6 +116,16 @@ class IliasCrawler:
return urlunsplit((scheme, netloc, path, new_query_string, fragment))
def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]:
"""
Crawls a given url *and all reachable elements in it*.
Args:
url {str} -- the *full* url to crawl
"""
start_entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), url)
return self._iterate_entries_to_download_infos(start_entries)
def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]:
"""
Starts the crawl process for a course, yielding a list of elements to (potentially)
@ -134,7 +144,7 @@ class IliasCrawler:
if not self._is_course_id_valid(root_url, course_id):
raise FatalException(
"Invalid course id? The URL the server returned did not contain my id."
"Invalid course id? I didn't find anything looking like a course!"
)
# And treat it as a folder
@ -143,7 +153,34 @@ class IliasCrawler:
def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
response: requests.Response = self._session.get(root_url)
return course_id in response.url
# We were redirected ==> Non-existent ID
if course_id not in response.url:
return False
link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link")
if not link_element:
return False
# It wasn't a course but a category list, forum, etc.
return "crs_" in link_element.get("value")
def find_course_name(self, course_id: str) -> Optional[str]:
"""
Returns the name of a given course. None if it is not a valid course
or it could not be found.
"""
course_url = self._url_set_query_param(
self._base_url + "/goto.php", "target", f"crs_{course_id}"
)
return self.find_element_name(course_url)
def find_element_name(self, url: str) -> Optional[str]:
"""
Returns the name of the element at the given URL, if it can find one.
"""
focus_element: bs4.Tag = self._get_page(url, {}).find(id="il_mhead_t_focus")
if not focus_element:
return None
return focus_element.text
def crawl_personal_desktop(self) -> List[IliasDownloadInfo]:
"""
@ -208,6 +245,15 @@ class IliasCrawler:
"""
soup = self._get_page(url, {})
if soup.find(id="headerimage"):
element: bs4.Tag = soup.find(id="headerimage")
if "opencast" in element.attrs["src"].lower():
PRETTY.warning(f"Switched to crawling a video at {folder_path}")
if not self.dir_filter(folder_path, IliasElementType.VIDEO_FOLDER):
PRETTY.not_searching(folder_path, "user filter")
return []
return self._crawl_video_directory(folder_path, url)
result: List[IliasCrawlerEntry] = []
# Fetch all links and throw them to the general interpreter

View File

@ -84,9 +84,13 @@ class IliasDownloader:
session: requests.Session,
authenticator: IliasAuthenticator,
strategy: IliasDownloadStrategy,
timeout: int = 5
):
"""
Create a new IliasDownloader.
The timeout applies to the download request only, as bwcloud uses IPv6
and requests has a problem with that: https://github.com/psf/requests/issues/5522
"""
self._tmp_dir = tmp_dir
@ -94,6 +98,7 @@ class IliasDownloader:
self._session = session
self._authenticator = authenticator
self._strategy = strategy
self._timeout = timeout
def download_all(self, infos: List[IliasDownloadInfo]) -> None:
"""
@ -137,7 +142,7 @@ class IliasDownloader:
PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
return True
with self._session.get(url, stream=True) as response:
with self._session.get(url, stream=True, timeout=self._timeout) as response:
content_type = response.headers["content-type"]
has_content_disposition = "content-disposition" in response.headers

View File

@ -5,6 +5,7 @@ An organizer is bound to a single directory.
import filecmp
import logging
import os
import shutil
from pathlib import Path, PurePath
from typing import List, Optional, Set
@ -44,6 +45,15 @@ class Organizer(Location):
(e.g. update the timestamp), the path is also returned in this case.
In all other cases (ignored, not overwritten, etc.) this method returns None.
"""
# Windows limits the path length to 260 for *some* historical reason
# If you want longer paths, you will have to add the "\\?\" prefix in front of
# your path...
# See:
# https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation
if os.name == 'nt':
src_absolute = Path("\\\\?\\" + str(src.resolve()))
dst_absolute = Path("\\\\?\\" + str(self.resolve(dst)))
else:
src_absolute = src.resolve()
dst_absolute = self.resolve(dst)

View File

@ -72,7 +72,8 @@ class Pferd(Location):
dir_filter: IliasDirectoryFilter,
transform: Transform,
download_strategy: IliasDownloadStrategy,
clean: bool = True
timeout: int,
clean: bool = True,
) -> Organizer:
# pylint: disable=too-many-locals
cookie_jar = CookieJar(to_path(cookies) if cookies else None)
@ -81,7 +82,8 @@ class Pferd(Location):
organizer = Organizer(self.resolve(to_path(target)))
crawler = IliasCrawler(base_url, session, authenticator, dir_filter)
downloader = IliasDownloader(tmp_dir, organizer, session, authenticator, download_strategy)
downloader = IliasDownloader(tmp_dir, organizer, session,
authenticator, download_strategy, timeout)
cookie_jar.load_cookies()
info = crawl_function(crawler)
@ -112,6 +114,7 @@ class Pferd(Location):
password: Optional[str] = None,
download_strategy: IliasDownloadStrategy = download_modified_or_new,
clean: bool = True,
timeout: int = 5,
) -> Organizer:
"""
Synchronizes a folder with the ILIAS instance of the KIT.
@ -137,6 +140,8 @@ class Pferd(Location):
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
"""
# This authenticator only works with the KIT ilias instance.
authenticator = KitShibbolethAuthenticator(username=username, password=password)
@ -152,6 +157,7 @@ class Pferd(Location):
transform=transform,
download_strategy=download_strategy,
clean=clean,
timeout=timeout
)
self._download_summary.merge(organizer.download_summary)
@ -175,6 +181,7 @@ class Pferd(Location):
password: Optional[str] = None,
download_strategy: IliasDownloadStrategy = download_modified_or_new,
clean: bool = True,
timeout: int = 5,
) -> Organizer:
"""
Synchronizes a folder with the ILIAS instance of the KIT. This method will crawl the ILIAS
@ -199,6 +206,8 @@ class Pferd(Location):
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
"""
# This authenticator only works with the KIT ilias instance.
authenticator = KitShibbolethAuthenticator(username=username, password=password)
@ -214,6 +223,71 @@ class Pferd(Location):
transform=transform,
download_strategy=download_strategy,
clean=clean,
timeout=timeout
)
self._download_summary.merge(organizer.download_summary)
return organizer
@swallow_and_print_errors
def ilias_kit_folder(
self,
target: PathLike,
full_url: str,
dir_filter: IliasDirectoryFilter = lambda x, y: True,
transform: Transform = lambda x: x,
cookies: Optional[PathLike] = None,
username: Optional[str] = None,
password: Optional[str] = None,
download_strategy: IliasDownloadStrategy = download_modified_or_new,
clean: bool = True,
timeout: int = 5,
) -> Organizer:
"""
Synchronizes a folder with a given folder on the ILIAS instance of the KIT.
Arguments:
target {Path} -- the target path to write the data to
full_url {str} -- the full url of the folder/videos/course to crawl
Keyword Arguments:
dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the
crawler level, these directories and all of their content is skipped.
(default: {lambda x, y: True})
transform {Transform} -- A transformation function for the output paths. Return None
to ignore a file. (default: {lambda x: x})
cookies {Optional[Path]} -- The path to store and load cookies from.
(default: {None})
username {Optional[str]} -- The SCC username. If none is given, it will prompt
the user. (default: {None})
password {Optional[str]} -- The SCC password. If none is given, it will prompt
the user. (default: {None})
download_strategy {DownloadStrategy} -- A function to determine which files need to
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
"""
# This authenticator only works with the KIT ilias instance.
authenticator = KitShibbolethAuthenticator(username=username, password=password)
PRETTY.starting_synchronizer(target, "ILIAS", "An ILIAS element by url")
if not full_url.startswith("https://ilias.studium.kit.edu"):
raise FatalException("Not a valid KIT ILIAS URL")
organizer = self._ilias(
target=target,
base_url="https://ilias.studium.kit.edu/",
crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url),
authenticator=authenticator,
cookies=cookies,
dir_filter=dir_filter,
transform=transform,
download_strategy=download_strategy,
clean=clean,
timeout=timeout
)
self._download_summary.merge(organizer.download_summary)

View File

@ -2,6 +2,7 @@
**P**rogramm zum **F**lotten, **E**infachen **R**unterladen von **D**ateien
- [Quickstart with `sync_url`](#quickstart-with-sync_url)
- [Installation](#installation)
- [Upgrading from 2.0.0 to 2.1.0+](#upgrading-from-200-to-210)
- [Example setup](#example-setup)
@ -12,6 +13,20 @@
- [Transform combinators](#transform-combinators)
- [A short, but commented example](#a-short-but-commented-example)
## Quickstart with `sync_url`
The `sync_url` program allows you to just synchronize a given ILIAS URL (of a
course, a folder, your personal desktop, etc.) without any extra configuration
or setting up. Download the program, open ILIAS, copy the URL from the address
bar and pass it to sync_url.
It bundles everything it needs in one executable and is easy to
use, but doesn't expose all the configuration options and tweaks a full install
does.
1. Download the `sync_url` binary from the [latest release](https://github.com/Garmelon/PFERD/releases/latest).
2. Run the binary in your terminal (`./sync_url` or `sync_url.exe` in the CMD) to see the help and use it
## Installation
Ensure that you have at least Python 3.8 installed.
@ -19,7 +34,7 @@ Ensure that you have at least Python 3.8 installed.
To install PFERD or update your installation to the latest version, run this
wherever you want to install or have already installed PFERD:
```
$ pip install git+https://github.com/Garmelon/PFERD@v2.2.0
$ pip install git+https://github.com/Garmelon/PFERD@v2.4.0
```
The use of [venv] is recommended.
@ -42,8 +57,8 @@ $ mkdir Vorlesungen
$ cd Vorlesungen
$ python3 -m venv .venv
$ .venv/bin/activate
$ pip install git+https://github.com/Garmelon/PFERD@v2.2.0
$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.2.0/example_config.py
$ pip install git+https://github.com/Garmelon/PFERD@v2.4.0
$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.0/example_config.py
$ python3 example_config.py
$ deactivate
```

View File

@ -2,7 +2,7 @@ from setuptools import find_packages, setup
setup(
name="PFERD",
version="2.2.0",
version="2.4.0",
packages=find_packages(),
install_requires=[
"requests>=2.21.0",

67
sync_url.py Executable file
View File

@ -0,0 +1,67 @@
#!/usr/bin/env python
"""
A simple script to download a course by name from ILIAS.

Usage: sync_url [-c COOKIES] [-f] [--test-run] URL [FOLDER]
If no folder is given, the element's name is looked up on ILIAS and used
as the target directory next to this script.
"""

import argparse
from pathlib import Path
from urllib.parse import urlparse

from PFERD import Pferd
from PFERD.cookie_jar import CookieJar
from PFERD.ilias import (IliasCrawler, IliasElementType,
                         KitShibbolethAuthenticator)
from PFERD.utils import to_path


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--test-run", action="store_true")
    parser.add_argument('-c', '--cookies', nargs='?', default=None, help="File to store cookies in")
    # BUGFIX: this was "nargs='?', default=None" with no const, so a bare
    # "-f" left args.no_videos at None (falsy) and videos were downloaded
    # anyway. A boolean flag is what dir_filter below actually expects.
    parser.add_argument('-f', '--no-videos', action="store_true", help="Don't download videos")
    parser.add_argument('url', help="URL to the course page")
    parser.add_argument('folder', nargs='?', default=None, help="Folder to put stuff into")
    args = parser.parse_args()

    url = urlparse(args.url)

    cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None)
    session = cookie_jar.create_session()
    authenticator = KitShibbolethAuthenticator()
    # Crawl everything at this stage; filtering happens via dir_filter below.
    crawler = IliasCrawler(url.scheme + '://' + url.netloc, session,
                           authenticator, lambda x, y: True)

    cookie_jar.load_cookies()

    if args.folder is not None:
        folder = args.folder
        # Initialize pferd at the *parent of the passed folder*.
        # This is needed so Pferd's internal protections against escaping the
        # working directory do not trigger (e.g. if somebody names a file in
        # ILIAS '../../bad thing.txt').
        pferd = Pferd(Path(Path(__file__).parent, folder).parent, test_run=args.test_run)
    else:
        # No folder given: fetch the element's name from ILIAS and use it.
        folder = crawler.find_element_name(args.url)
        # Persist the session cookies obtained during the name lookup.
        cookie_jar.save_cookies()
        # Initialize pferd at the location of the script.
        pferd = Pferd(Path(__file__).parent, test_run=args.test_run)

    def dir_filter(_: Path, element: IliasElementType) -> bool:
        """Skip video files/folders when --no-videos was passed."""
        if args.no_videos:
            return element not in [IliasElementType.VIDEO_FILE, IliasElementType.VIDEO_FOLDER]
        return True

    pferd.enable_logging()
    # Fetch the element and everything reachable below it.
    pferd.ilias_kit_folder(
        target=folder,
        full_url=args.url,
        cookies=args.cookies,
        dir_filter=dir_filter
    )


if __name__ == "__main__":
    main()