Compare commits

...

43 Commits

SHA1 Message Date
ba9215ebe8 Bump version 2020-11-18 10:09:45 +01:00
8ebf0eab16 Sort download summary 2020-11-17 21:36:04 +01:00
cd90a60dee Move "sanitize_windows_path" to PFERD.transform 2020-11-12 20:52:46 +01:00
98834c9c95 Bump version 2020-11-12 20:23:36 +01:00
55e9e719ad Sanitize "/" in ilias path names 2020-11-12 20:21:24 +01:00
a0ae9aee27 Sanitize individual path parts 2020-11-11 09:36:20 +01:00
1486a63854 Do not collapse directory structure when sanitizing 2020-11-10 22:53:47 +01:00
733e1ae136 Bump version 2020-11-10 20:50:31 +01:00
4ac51048c1 Use "_" as a replacement for illegal characters 2020-11-10 20:49:14 +01:00
f2aba970fd [sync_url] Sanitize path names on windows 2020-11-10 17:16:14 +01:00
9c4759103a Bump patch version 2020-11-05 11:25:06 +01:00
316b9d7bf4 Prevent too many retries when fetching an ILIAS page 2020-11-04 22:23:56 +01:00
6f30adcd22 Fix quote type in README 2020-11-04 22:13:08 +01:00
6f78fef604 Add quoting instructions to README 2020-11-04 22:08:33 +01:00
f830b42a36 Fix duplicate files in download summary 2020-11-04 21:49:35 +01:00
ef343dec7c Merge organizer download summaries 2020-11-04 15:06:58 +01:00
0da2fafcd8 Fix links outside tables 2020-11-04 14:46:15 +01:00
f4abe3197c Add ipd crawler 2020-11-03 21:15:40 +01:00
38d4f5b4c9 Do not fail only empty courses 2020-11-03 20:09:54 +01:00
9ea03bda3e Adjust release names 2020-10-30 18:14:02 +01:00
07de5bea8b Explain how to run sync_url on Mac 2020-10-30 17:53:55 +01:00
f0d572c110 Fix a few typos in release body 2020-10-30 17:32:04 +01:00
076067e22d Bump version 2020-10-30 17:28:34 +01:00
ebb6e63c5c Add MacOS to CI 2020-10-30 17:23:27 +01:00
0c3f35a2d2 Do not provide a shorthand for "no-videos" 2020-10-30 17:01:10 +01:00
521890ae78 Update README.md 2020-10-28 23:24:18 +01:00
3f7c73df80 Release new minor version 2020-10-07 09:32:17 +02:00
43100f69d5 Merge pull request #10 from Garmelon/sync-url
Add "Sync url" script from Christophe and release it automatically
2020-10-07 09:29:48 +02:00
d73c778b0a Add sync_url instructions to README 2020-10-06 17:50:28 +02:00
73c3eb0984 Add option to skip videos in sync_url 2020-10-06 17:20:47 +02:00
a519cbe05d Add sync_url workflow 2020-10-06 12:42:20 +02:00
b3ad9783c4 Ignore pyinstaller files 2020-10-06 11:43:20 +02:00
c1ccb6c53e Allow crawling videos with sync_url 2020-10-06 10:46:06 +02:00
51a713fa04 Allow crawling courses or folders with sync_url
Video folders do not work, if they are passed directly. Their containing
folder must be specified instead.
2020-09-28 20:00:01 +02:00
74ea039458 Fix a few lint errors and pferd quirks in sync_url 2020-09-28 19:42:59 +02:00
aaa6a2b6a4 Merge pull request #9 from TheChristophe/master
Add simple course-download-by-url script
2020-09-28 19:25:45 +02:00
e32a49480b Expose methods to look up course/element names by id / url 2020-09-28 19:16:52 +02:00
be65051f9d Support downloading folders in get-by-url script 2020-09-28 18:16:33 +02:00
3387bc5f20 Add simple course-download-by-url script 2020-09-28 17:49:36 +02:00
3f0ae729d6 Expand "is course" check to not download magazines or other weird things 2020-09-28 16:43:58 +02:00
8e8c1c031a Version 2.3.0 2020-09-03 21:47:10 +02:00
55678d7fee Pass string down to FileCookieJar
Some python versions just can't handle it *despite the documentation
stating they should*.
2020-08-12 09:09:14 +02:00
a57ee8b96b Add timeout to video downloads to work around requests IPv6 bug 2020-08-11 14:40:30 +02:00
13 changed files with 553 additions and 19 deletions

.github/workflows/package.yml (new file)

@@ -0,0 +1,74 @@
name: Package Application with Pyinstaller

on:
  push:
    branches:
      - "*"
    tags:
      - "v*"

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]

    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: '3.x'
      - name: "Install dependencies"
        run: "pip install setuptools pyinstaller rich requests beautifulsoup4 -f --upgrade"
      - name: "Install sync_url.py"
        run: "pyinstaller sync_url.py -F"
      - name: "Move artifact"
        run: "mv dist/sync_url* dist/sync_url-${{ matrix.os }}"
      - uses: actions/upload-artifact@v2
        with:
          name: "Pferd Sync URL"
          path: "dist/sync_url*"

  release:
    name: Release
    needs: [build]
    runs-on: ubuntu-latest
    if: startsWith(github.ref, 'refs/tags/')
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: "Checkout"
        uses: actions/checkout@v2
      - name: "Download artifacts"
        uses: actions/download-artifact@v2
        with:
          name: "Pferd Sync URL"
      - name: "look at folder structure"
        run: "ls -lah"
      - name: "Rename releases"
        run: "mv sync_url-macos-latest pferd_sync_url_mac && mv sync_url-ubuntu-latest pferd_sync_url_linux && mv sync_url-windows-latest pferd_sync_url.exe"
      - name: "Create release"
        uses: softprops/action-gh-release@v1
      - name: "Upload release artifacts"
        uses: softprops/action-gh-release@v1
        with:
          body: "Download the correct sync_url for your platform and run it in the terminal or CMD. You might need to make it executable on Linux/Mac with `chmod +x <file>`. Also please enclose the *url you pass to the program in double quotes* or your shell might silently screw it up!"
          files: |
            pferd_sync_url_mac
            pferd_sync_url_linux
            pferd_sync_url.exe

.gitignore

@@ -8,3 +8,7 @@ build/
 .env
 .vscode
 ilias_cookies.txt
+
+# PyInstaller
+sync_url.spec
+dist/

PFERD/cookie_jar.py

@@ -22,7 +22,7 @@ class CookieJar:
         if cookie_file is None:
             self._cookies = LWPCookieJar()
         else:
-            self._cookies = LWPCookieJar(cookie_file)
+            self._cookies = LWPCookieJar(str(cookie_file.resolve()))

     @property
     def cookies(self) -> LWPCookieJar:

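For context on the change above: `LWPCookieJar` is a `FileCookieJar`, and on some Python versions its constructor chokes on a `pathlib.Path` even though the documentation suggests path-like objects should work (see the "Pass string down to FileCookieJar" commit). A minimal standard-library sketch of the safe pattern; the file name is only an example:

```
from http.cookiejar import LWPCookieJar
from pathlib import Path

cookie_file = Path("ilias_cookies.txt")  # example path

# Passing the resolved path as a plain string works on every supported
# Python version; handing over the Path object directly does not.
jar = LWPCookieJar(str(cookie_file.resolve()))
try:
    jar.load()  # first run: the file does not exist yet
except FileNotFoundError:
    pass
jar.save()
```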
PFERD/download_summary.py

@@ -5,6 +5,12 @@ from pathlib import Path
 from typing import List


+def _mergeNoDuplicate(first: List[Path], second: List[Path]) -> List[Path]:
+    tmp = list(set(first + second))
+    tmp.sort(key=lambda x: str(x.resolve()))
+    return tmp
+
+
 class DownloadSummary:
     """
     Keeps track of all new, modified or deleted files and provides a summary.
@@ -40,9 +46,9 @@ class DownloadSummary:
         """
         Merges ourselves with the passed summary. Modifies this object, but not the passed one.
         """
-        self._new_files += summary.new_files
-        self._modified_files += summary.modified_files
-        self._deleted_files += summary.deleted_files
+        self._new_files = _mergeNoDuplicate(self._new_files, summary.new_files)
+        self._modified_files = _mergeNoDuplicate(self._modified_files, summary.modified_files)
+        self._deleted_files = _mergeNoDuplicate(self._deleted_files, summary.deleted_files)

     def add_deleted_file(self, path: Path) -> None:
         """

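The `_mergeNoDuplicate` helper above is what removes duplicate entries when organizer summaries are merged. A small sketch of the effect, using a local copy of the helper and made-up paths:

```
from pathlib import Path
from typing import List


def merge_no_duplicate(first: List[Path], second: List[Path]) -> List[Path]:
    # Same idea as _mergeNoDuplicate: union the two lists, then sort by
    # resolved path so the summary output stays stable.
    tmp = list(set(first + second))
    tmp.sort(key=lambda x: str(x.resolve()))
    return tmp


a = [Path("Vorlesung/blatt01.pdf"), Path("Vorlesung/blatt02.pdf")]
b = [Path("Vorlesung/blatt02.pdf"), Path("Vorlesung/blatt03.pdf")]
print(merge_no_duplicate(a, b))  # blatt02.pdf appears only once
```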
PFERD/ilias/crawler.py

@@ -26,6 +26,10 @@ LOGGER = logging.getLogger(__name__)
 PRETTY = PrettyLogger(LOGGER)


+def _sanitize_path_name(name: str) -> str:
+    return name.replace("/", "-")
+
+
 class IliasElementType(Enum):
     """
     The type of an ilias element.
@@ -116,6 +120,16 @@ class IliasCrawler:

         return urlunsplit((scheme, netloc, path, new_query_string, fragment))

+    def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]:
+        """
+        Crawls a given url *and all reachable elements in it*.
+
+        Args:
+            url {str} -- the *full* url to crawl
+        """
+        start_entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), url)
+        return self._iterate_entries_to_download_infos(start_entries)
+
     def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]:
         """
         Starts the crawl process for a course, yielding a list of elements to (potentially)
@@ -134,7 +148,7 @@ class IliasCrawler:

         if not self._is_course_id_valid(root_url, course_id):
             raise FatalException(
-                "Invalid course id? The URL the server returned did not contain my id."
+                "Invalid course id? I didn't find anything looking like a course!"
             )

         # And treat it as a folder
@@ -143,7 +157,34 @@ class IliasCrawler:

     def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
         response: requests.Response = self._session.get(root_url)
-        return course_id in response.url
+        # We were redirected ==> Non-existant ID
+        if course_id not in response.url:
+            return False
+
+        link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link")
+        if not link_element:
+            return False
+        # It wasn't a course but a category list, forum, etc.
+        return "crs_" in link_element.get("value")
+
+    def find_course_name(self, course_id: str) -> Optional[str]:
+        """
+        Returns the name of a given course. None if it is not a valid course
+        or it could not be found.
+        """
+        course_url = self._url_set_query_param(
+            self._base_url + "/goto.php", "target", f"crs_{course_id}"
+        )
+        return self.find_element_name(course_url)
+
+    def find_element_name(self, url: str) -> Optional[str]:
+        """
+        Returns the name of the element at the given URL, if it can find one.
+        """
+        focus_element: bs4.Tag = self._get_page(url, {}).find(id="il_mhead_t_focus")
+        if not focus_element:
+            return None
+        return focus_element.text

     def crawl_personal_desktop(self) -> List[IliasDownloadInfo]:
         """
@@ -208,13 +249,22 @@ class IliasCrawler:
         """
         soup = self._get_page(url, {})

+        if soup.find(id="headerimage"):
+            element: bs4.Tag = soup.find(id="headerimage")
+            if "opencast" in element.attrs["src"].lower():
+                PRETTY.warning(f"Switched to crawling a video at {folder_path}")
+                if not self.dir_filter(folder_path, IliasElementType.VIDEO_FOLDER):
+                    PRETTY.not_searching(folder_path, "user filter")
+                    return []
+                return self._crawl_video_directory(folder_path, url)
+
         result: List[IliasCrawlerEntry] = []

         # Fetch all links and throw them to the general interpreter
         links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle")
         for link in links:
             abs_url = self._abs_url_from_link(link)
-            element_path = Path(folder_path, link.getText().strip())
+            element_path = Path(folder_path, _sanitize_path_name(link.getText().strip()))
             element_type = self._find_type_from_link(element_path, link, abs_url)

             if element_type == IliasElementType.REGULAR_FILE:
@@ -331,7 +381,7 @@ class IliasCrawler:
         modification_date = demangle_date(modification_date_str)

         # Grab the name from the link text
-        name = link_element.getText()
+        name = _sanitize_path_name(link_element.getText())
         full_path = Path(path, name + "." + file_type)

         return [
@@ -462,7 +512,7 @@ class IliasCrawler:
             ).getText().strip()
             title += ".mp4"

-            video_path: Path = Path(parent_path, title)
+            video_path: Path = Path(parent_path, _sanitize_path_name(title))

             video_url = self._abs_url_from_link(link)
@@ -534,6 +584,7 @@ class IliasCrawler:
             # Two divs, side by side. Left is the name, right is the link ==> get left
             # sibling
             file_name = file_link.parent.findPrevious(name="div").getText().strip()
+            file_name = _sanitize_path_name(file_name)
             url = self._abs_url_from_link(file_link)

             LOGGER.debug("Found file %r at %r", file_name, url)
@@ -547,10 +598,17 @@ class IliasCrawler:

         return results

-    def _get_page(self, url: str, params: Dict[str, Any]) -> bs4.BeautifulSoup:
+    def _get_page(self, url: str, params: Dict[str, Any],
+                  retry_count: int = 0) -> bs4.BeautifulSoup:
         """
         Fetches a page from ILIAS, authenticating when needed.
         """
+        if retry_count >= 4:
+            raise FatalException("Could not get a proper page after 4 tries. "
+                                 "Maybe your URL is wrong, authentication fails continuously, "
+                                 "your ILIAS connection is spotty or ILIAS is not well.")
+
         LOGGER.debug("Fetching %r", url)

         response = self._session.get(url, params=params)
@@ -571,7 +629,7 @@ class IliasCrawler:

             self._authenticator.authenticate(self._session)

-            return self._get_page(url, params)
+            return self._get_page(url, params, retry_count + 1)

     @staticmethod
     def _is_logged_in(soup: bs4.BeautifulSoup) -> bool:

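A rough sketch of how the new crawler entry points fit together, mirroring what the `sync_url.py` script added in this compare does: look up a display name for the element behind a URL, then crawl everything reachable from it. The ILIAS URL below is a placeholder.

```
from pathlib import Path

from PFERD.cookie_jar import CookieJar
from PFERD.ilias import IliasCrawler, KitShibbolethAuthenticator

url = "https://ilias.studium.kit.edu/goto.php?target=fold_1234567"  # placeholder

cookie_jar = CookieJar(Path("ilias_cookies.txt"))
session = cookie_jar.create_session()
authenticator = KitShibbolethAuthenticator()

# base url, session, authenticator, directory filter (here: accept everything)
crawler = IliasCrawler("https://ilias.studium.kit.edu", session,
                       authenticator, lambda path, element_type: True)

cookie_jar.load_cookies()
name = crawler.find_element_name(url)     # None if nothing sensible was found
infos = crawler.recursive_crawl_url(url)  # list of IliasDownloadInfo
cookie_jar.save_cookies()
print(name, len(infos))
```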
PFERD/ilias/downloader.py

@@ -84,9 +84,13 @@ class IliasDownloader:
             session: requests.Session,
             authenticator: IliasAuthenticator,
             strategy: IliasDownloadStrategy,
+            timeout: int = 5
     ):
         """
         Create a new IliasDownloader.
+
+        The timeout applies to the download request only, as bwcloud uses IPv6
+        and requests has a problem with that: https://github.com/psf/requests/issues/5522
         """

         self._tmp_dir = tmp_dir
@@ -94,6 +98,7 @@ class IliasDownloader:
         self._session = session
         self._authenticator = authenticator
         self._strategy = strategy
+        self._timeout = timeout

     def download_all(self, infos: List[IliasDownloadInfo]) -> None:
         """
@@ -137,7 +142,7 @@ class IliasDownloader:
             PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
             return True

-        with self._session.get(url, stream=True) as response:
+        with self._session.get(url, stream=True, timeout=self._timeout) as response:
             content_type = response.headers["content-type"]
             has_content_disposition = "content-disposition" in response.headers

PFERD/ipd.py (new file)

@@ -0,0 +1,151 @@
"""
Utility functions and a scraper/downloader for the IPD pages.
"""
import datetime
import logging
import math
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, List, Optional
from urllib.parse import urljoin

import bs4
import requests

from PFERD.errors import FatalException
from PFERD.utils import soupify

from .logging import PrettyLogger
from .organizer import Organizer
from .tmp_dir import TmpDir
from .transform import Transformable
from .utils import stream_to_path

LOGGER = logging.getLogger(__name__)
PRETTY = PrettyLogger(LOGGER)


@dataclass
class IpdDownloadInfo(Transformable):
    """
    Information about an ipd entry.
    """
    url: str
    modification_date: Optional[datetime.datetime]


IpdDownloadStrategy = Callable[[Organizer, IpdDownloadInfo], bool]


def ipd_download_new_or_modified(organizer: Organizer, info: IpdDownloadInfo) -> bool:
    """
    Accepts new files or files with a more recent modification date.
    """
    resolved_file = organizer.resolve(info.path)
    if not resolved_file.exists():
        return True
    if not info.modification_date:
        PRETTY.ignored_file(info.path, "could not find modification time, file exists")
        return False

    resolved_mod_time_seconds = resolved_file.stat().st_mtime

    # Download if the info is newer
    if info.modification_date.timestamp() > resolved_mod_time_seconds:
        return True

    PRETTY.ignored_file(info.path, "local file has newer or equal modification time")
    return False


class IpdCrawler:
    # pylint: disable=too-few-public-methods
    """
    A crawler for IPD pages.
    """

    def __init__(self, base_url: str):
        self._base_url = base_url

    def _abs_url_from_link(self, link_tag: bs4.Tag) -> str:
        """
        Create an absolute url from an <a> tag.
        """
        return urljoin(self._base_url, link_tag.get("href"))

    def crawl(self) -> List[IpdDownloadInfo]:
        """
        Crawls the playlist given in the constructor.
        """
        page = soupify(requests.get(self._base_url))

        items: List[IpdDownloadInfo] = []

        for link in page.findAll(name="a", attrs={"href": lambda x: x and x.endswith("pdf")}):
            href: str = link.attrs.get("href")
            name = href.split("/")[-1]

            modification_date: Optional[datetime.datetime] = None
            try:
                enclosing_row: bs4.Tag = link.findParent(name="tr")
                if enclosing_row:
                    date_text = enclosing_row.find(name="td").text
                    modification_date = datetime.datetime.strptime(date_text, "%d.%m.%Y")
            except ValueError:
                modification_date = None

            items.append(IpdDownloadInfo(
                Path(name),
                url=self._abs_url_from_link(link),
                modification_date=modification_date
            ))

        return items


class IpdDownloader:
    """
    A downloader for ipd files.
    """

    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: IpdDownloadStrategy):
        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._strategy = strategy
        self._session = requests.session()

    def download_all(self, infos: List[IpdDownloadInfo]) -> None:
        """
        Download multiple files one after the other.
        """
        for info in infos:
            self.download(info)

    def download(self, info: IpdDownloadInfo) -> None:
        """
        Download a single file.
        """
        if not self._strategy(self._organizer, info):
            self._organizer.mark(info.path)
            return

        with self._session.get(info.url, stream=True) as response:
            if response.status_code == 200:
                tmp_file = self._tmp_dir.new_path()
                stream_to_path(response, tmp_file, info.path.name)
                dst_path = self._organizer.accept_file(tmp_file, info.path)

                if dst_path and info.modification_date:
                    os.utime(
                        dst_path,
                        times=(
                            math.ceil(info.modification_date.timestamp()),
                            math.ceil(info.modification_date.timestamp())
                        )
                    )
            elif response.status_code == 403:
                raise FatalException("Received 403. Are you not using the KIT VPN?")
            else:
                PRETTY.warning(f"Could not download file, got response {response.status_code}")

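A minimal usage sketch for the crawler half of the new module (the downloader additionally needs a `TmpDir` and an `Organizer`, which `Pferd.ipd_kit` wires up further down); the lecture-page URL is a placeholder:

```
from PFERD.ipd import IpdCrawler

# Placeholder URL of an IPD lecture page that links PDFs in a table.
page_url = "https://pp.ipd.kit.edu/lehre/WS202021/paradigmen/"

crawler = IpdCrawler(page_url)
for info in crawler.crawl():
    # info.path is the PDF's file name, info.url the absolute link,
    # info.modification_date the parsed date column (or None).
    print(info.path, info.url, info.modification_date)
```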
PFERD/organizer.py

@@ -124,6 +124,8 @@ class Organizer(Location):
         self._cleanup(self.path)

     def _cleanup(self, start_dir: Path) -> None:
+        if not start_dir.exists():
+            return
         paths: List[Path] = list(start_dir.iterdir())

         # Recursively clean paths

PFERD/pferd.py

@@ -14,6 +14,8 @@ from .errors import FatalException, swallow_and_print_errors
 from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter,
                     IliasDownloader, IliasDownloadInfo, IliasDownloadStrategy,
                     KitShibbolethAuthenticator, download_modified_or_new)
+from .ipd import (IpdCrawler, IpdDownloader, IpdDownloadInfo,
+                  IpdDownloadStrategy, ipd_download_new_or_modified)
 from .location import Location
 from .logging import PrettyLogger, enable_logging
 from .organizer import Organizer
@@ -72,7 +74,8 @@ class Pferd(Location):
             dir_filter: IliasDirectoryFilter,
             transform: Transform,
             download_strategy: IliasDownloadStrategy,
-            clean: bool = True
+            timeout: int,
+            clean: bool = True,
     ) -> Organizer:
         # pylint: disable=too-many-locals
         cookie_jar = CookieJar(to_path(cookies) if cookies else None)
@@ -81,7 +84,8 @@ class Pferd(Location):
         organizer = Organizer(self.resolve(to_path(target)))

         crawler = IliasCrawler(base_url, session, authenticator, dir_filter)
-        downloader = IliasDownloader(tmp_dir, organizer, session, authenticator, download_strategy)
+        downloader = IliasDownloader(tmp_dir, organizer, session,
+                                     authenticator, download_strategy, timeout)

         cookie_jar.load_cookies()
         info = crawl_function(crawler)
@@ -112,6 +116,7 @@ class Pferd(Location):
             password: Optional[str] = None,
             download_strategy: IliasDownloadStrategy = download_modified_or_new,
             clean: bool = True,
+            timeout: int = 5,
     ) -> Organizer:
         """
         Synchronizes a folder with the ILIAS instance of the KIT.
@@ -137,6 +142,8 @@ class Pferd(Location):
                 be downloaded. Can save bandwidth and reduce the number of requests.
                 (default: {download_modified_or_new})
             clean {bool} -- Whether to clean up when the method finishes.
+            timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
+                             requests bug.
         """
         # This authenticator only works with the KIT ilias instance.
         authenticator = KitShibbolethAuthenticator(username=username, password=password)
@@ -152,6 +159,7 @@ class Pferd(Location):
             transform=transform,
             download_strategy=download_strategy,
             clean=clean,
+            timeout=timeout
         )

         self._download_summary.merge(organizer.download_summary)
@@ -175,6 +183,7 @@ class Pferd(Location):
             password: Optional[str] = None,
             download_strategy: IliasDownloadStrategy = download_modified_or_new,
             clean: bool = True,
+            timeout: int = 5,
     ) -> Organizer:
         """
         Synchronizes a folder with the ILIAS instance of the KIT. This method will crawl the ILIAS
@@ -199,6 +208,8 @@ class Pferd(Location):
                 be downloaded. Can save bandwidth and reduce the number of requests.
                 (default: {download_modified_or_new})
             clean {bool} -- Whether to clean up when the method finishes.
+            timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
+                             requests bug.
         """
         # This authenticator only works with the KIT ilias instance.
         authenticator = KitShibbolethAuthenticator(username=username, password=password)
@@ -214,12 +225,131 @@ class Pferd(Location):
             transform=transform,
             download_strategy=download_strategy,
             clean=clean,
+            timeout=timeout
         )

         self._download_summary.merge(organizer.download_summary)

         return organizer

+    @swallow_and_print_errors
+    def ilias_kit_folder(
+            self,
+            target: PathLike,
+            full_url: str,
+            dir_filter: IliasDirectoryFilter = lambda x, y: True,
+            transform: Transform = lambda x: x,
+            cookies: Optional[PathLike] = None,
+            username: Optional[str] = None,
+            password: Optional[str] = None,
+            download_strategy: IliasDownloadStrategy = download_modified_or_new,
+            clean: bool = True,
+            timeout: int = 5,
+    ) -> Organizer:
+        """
+        Synchronizes a folder with a given folder on the ILIAS instance of the KIT.
+
+        Arguments:
+            target {Path} -- the target path to write the data to
+            full_url {str} -- the full url of the folder/videos/course to crawl
+
+        Keyword Arguments:
+            dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the
+                crawler level, these directories and all of their content is skipped.
+                (default: {lambdax:True})
+            transform {Transform} -- A transformation function for the output paths. Return None
+                to ignore a file. (default: {lambdax:x})
+            cookies {Optional[Path]} -- The path to store and load cookies from.
+                (default: {None})
+            username {Optional[str]} -- The SCC username. If none is given, it will prompt
+                the user. (default: {None})
+            password {Optional[str]} -- The SCC password. If none is given, it will prompt
+                the user. (default: {None})
+            download_strategy {DownloadStrategy} -- A function to determine which files need to
+                be downloaded. Can save bandwidth and reduce the number of requests.
+                (default: {download_modified_or_new})
+            clean {bool} -- Whether to clean up when the method finishes.
+            timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
+                             requests bug.
+        """
+        # This authenticator only works with the KIT ilias instance.
+        authenticator = KitShibbolethAuthenticator(username=username, password=password)
+        PRETTY.starting_synchronizer(target, "ILIAS", "An ILIAS element by url")
+
+        if not full_url.startswith("https://ilias.studium.kit.edu"):
+            raise FatalException("Not a valid KIT ILIAS URL")
+
+        organizer = self._ilias(
+            target=target,
+            base_url="https://ilias.studium.kit.edu/",
+            crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url),
+            authenticator=authenticator,
+            cookies=cookies,
+            dir_filter=dir_filter,
+            transform=transform,
+            download_strategy=download_strategy,
+            clean=clean,
+            timeout=timeout
+        )
+
+        self._download_summary.merge(organizer.download_summary)
+
+        return organizer
+
+    @swallow_and_print_errors
+    def ipd_kit(
+            self,
+            target: Union[PathLike, Organizer],
+            url: str,
+            transform: Transform = lambda x: x,
+            download_strategy: IpdDownloadStrategy = ipd_download_new_or_modified,
+            clean: bool = True
+    ) -> Organizer:
+        """
+        Synchronizes a folder with a DIVA playlist.
+
+        Arguments:
+            target {Union[PathLike, Organizer]} -- The organizer / target folder to use.
+            url {str} -- the url to the page
+
+        Keyword Arguments:
+            transform {Transform} -- A transformation function for the output paths. Return None
+                to ignore a file. (default: {lambdax:x})
+            download_strategy {DivaDownloadStrategy} -- A function to determine which files need to
+                be downloaded. Can save bandwidth and reduce the number of requests.
+                (default: {diva_download_new})
+            clean {bool} -- Whether to clean up when the method finishes.
+        """
+        tmp_dir = self._tmp_dir.new_subdir()
+
+        if target is None:
+            PRETTY.starting_synchronizer("None", "IPD", url)
+            raise FatalException("Got 'None' as target directory, aborting")
+
+        if isinstance(target, Organizer):
+            organizer = target
+        else:
+            organizer = Organizer(self.resolve(to_path(target)))
+
+        PRETTY.starting_synchronizer(organizer.path, "IPD", url)
+
+        elements: List[IpdDownloadInfo] = IpdCrawler(url).crawl()
+        transformed = apply_transform(transform, elements)
+
+        if self._test_run:
+            self._print_transformables(transformed)
+            return organizer
+
+        downloader = IpdDownloader(tmp_dir=tmp_dir, organizer=organizer, strategy=download_strategy)
+        downloader.download_all(transformed)
+
+        if clean:
+            organizer.cleanup()
+
+        self._download_summary.merge(organizer.download_summary)
+
+        return organizer
+
     @swallow_and_print_errors
     def diva_kit(
             self,
@@ -278,4 +408,6 @@ class Pferd(Location):
         if clean:
             organizer.cleanup()

+        self._download_summary.merge(organizer.download_summary)
+
         return organizer

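For orientation, a sketch of how the two new entry points might be called from an `example_config.py`-style script; the URLs and folder names are placeholders, not taken from this diff:

```
from pathlib import Path

from PFERD import Pferd
from PFERD.transform import sanitize_windows_path

pferd = Pferd(Path(__file__).parent)
pferd.enable_logging()

# Crawl a single ILIAS folder/course/video list by URL (placeholder URL).
pferd.ilias_kit_folder(
    target="Vorlesung",
    full_url="https://ilias.studium.kit.edu/goto.php?target=fold_1234567",
    cookies="ilias_cookies.txt",
    transform=sanitize_windows_path,
)

# Mirror the PDFs of an IPD lecture page (placeholder URL).
pferd.ipd_kit(
    target="Paradigmen",
    url="https://pp.ipd.kit.edu/lehre/WS202021/paradigmen/",
)
```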
PFERD/transform.py

@@ -5,6 +5,8 @@ only files whose names match a regex, or renaming files from one numbering
 scheme to another.
 """

+import os
+import re
 from dataclasses import dataclass
 from pathlib import PurePath
 from typing import Callable, List, Optional, TypeVar
@@ -45,7 +47,8 @@ def apply_transform(

 # Transform combinators

-keep = lambda path: path
+def keep(path: PurePath) -> Optional[PurePath]:
+    return path


 def attempt(*args: Transform) -> Transform:
     def inner(path: PurePath) -> Optional[PurePath]:
@@ -125,3 +128,15 @@ def re_rename(regex: Regex, target: str) -> Transform:
             return path.with_name(target.format(*groups))
         return None
     return inner
+
+
+def sanitize_windows_path(path: PurePath) -> Optional[PurePath]:
+    """
+    A small function to escape characters that are forbidden in windows path names.
+    This method is a no-op on other operating systems.
+    """
+    # Escape windows illegal path characters
+    if os.name == 'nt':
+        sanitized_parts = [re.sub(r'[<>:"/|?]', "_", x) for x in list(path.parts)]
+        return PurePath(*sanitized_parts)
+    return path

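Since `sanitize_windows_path` is a no-op everywhere except Windows (`os.name == 'nt'`), here is a small sketch that applies the same substitution unconditionally to show the effect; the file name is made up:

```
import re
from pathlib import PurePath


def _sanitize_like_windows(path: PurePath) -> PurePath:
    # Same substitution sanitize_windows_path applies when os.name == 'nt':
    # each forbidden character in every path part becomes "_".
    parts = [re.sub(r'[<>:"/|?]', "_", part) for part in path.parts]
    return PurePath(*parts)


print(_sanitize_like_windows(PurePath('Vorlesung/Was ist "Zeit"?.pdf')))
# Vorlesung/Was ist _Zeit__.pdf
```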
README.md

@@ -2,6 +2,7 @@
 **P**rogramm zum **F**lotten, **E**infachen **R**unterladen von **D**ateien

+- [Quickstart with `sync_url`](#quickstart-with-sync_url)
 - [Installation](#installation)
 - [Upgrading from 2.0.0 to 2.1.0+](#upgrading-from-200-to-210)
 - [Example setup](#example-setup)
@@ -12,6 +13,23 @@
   - [Transform combinators](#transform-combinators)
   - [A short, but commented example](#a-short-but-commented-example)

+## Quickstart with `sync_url`
+
+The `sync_url` program allows you to just synchronize a given ILIAS URL (of a
+course, a folder, your personal desktop, etc.) without any extra configuration
+or setting up. Download the program, open ILIAS, copy the URL from the address
+bar and pass it to sync_url.
+
+It bundles everything it needs in one executable and is easy to
+use, but doesn't expose all the configuration options and tweaks a full install
+does.
+
+1. Download the `sync_url` binary from the [latest release](https://github.com/Garmelon/PFERD/releases/latest).
+2. Recognize that you most likely need to enclose the URL in `""` quotes to prevent your shell from interpreting `&` and other symbols
+3. Run the binary in your terminal (`./sync_url` or `sync_url.exe` in the CMD) to see the help and use it. I'd recommend using the `--cookies` option.
+
+If you are on **Linux/Mac**, you need to *make the file executable* using `chmod +x <file>`.
+If you are on **Mac**, you need to allow this unverified program to run (see e.g. [here](https://www.switchingtomac.com/tutorials/osx/how-to-run-unverified-apps-on-macos/))
+
 ## Installation

 Ensure that you have at least Python 3.8 installed.
@@ -19,7 +37,7 @@ Ensure that you have at least Python 3.8 installed.
 To install PFERD or update your installation to the latest version, run this
 wherever you want to install or have already installed PFERD:
 ```
-$ pip install git+https://github.com/Garmelon/PFERD@v2.2.1
+$ pip install git+https://github.com/Garmelon/PFERD@v2.4.5
 ```

 The use of [venv] is recommended.
@@ -42,8 +60,8 @@ $ mkdir Vorlesungen
 $ cd Vorlesungen
 $ python3 -m venv .venv
 $ .venv/bin/activate
-$ pip install git+https://github.com/Garmelon/PFERD@v2.2.1
-$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.2.1/example_config.py
+$ pip install git+https://github.com/Garmelon/PFERD@v2.4.5
+$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.5/example_config.py
 $ python3 example_config.py
 $ deactivate
 ```

setup.py

@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
 setup(
     name="PFERD",
-    version="2.2.1",
+    version="2.4.5",
     packages=find_packages(),
     install_requires=[
         "requests>=2.21.0",

sync_url.py (new file, executable)

@@ -0,0 +1,69 @@
#!/usr/bin/env python

"""
A simple script to download a course by name from ILIAS.
"""

import argparse
from pathlib import Path
from urllib.parse import urlparse

from PFERD import Pferd
from PFERD.cookie_jar import CookieJar
from PFERD.ilias import (IliasCrawler, IliasElementType,
                         KitShibbolethAuthenticator)
from PFERD.transform import sanitize_windows_path
from PFERD.utils import to_path


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--test-run", action="store_true")
    parser.add_argument('-c', '--cookies', nargs='?', default=None, help="File to store cookies in")
    parser.add_argument('--no-videos', nargs='?', default=None, help="Don't download videos")
    parser.add_argument('url', help="URL to the course page")
    parser.add_argument('folder', nargs='?', default=None, help="Folder to put stuff into")
    args = parser.parse_args()

    url = urlparse(args.url)

    cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None)
    session = cookie_jar.create_session()
    authenticator = KitShibbolethAuthenticator()
    crawler = IliasCrawler(url.scheme + '://' + url.netloc, session,
                           authenticator, lambda x, y: True)

    cookie_jar.load_cookies()

    if args.folder is not None:
        folder = args.folder
        # Initialize pferd at the *parent of the passed folder*
        # This is needed so Pferd's internal protections against escaping the working directory
        # do not trigger (e.g. if somebody names a file in ILIAS '../../bad thing.txt')
        pferd = Pferd(Path(Path(__file__).parent, folder).parent, test_run=args.test_run)
    else:
        # fetch course name from ilias
        folder = crawler.find_element_name(args.url)
        cookie_jar.save_cookies()

        # Initialize pferd at the location of the script
        pferd = Pferd(Path(__file__).parent, test_run=args.test_run)

    def dir_filter(_: Path, element: IliasElementType) -> bool:
        if args.no_videos:
            return element not in [IliasElementType.VIDEO_FILE, IliasElementType.VIDEO_FOLDER]
        return True

    pferd.enable_logging()
    # fetch
    pferd.ilias_kit_folder(
        target=folder,
        full_url=args.url,
        cookies=args.cookies,
        dir_filter=dir_filter,
        transform=sanitize_windows_path
    )


if __name__ == "__main__":
    main()