Compare commits


38 Commits

Author SHA1 Message Date
8e8c1c031a Version 2.3.0 2020-09-03 21:47:10 +02:00
55678d7fee Pass string down to FileCookieJar
Some python versions just can't handle it *despite the documentation
stating they should*.
2020-08-12 09:09:14 +02:00
a57ee8b96b Add timeout to video downloads to work around requests IPv6 bug 2020-08-11 14:40:30 +02:00
e367da925e Bump version to 2.2.1 2020-07-28 19:55:32 +00:00
77a109bb7e Fix ilias shibboleth authenticator
The shibboleth site got a visual overhaul that slightly changed the classes of a
form we need.
2020-07-28 19:13:51 +00:00
a3e1864a26 Allow long paths on windows
If you start PFERD a few folders deep in your home directory, it is
quite easy to reach the maximum path length limit on Windows (260
chars). This patch opts in to long paths ("\\?\" prefix) which lift that
restriction at the cost of ugly path names.
2020-07-25 13:44:49 +02:00
41cbcc509c Update version to 2.2.0 2020-07-15 22:47:44 +02:00
77874b432b Also add personal_desktop to download summary 2020-07-15 22:47:44 +02:00
5c4c785e60 Fix HTML file downloading
Previously PFERD thought any HTML file was an "Error, no access" page
when downloading. Now it checks whether ILIAS sends a
content-disposition header, telling the browser to download the file. If
that is the case, it was just an HTML file uploaded to ILIAS. If it has
no header, it is probably an error message.
2020-07-15 15:12:14 +02:00
2aed4f6d1f Only query the dir_filter for directories 2020-07-13 13:36:12 +02:00
34152fbe54 Set mtime and atime to ILIAS dates where possible 2020-07-13 13:29:18 +02:00
4047fe78f3 Fix README formatting 2020-07-11 18:22:33 +00:00
c28347122e Improve README
- Added a table of contents
- Reworked the transform section
- Fixed the commented example
2020-07-11 18:16:33 +00:00
5b38ab8cf1 Add MIT license 2020-07-08 09:46:27 +00:00
bb25d32f03 Fix typo in README 2020-06-29 16:18:33 +02:00
ecaedea709 Merge pull request #8 from pavelzw/master
Fix version number
2020-06-26 17:52:05 +02:00
f05d1b1261 Fix version number 2020-06-26 17:49:47 +02:00
6aaa3071f9 Update README with new version 2020-06-26 17:35:03 +02:00
c26c9352f1 Make DownloadSummary private, provide property accessors 2020-06-26 17:30:45 +02:00
d9ea688145 Use pretty logger for summaries 2020-06-26 17:24:36 +02:00
e8be6e498e Add summary to example_config_personal_desktop 2020-06-26 17:24:36 +02:00
e4b1fac045 Satisfy pylint 2020-06-26 15:38:22 +02:00
402ae81335 Fix type hints 2020-06-26 13:17:44 +00:00
52f31e2783 Add type hints to DownloadSummary 2020-06-26 13:02:37 +02:00
739522a151 Move download summary into a separate class 2020-06-25 23:07:11 +02:00
6c034209b6 Add deleted files to summary 2020-06-25 22:00:28 +02:00
f6fbd5e4bb Add download summary 2020-06-25 19:19:34 +02:00
7024db1f13 Use transient progress bar
This will ensure no pesky newline ends up in the output, even on
Windows.
2020-06-25 18:03:12 +02:00
23bfa42a0d Never use the direct download button, as it is currently broken 2020-06-11 13:31:01 +02:00
fdb57884ed Touch files with same content to update timestamps 2020-05-31 20:27:15 +02:00
f614b95a00 Adjust version in setup.py 2020-05-30 19:07:02 +02:00
8198c9ecaa Reorder methods a bit 2020-05-30 19:06:36 +02:00
086b15d10f Crawl a bit more iteratively 2020-05-30 15:47:15 +02:00
9d6ce331a5 Use IliasCrawlerEntry entries in the ilias scraper 2020-05-30 15:20:51 +02:00
821c7ade26 Move video url extraction logic to crawler 2020-05-30 00:22:31 +02:00
b969a1854a Remove unneeded whitespace 2020-05-30 00:22:31 +02:00
62535b4452 Unpack videos in ILIAS downloader 2020-05-21 22:12:52 +02:00
c0056e5669 Correctly crawl video pages with multiple pages 2020-05-21 21:38:07 +02:00
16 changed files with 672 additions and 261 deletions

3
.gitignore vendored

@ -1,5 +1,8 @@
__pycache__/
.venv/
venv/
.idea/
build/
.mypy_cache/
.tmp/
.env

18
LICENSE Normal file

@ -0,0 +1,18 @@
Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

PFERD/cookie_jar.py

@ -22,7 +22,7 @@ class CookieJar:
if cookie_file is None:
self._cookies = LWPCookieJar()
else:
self._cookies = LWPCookieJar(cookie_file)
self._cookies = LWPCookieJar(str(cookie_file.resolve()))
@property
def cookies(self) -> LWPCookieJar:
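A minimal standalone sketch of the pattern this change adopts: converting the `pathlib.Path` to a string before handing it to `LWPCookieJar` sidesteps the version-dependent path handling the commit message complains about.

```py
from http.cookiejar import LWPCookieJar
from pathlib import Path

cookie_file = Path("ilias_cookies.txt")

# Some Python versions mishandle a pathlib.Path here, despite the
# documentation suggesting path-like objects are accepted, so the
# explicit str() conversion is the portable choice.
jar = LWPCookieJar(str(cookie_file.resolve()))
jar.save()  # writes the (possibly empty) cookie file
jar.load()  # reads it back in
```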

69
PFERD/download_summary.py Normal file

@ -0,0 +1,69 @@
"""
Provides a summary that keeps track of new, modified or deleted files.
"""
from pathlib import Path
from typing import List
class DownloadSummary:
"""
Keeps track of all new, modified or deleted files and provides a summary.
"""
def __init__(self) -> None:
self._new_files: List[Path] = []
self._modified_files: List[Path] = []
self._deleted_files: List[Path] = []
@property
def new_files(self) -> List[Path]:
"""
Returns all new files.
"""
return self._new_files.copy()
@property
def modified_files(self) -> List[Path]:
"""
Returns all modified files.
"""
return self._modified_files.copy()
@property
def deleted_files(self) -> List[Path]:
"""
Returns all deleted files.
"""
return self._deleted_files.copy()
def merge(self, summary: 'DownloadSummary') -> None:
"""
Merges ourselves with the passed summary. Modifies this object, but not the passed one.
"""
self._new_files += summary.new_files
self._modified_files += summary.modified_files
self._deleted_files += summary.deleted_files
def add_deleted_file(self, path: Path) -> None:
"""
Registers a file as deleted.
"""
self._deleted_files.append(path)
def add_modified_file(self, path: Path) -> None:
"""
Registers a file as changed.
"""
self._modified_files.append(path)
def add_new_file(self, path: Path) -> None:
"""
Registers a file as new.
"""
self._new_files.append(path)
def has_updates(self) -> bool:
"""
Returns whether this summary has any updates.
"""
return bool(self._new_files or self._modified_files or self._deleted_files)
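The class is self-contained, so its intended use is easy to sketch:

```py
from pathlib import Path

from PFERD.download_summary import DownloadSummary

summary = DownloadSummary()
summary.add_new_file(Path("Vorlesung/VL01.pdf"))
summary.add_modified_file(Path("Vorlesung/VL02.pdf"))

other = DownloadSummary()
other.add_deleted_file(Path("Tutorium/old_sheet.pdf"))

# merge() copies the entries over; `other` itself stays untouched
summary.merge(other)

assert summary.has_updates()
print(summary.deleted_files)  # the single deleted path
```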

PFERD/ilias/__init__.py

@ -3,7 +3,8 @@ Synchronizing files from ILIAS instances (https://www.ilias.de/).
"""
from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator
from .crawler import IliasCrawler, IliasDirectoryFilter, IliasDirectoryType
from .crawler import (IliasCrawler, IliasCrawlerEntry, IliasDirectoryFilter,
IliasElementType)
from .downloader import (IliasDownloader, IliasDownloadInfo,
IliasDownloadStrategy, download_everything,
download_modified_or_new)

PFERD/ilias/authenticators.py

@ -67,7 +67,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
while not self._login_successful(soup):
# Searching the form here so that this fails before asking for
# credentials rather than after asking.
form = soup.find("form", {"class": "form2", "method": "post"})
form = soup.find("form", {"class": "full content", "method": "post"})
action = form["action"]
# Equivalent: Enter credentials in
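For reference, a sketch of what the new selector matches. The markup below is illustrative only, not copied from the actual Shibboleth page:

```py
import bs4

# Illustrative shape of the overhauled login page: the form's class
# changed from "form2" to "full content", the method stayed "post".
html = """
<form class="full content" method="post" action="/idp/profile/Authn">
    <input name="j_username"/>
    <input name="j_password"/>
</form>
"""

soup = bs4.BeautifulSoup(html, "html.parser")
# bs4 matches the exact class string here, as in the fixed authenticator
form = soup.find("form", {"class": "full content", "method": "post"})
print(form["action"])  # /idp/profile/Authn
```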

PFERD/ilias/crawler.py

@ -8,7 +8,7 @@ import logging
import re
from enum import Enum
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
urlunsplit)
@ -26,16 +26,58 @@ LOGGER = logging.getLogger(__name__)
PRETTY = PrettyLogger(LOGGER)
class IliasDirectoryType(Enum):
class IliasElementType(Enum):
"""
The type of an ilias directory.
The type of an ilias element.
"""
FOLDER = "FOLDER"
VIDEO = "VIDEO"
EXERCISE = "EXERCISE"
REGULAR_FOLDER = "REGULAR_FOLDER"
VIDEO_FOLDER = "VIDEO_FOLDER"
EXERCISE_FOLDER = "EXERCISE_FOLDER"
REGULAR_FILE = "REGULAR_FILE"
VIDEO_FILE = "VIDEO_FILE"
FORUM = "FORUM"
EXTERNAL_LINK = "EXTERNAL_LINK"
def is_folder(self) -> bool:
"""
Returns whether this type is some kind of folder.
"""
return "FOLDER" in str(self.name)
IliasDirectoryFilter = Callable[[Path, IliasDirectoryType], bool]
IliasDirectoryFilter = Callable[[Path, IliasElementType], bool]
class IliasCrawlerEntry:
# pylint: disable=too-few-public-methods
"""
An ILIAS crawler entry used internally to find, catalogue and recursively crawl elements.
"""
def __init__(
self,
path: Path,
url: Union[str, Callable[[], Optional[str]]],
entry_type: IliasElementType,
modification_date: Optional[datetime.datetime]
):
self.path = path
if isinstance(url, str):
str_url = url
self.url: Callable[[], Optional[str]] = lambda: str_url
else:
self.url = url
self.entry_type = entry_type
self.modification_date = modification_date
def to_download_info(self) -> Optional[IliasDownloadInfo]:
"""
Converts this crawler entry to an IliasDownloadInfo, if possible.
This method will only succeed for *File* types.
"""
if self.entry_type in [IliasElementType.REGULAR_FILE, IliasElementType.VIDEO_FILE]:
return IliasDownloadInfo(self.path, self.url, self.modification_date)
return None
class IliasCrawler:
@ -62,12 +104,6 @@ class IliasCrawler:
self._authenticator = authenticator
self.dir_filter = dir_filter
def _abs_url_from_link(self, link_tag: bs4.Tag) -> str:
"""
Create an absolute url from an <a> tag.
"""
return urljoin(self._base_url, link_tag.get("href"))
@staticmethod
def _url_set_query_param(url: str, param: str, value: str) -> str:
"""
@ -102,7 +138,8 @@ class IliasCrawler:
)
# And treat it as a folder
return self._crawl_folder(Path(""), root_url)
entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), root_url)
return self._iterate_entries_to_download_infos(entries)
def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
response: requests.Response = self._session.get(root_url)
@ -115,14 +152,92 @@ class IliasCrawler:
Raises:
FatalException: if an unrecoverable error occurs
"""
return self._crawl_folder(Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI")
entries: List[IliasCrawlerEntry] = self._crawl_folder(
Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI"
)
return self._iterate_entries_to_download_infos(entries)
def _switch_on_crawled_type(
def _iterate_entries_to_download_infos(
self,
entries: List[IliasCrawlerEntry]
) -> List[IliasDownloadInfo]:
result: List[IliasDownloadInfo] = []
entries_to_process: List[IliasCrawlerEntry] = entries.copy()
while len(entries_to_process) > 0:
entry = entries_to_process.pop()
if entry.entry_type == IliasElementType.EXTERNAL_LINK:
PRETTY.not_searching(entry.path, "external link")
continue
if entry.entry_type == IliasElementType.FORUM:
PRETTY.not_searching(entry.path, "forum")
continue
if entry.entry_type.is_folder() and not self.dir_filter(entry.path, entry.entry_type):
PRETTY.not_searching(entry.path, "user filter")
continue
download_info = entry.to_download_info()
if download_info is not None:
result.append(download_info)
continue
url = entry.url()
if url is None:
PRETTY.warning(f"Could not find url for {str(entry.path)!r}, skipping it")
continue
PRETTY.searching(entry.path)
if entry.entry_type == IliasElementType.EXERCISE_FOLDER:
entries_to_process += self._crawl_exercises(entry.path, url)
continue
if entry.entry_type == IliasElementType.REGULAR_FOLDER:
entries_to_process += self._crawl_folder(entry.path, url)
continue
if entry.entry_type == IliasElementType.VIDEO_FOLDER:
entries_to_process += self._crawl_video_directory(entry.path, url)
continue
return result
def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]:
"""
Crawl all files in a folder-like element.
"""
soup = self._get_page(url, {})
result: List[IliasCrawlerEntry] = []
# Fetch all links and throw them to the general interpreter
links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle")
for link in links:
abs_url = self._abs_url_from_link(link)
element_path = Path(folder_path, link.getText().strip())
element_type = self._find_type_from_link(element_path, link, abs_url)
if element_type == IliasElementType.REGULAR_FILE:
result += self._crawl_file(folder_path, link, abs_url)
elif element_type is not None:
result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)]
else:
PRETTY.warning(f"Found element without a type at {str(element_path)!r}")
return result
def _abs_url_from_link(self, link_tag: bs4.Tag) -> str:
"""
Create an absolute url from an <a> tag.
"""
return urljoin(self._base_url, link_tag.get("href"))
@staticmethod
def _find_type_from_link(
path: Path,
link_element: bs4.Tag,
url: str
) -> List[IliasDownloadInfo]:
) -> Optional[IliasElementType]:
"""
Decides which sub crawler to use for a given top level element.
"""
@ -131,28 +246,64 @@ class IliasCrawler:
# file URLs contain "target=file"
if "target=file_" in parsed_url.query:
LOGGER.debug("Interpreted as file.")
return self._crawl_file(path, link_element, url)
return IliasElementType.REGULAR_FILE
# Skip forums
if "cmd=showThreads" in parsed_url.query:
LOGGER.debug("Skipping forum %r", url)
return []
return IliasElementType.FORUM
# Everything with a ref_id can *probably* be opened to reveal nested things
# video groups, directories, exercises, etc
if "ref_id=" in parsed_url.query:
LOGGER.debug("Processing folder-like...")
return self._switch_on_folder_like(path, link_element, url)
return IliasCrawler._find_type_from_folder_like(link_element, url)
PRETTY.warning(
"Got unkwarning element type in switch. I am not sure what horror I found on the"
"Got unknown element type in switch. I am not sure what horror I found on the"
f" ILIAS page. The element was at {str(path)!r} and it is {link_element!r})"
)
return []
return None
@staticmethod
def _crawl_file(path: Path, link_element: bs4.Tag, url: str) -> List[IliasDownloadInfo]:
def _find_type_from_folder_like(link_element: bs4.Tag, url: str) -> Optional[IliasElementType]:
"""
Try crawling something that looks like a folder.
"""
# pylint: disable=too-many-return-statements
found_parent: Optional[bs4.Tag] = None
# We look for the outer div of our inner link, to find information around it
# (mostly the icon)
for parent in link_element.parents:
if "ilContainerListItemOuter" in parent["class"]:
found_parent = parent
break
if found_parent is None:
PRETTY.warning(f"Could not find element icon for {url!r}")
return None
# Find the small descriptive icon to figure out the type
img_tag: Optional[bs4.Tag] = found_parent.select_one("img.ilListItemIcon")
if img_tag is None:
PRETTY.warning(f"Could not find image tag for {url!r}")
return None
if "opencast" in str(img_tag["alt"]).lower():
return IliasElementType.VIDEO_FOLDER
if str(img_tag["src"]).endswith("icon_exc.svg"):
return IliasElementType.EXERCISE_FOLDER
if str(img_tag["src"]).endswith("icon_webr.svg"):
return IliasElementType.EXTERNAL_LINK
if str(img_tag["src"]).endswith("frm.svg"):
return IliasElementType.FORUM
return IliasElementType.REGULAR_FOLDER
@staticmethod
def _crawl_file(path: Path, link_element: bs4.Tag, url: str) -> List[IliasCrawlerEntry]:
"""
Crawls a file.
"""
@ -183,80 +334,11 @@ class IliasCrawler:
name = link_element.getText()
full_path = Path(path, name + "." + file_type)
return [IliasDownloadInfo(full_path, url, modification_date)]
return [
IliasCrawlerEntry(full_path, url, IliasElementType.REGULAR_FILE, modification_date)
]
def _switch_on_folder_like(
self,
parent_path: Path,
link_element: bs4.Tag,
url: str
) -> List[IliasDownloadInfo]:
"""
Try crawling something that looks like a folder.
"""
# pylint: disable=too-many-return-statements
element_path = Path(parent_path, link_element.getText().strip())
found_parent: Optional[bs4.Tag] = None
# We look for the outer div of our inner link, to find information around it
# (mostly the icon)
for parent in link_element.parents:
if "ilContainerListItemOuter" in parent["class"]:
found_parent = parent
break
if found_parent is None:
PRETTY.warning(f"Could not find element icon for {url!r}")
return []
# Find the small descriptive icon to figure out the type
img_tag: Optional[bs4.Tag] = found_parent.select_one("img.ilListItemIcon")
if img_tag is None:
PRETTY.warning(f"Could not find image tag for {url!r}")
return []
directory_type = IliasDirectoryType.FOLDER
if "opencast" in str(img_tag["alt"]).lower():
directory_type = IliasDirectoryType.VIDEO
if str(img_tag["src"]).endswith("icon_exc.svg"):
directory_type = IliasDirectoryType.EXERCISE
if not self.dir_filter(element_path, directory_type):
PRETTY.not_searching(element_path, "user filter")
return []
PRETTY.searching(element_path)
# A forum
if str(img_tag["src"]).endswith("frm.svg"):
LOGGER.debug("Skipping forum at %r", url)
PRETTY.not_searching(element_path, "forum")
return []
# An exercise
if directory_type == IliasDirectoryType.EXERCISE:
LOGGER.debug("Crawling exercises at %r", url)
return self._crawl_exercises(element_path, url)
if str(img_tag["src"]).endswith("icon_webr.svg"):
LOGGER.debug("Skipping external link at %r", url)
PRETTY.not_searching(element_path, "external link")
return []
# Match the opencast video plugin
if directory_type == IliasDirectoryType.VIDEO:
LOGGER.debug("Found video site: %r", url)
return self._crawl_video_directory(element_path, url)
# Assume it is a folder
return self._crawl_folder(element_path, self._abs_url_from_link(link_element))
def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasDownloadInfo]:
def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasCrawlerEntry]:
"""
Crawl the video overview site.
"""
@ -272,6 +354,71 @@ class IliasCrawler:
{"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
)
# If we find a page selected, we probably need to respect pagination
if self._is_paginated_video_page(video_list_soup):
second_stage_url = self._abs_url_from_link(content_link)
return self._crawl_paginated_video_directory(
video_dir_path, video_list_soup, second_stage_url
)
return self._crawl_video_directory_second_stage(video_dir_path, video_list_soup)
@staticmethod
def _is_paginated_video_page(soup: bs4.BeautifulSoup) -> bool:
return soup.find(id=re.compile(r"tab_page_sel.+")) is not None
def _crawl_paginated_video_directory(
self,
video_dir_path: Path,
paged_video_list_soup: bs4.BeautifulSoup,
second_stage_url: str
) -> List[IliasCrawlerEntry]:
LOGGER.info("Found paginated video page, trying 800 elements")
# Try to find the table id. This can be used to build the query parameter indicating
# you want 800 elements
table_element: bs4.Tag = paged_video_list_soup.find(
name="table", id=re.compile(r"tbl_xoct_.+")
)
if table_element is None:
PRETTY.warning(
"Could not increase elements per page (table not found)."
" Some might not be crawled!"
)
return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup)
match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"])
if match is None:
PRETTY.warning(
"Could not increase elements per page (table id not found)."
" Some might not be crawled!"
)
return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup)
table_id = match.group(1)
extended_video_page = self._get_page(
second_stage_url,
{f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
)
if self._is_paginated_video_page(extended_video_page):
PRETTY.warning(
"800 elements do not seem to be enough (or I failed to fetch that many)."
" I will miss elements."
)
return self._crawl_video_directory_second_stage(video_dir_path, extended_video_page)
def _crawl_video_directory_second_stage(
self,
video_dir_path: Path,
video_list_soup: bs4.BeautifulSoup
) -> List[IliasCrawlerEntry]:
"""
Crawls the "second stage" video page. This page contains the actual video urls.
"""
direct_download_links: List[bs4.Tag] = video_list_soup.findAll(
name="a", text=re.compile(r"\s*Download\s*")
)
@ -281,10 +428,11 @@ class IliasCrawler:
name="a", text=re.compile(r"\s*Abspielen\s*")
)
results: List[IliasDownloadInfo] = []
results: List[IliasCrawlerEntry] = []
# We can download everything directly!
if len(direct_download_links) == len(video_links):
# FIXME: Sadly the download button is currently broken, so never do that
if False and len(direct_download_links) == len(video_links):
for link in direct_download_links:
results += self._crawl_single_video(video_dir_path, link, True)
else:
@ -298,7 +446,7 @@ class IliasCrawler:
parent_path: Path,
link: bs4.Tag,
direct_download: bool
) -> List[IliasDownloadInfo]:
) -> List[IliasCrawlerEntry]:
"""
Crawl a single video based on its "Abspielen" link from the video listing.
"""
@ -316,46 +464,54 @@ class IliasCrawler:
video_path: Path = Path(parent_path, title)
video_url = self._abs_url_from_link(link)
# The video had a direct download button we can use instead
if direct_download:
LOGGER.debug("Using direct download for video %r", str(video_path))
return [IliasDownloadInfo(
return [IliasCrawlerEntry(
video_path, video_url, IliasElementType.VIDEO_FILE, modification_time
)]
return [IliasCrawlerEntry(
video_path,
self._abs_url_from_link(link),
self._crawl_video_url_from_play_link(video_url),
IliasElementType.VIDEO_FILE,
modification_time
)]
def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Optional[str]]:
def inner() -> Optional[str]:
# Fetch the actual video page. This is a small wrapper page initializing a javascript
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
# on the page, but defined in a JS object inside a script tag, passed to the player
# library.
# We do the impossible and RegEx the stream JSON object out of the page's HTML source
video_page_url = self._abs_url_from_link(link)
video_page_soup = self._get_page(video_page_url, {})
video_page_soup = soupify(self._session.get(play_url))
regex: re.Pattern = re.compile(
r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE
)
json_match = regex.search(str(video_page_soup))
if json_match is None:
PRETTY.warning(f"Could not find json stream info for {video_page_url!r}")
return []
PRETTY.warning(f"Could not find json stream info for {play_url!r}")
return None
json_str = json_match.group(1)
# parse it
json_object = json.loads(json_str)
# and fetch the video url!
video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"]
return video_url
return inner
return [IliasDownloadInfo(video_path, video_url, modification_time)]
def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasDownloadInfo]:
def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasCrawlerEntry]:
"""
Crawl files offered for download in exercises.
"""
soup = self._get_page(url, {})
results: List[IliasDownloadInfo] = []
results: List[IliasCrawlerEntry] = []
# Each assignment is in an accordion container
assignment_containers: List[bs4.Tag] = soup.select(".il_VAccordionInnerContainer")
@ -382,30 +538,15 @@ class IliasCrawler:
LOGGER.debug("Found file %r at %r", file_name, url)
results.append(IliasDownloadInfo(
results.append(IliasCrawlerEntry(
Path(element_path, container_name, file_name),
url,
IliasElementType.REGULAR_FILE,
None # We do not have any timestamp
))
return results
def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasDownloadInfo]:
"""
Crawl all files in a folder-like element.
"""
soup = self._get_page(url, {})
result: List[IliasDownloadInfo] = []
# Fetch all links and throw them to the general interpreter
links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle")
for link in links:
abs_url = self._abs_url_from_link(link)
result += self._switch_on_crawled_type(folder_path, link, abs_url)
return result
def _get_page(self, url: str, params: Dict[str, Any]) -> bs4.BeautifulSoup:
"""
Fetches a page from ILIAS, authenticating when needed.
@ -416,8 +557,10 @@ class IliasCrawler:
content_type = response.headers["content-type"]
if not content_type.startswith("text/html"):
# TODO: Correct exception type
raise Exception(f"Invalid content type {content_type}")
raise FatalException(
f"Invalid content type {content_type} when crawling ilias page"
f" {url!r} with {params!r}"
)
soup = soupify(response)
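One detail worth spelling out from `IliasCrawlerEntry` above: eager and lazy URLs are unified behind a zero-argument callable, so video stream URLs, which cost an extra request to resolve, are only fetched when the downloader actually asks for them. A small sketch of the pattern, with made-up URLs:

```py
from typing import Callable, Optional

# Eager: the URL is already known, so wrap it in a trivial closure.
# Copying it into a local first (as the crawler does) keeps the
# attribute's Callable type unambiguous.
known_url = "https://ilias.example.edu/goto.php?target=file_123"
eager: Callable[[], Optional[str]] = lambda: known_url

# Lazy: resolving costs an extra HTTP round trip (the "Abspielen" page),
# so defer it until the closure is called. It may return None on failure.
def lazy_video_url(play_url: str) -> Callable[[], Optional[str]]:
    def inner() -> Optional[str]:
        # ... fetch play_url, regex the stream JSON out of the HTML ...
        return "https://streaming.example.edu/video.mp4"
    return inner

resolve = lazy_video_url("https://ilias.example.edu/play/42")
print(eager())    # no request needed
print(resolve())  # resolved only now
```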

PFERD/ilias/downloader.py

@ -2,9 +2,10 @@
import datetime
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, List, Optional
import math
import os
from pathlib import Path, PurePath
from typing import Callable, List, Optional, Union
import bs4
import requests
@ -24,15 +25,24 @@ class ContentTypeException(Exception):
"""Thrown when the content type of the ilias element can not be handled."""
@dataclass
class IliasDownloadInfo(Transformable):
"""
This class describes a single file to be downloaded.
"""
url: str
modification_date: Optional[datetime.datetime]
# parameters: Dict[str, Any] = field(default_factory=dict)
def __init__(
self,
path: PurePath,
url: Union[str, Callable[[], Optional[str]]],
modification_date: Optional[datetime.datetime]
):
super().__init__(path)
if isinstance(url, str):
string_url = url
self.url: Callable[[], Optional[str]] = lambda: string_url
else:
self.url = url
self.modification_date = modification_date
IliasDownloadStrategy = Callable[[Organizer, IliasDownloadInfo], bool]
@ -74,9 +84,13 @@ class IliasDownloader:
session: requests.Session,
authenticator: IliasAuthenticator,
strategy: IliasDownloadStrategy,
timeout: int = 5
):
"""
Create a new IliasDownloader.
The timeout applies to the download request only, as bwcloud uses IPv6
and requests has a problem with that: https://github.com/psf/requests/issues/5522
"""
self._tmp_dir = tmp_dir
@ -84,6 +98,7 @@ class IliasDownloader:
self._session = session
self._authenticator = authenticator
self._strategy = strategy
self._timeout = timeout
def download_all(self, infos: List[IliasDownloadInfo]) -> None:
"""
@ -108,16 +123,30 @@ class IliasDownloader:
tmp_file = self._tmp_dir.new_path()
while not self._try_download(info, tmp_file):
LOGGER.info("Retrying download: %r", info)
self._authenticator.authenticate(self._session)
self._organizer.accept_file(tmp_file, info.path)
dst_path = self._organizer.accept_file(tmp_file, info.path)
if dst_path and info.modification_date:
os.utime(
dst_path,
times=(
math.ceil(info.modification_date.timestamp()),
math.ceil(info.modification_date.timestamp())
)
)
def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool:
with self._session.get(info.url, stream=True) as response:
content_type = response.headers["content-type"]
url = info.url()
if url is None:
PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
return True
if content_type.startswith("text/html"):
# Dangit, we're probably not logged in.
with self._session.get(url, stream=True, timeout=self._timeout) as response:
content_type = response.headers["content-type"]
has_content_disposition = "content-disposition" in response.headers
if content_type.startswith("text/html") and not has_content_disposition:
if self._is_logged_in(soupify(response)):
raise ContentTypeException("Attempting to download a web page, not a file")
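The content-disposition heuristic from the "Fix HTML file downloading" commit is easy to isolate; a sketch, with a helper name of our own choosing:

```py
import requests

def looks_like_ilias_error_page(response: requests.Response) -> bool:
    # HTML that carries a content-disposition header is a genuine HTML
    # file uploaded to ILIAS; HTML without one is probably an error or
    # login page rather than the requested download.
    content_type = response.headers.get("content-type", "")
    has_disposition = "content-disposition" in response.headers
    return content_type.startswith("text/html") and not has_disposition
```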

PFERD/logging.py

@ -3,14 +3,18 @@ Contains a few logger utility functions and implementations.
"""
import logging
from typing import Optional
from pathlib import Path
from typing import List, Optional
from rich import print as rich_print
from rich._log_render import LogRender
from rich.console import Console
from rich.panel import Panel
from rich.style import Style
from rich.text import Text
from rich.theme import Theme
from .download_summary import DownloadSummary
from .utils import PathLike, to_path
STYLE = "{"
@ -111,6 +115,15 @@ class PrettyLogger:
f"[bold green]Created {self._format_path(path)}.[/bold green]"
)
def deleted_file(self, path: PathLike) -> None:
"""
A file has been deleted.
"""
self.logger.info(
f"[bold red]Deleted {self._format_path(path)}.[/bold red]"
)
def ignored_file(self, path: PathLike, reason: str) -> None:
"""
File was not downloaded or modified.
@ -138,6 +151,23 @@ class PrettyLogger:
f"([/dim]{reason}[dim]).[/dim]"
)
def summary(self, download_summary: DownloadSummary) -> None:
"""
Prints a download summary.
"""
self.logger.info("")
self.logger.info("[bold cyan]Download Summary[/bold cyan]")
if not download_summary.has_updates():
self.logger.info("[bold dim]Nothing changed![/bold dim]")
return
for new_file in download_summary.new_files:
self.new_file(new_file)
for modified_file in download_summary.modified_files:
self.modified_file(modified_file)
for deleted_files in download_summary.deleted_files:
self.deleted_file(deleted_files)
def starting_synchronizer(
self,
target_directory: PathLike,

PFERD/organizer.py

@ -5,10 +5,12 @@ A organizer is bound to a single directory.
import filecmp
import logging
import os
import shutil
from pathlib import Path, PurePath
from typing import List, Set
from typing import List, Optional, Set
from .download_summary import DownloadSummary
from .location import Location
from .logging import PrettyLogger
from .utils import prompt_yes_no
@ -32,8 +34,26 @@ class Organizer(Location):
# Keep the root dir
self._known_files.add(path.resolve())
def accept_file(self, src: Path, dst: PurePath) -> None:
"""Move a file to this organizer and mark it."""
self.download_summary = DownloadSummary()
def accept_file(self, src: Path, dst: PurePath) -> Optional[Path]:
"""
Move a file to this organizer and mark it.
Returns the path the file was moved to, to allow the caller to adjust the metadata.
As you might still need to adjust the metadata when the file was identical
(e.g. update the timestamp), the path is also returned in this case.
In all other cases (ignored, not overwritten, etc.) this method returns None.
"""
# Windows limits the path length to 260 for *some* historical reason
# If you want longer paths, you will have to add the "\\?\" prefix in front of
# your path...
# See:
# https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation
if os.name == 'nt':
src_absolute = Path("\\\\?\\" + str(src.resolve()))
dst_absolute = Path("\\\\?\\" + str(self.resolve(dst)))
else:
src_absolute = src.resolve()
dst_absolute = self.resolve(dst)
@ -49,7 +69,7 @@ class Organizer(Location):
PRETTY.warning(f"File {str(dst_absolute)!r} was already written!")
if not prompt_yes_no(f"Overwrite file?", default=False):
PRETTY.ignored_file(dst_absolute, "file was written previously")
return
return None
# Destination file is directory
if dst_absolute.exists() and dst_absolute.is_dir():
@ -57,7 +77,7 @@ class Organizer(Location):
shutil.rmtree(dst_absolute)
else:
PRETTY.warning(f"Could not add file {str(dst_absolute)!r}")
return
return None
# Destination file exists
if dst_absolute.exists() and dst_absolute.is_file():
@ -65,10 +85,12 @@ class Organizer(Location):
# Bail out, nothing more to do
PRETTY.ignored_file(dst_absolute, "same file contents")
self.mark(dst)
return
return dst_absolute
self.download_summary.add_modified_file(dst_absolute)
PRETTY.modified_file(dst_absolute)
else:
self.download_summary.add_new_file(dst_absolute)
PRETTY.new_file(dst_absolute)
# Create parent dir if needed
@ -80,6 +102,8 @@ class Organizer(Location):
self.mark(dst)
return dst_absolute
def mark(self, path: PurePath) -> None:
"""Mark a file as used so it will not get cleaned up."""
absolute_path = self.resolve(path)
@ -115,9 +139,9 @@ class Organizer(Location):
if start_dir.resolve() not in self._known_files and dir_empty:
start_dir.rmdir()
@staticmethod
def _delete_file_if_confirmed(path: Path) -> None:
def _delete_file_if_confirmed(self, path: Path) -> None:
prompt = f"Do you want to delete {path}"
if prompt_yes_no(prompt, False):
self.download_summary.add_deleted_file(path)
path.unlink()
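The Windows branch in `accept_file` above is the "Allow long paths" commit in action. Extracted into a standalone helper (the function name is ours), the opt-in looks like this:

```py
import os
from pathlib import Path

def to_extended_length_path(path: Path) -> Path:
    # Windows caps ordinary paths at 260 characters. Prefixing an
    # absolute path with \\?\ opts in to extended-length paths, at the
    # cost of uglier names; other platforms need no special handling.
    if os.name == "nt":
        return Path("\\\\?\\" + str(path.resolve()))
    return path.resolve()

print(to_extended_length_path(Path("some/deeply/nested/lecture_notes.pdf")))
```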

PFERD/pferd.py

@ -9,6 +9,7 @@ from typing import Callable, List, Optional, Union
from .cookie_jar import CookieJar
from .diva import (DivaDownloader, DivaDownloadStrategy, DivaPlaylistCrawler,
diva_download_new)
from .download_summary import DownloadSummary
from .errors import FatalException, swallow_and_print_errors
from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter,
IliasDownloader, IliasDownloadInfo, IliasDownloadStrategy,
@ -42,10 +43,10 @@ class Pferd(Location):
):
super().__init__(Path(base_dir))
self._download_summary = DownloadSummary()
self._tmp_dir = TmpDir(self.resolve(tmp_dir))
self._test_run = test_run
@staticmethod
def enable_logging() -> None:
"""
@ -54,7 +55,6 @@ class Pferd(Location):
enable_logging()
@staticmethod
def _print_transformables(transformables: List[TF]) -> None:
LOGGER.info("")
@ -72,7 +72,8 @@ class Pferd(Location):
dir_filter: IliasDirectoryFilter,
transform: Transform,
download_strategy: IliasDownloadStrategy,
clean: bool = True
timeout: int,
clean: bool = True,
) -> Organizer:
# pylint: disable=too-many-locals
cookie_jar = CookieJar(to_path(cookies) if cookies else None)
@ -81,7 +82,8 @@ class Pferd(Location):
organizer = Organizer(self.resolve(to_path(target)))
crawler = IliasCrawler(base_url, session, authenticator, dir_filter)
downloader = IliasDownloader(tmp_dir, organizer, session, authenticator, download_strategy)
downloader = IliasDownloader(tmp_dir, organizer, session,
authenticator, download_strategy, timeout)
cookie_jar.load_cookies()
info = crawl_function(crawler)
@ -112,6 +114,7 @@ class Pferd(Location):
password: Optional[str] = None,
download_strategy: IliasDownloadStrategy = download_modified_or_new,
clean: bool = True,
timeout: int = 5,
) -> Organizer:
"""
Synchronizes a folder with the ILIAS instance of the KIT.
@ -137,11 +140,14 @@ class Pferd(Location):
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
"""
# This authenticator only works with the KIT ilias instance.
authenticator = KitShibbolethAuthenticator(username=username, password=password)
PRETTY.starting_synchronizer(target, "ILIAS", course_id)
return self._ilias(
organizer = self._ilias(
target=target,
base_url="https://ilias.studium.kit.edu/",
crawl_function=lambda crawler: crawler.crawl_course(course_id),
@ -151,8 +157,19 @@ class Pferd(Location):
transform=transform,
download_strategy=download_strategy,
clean=clean,
timeout=timeout
)
self._download_summary.merge(organizer.download_summary)
return organizer
def print_summary(self) -> None:
"""
Prints the accumulated download summary.
"""
PRETTY.summary(self._download_summary)
@swallow_and_print_errors
def ilias_kit_personal_desktop(
self,
@ -164,6 +181,7 @@ class Pferd(Location):
password: Optional[str] = None,
download_strategy: IliasDownloadStrategy = download_modified_or_new,
clean: bool = True,
timeout: int = 5,
) -> Organizer:
"""
Synchronizes a folder with the ILIAS instance of the KIT. This method will crawl the ILIAS
@ -188,11 +206,14 @@ class Pferd(Location):
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
"""
# This authenticator only works with the KIT ilias instance.
authenticator = KitShibbolethAuthenticator(username=username, password=password)
PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop")
return self._ilias(
organizer = self._ilias(
target=target,
base_url="https://ilias.studium.kit.edu/",
crawl_function=lambda crawler: crawler.crawl_personal_desktop(),
@ -202,8 +223,13 @@ class Pferd(Location):
transform=transform,
download_strategy=download_strategy,
clean=clean,
timeout=timeout
)
self._download_summary.merge(organizer.download_summary)
return organizer
@swallow_and_print_errors
def diva_kit(
self,
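Putting the new plumbing together, a config for this version can pass the timeout through and print the accumulated summary at the end. The course id and paths below are placeholders, and `transform`/`dir_filter` are assumed to keep their defaults:

```py
from pathlib import Path

from PFERD import Pferd

Pferd.enable_logging()
pferd = Pferd(Path(__file__).parent)

pferd.ilias_kit(
    "My_cool_course/",
    "1234567",                          # placeholder course id
    cookies=Path("ilias_cookies.txt"),
    timeout=20,                         # new: download timeout (default 5)
)

# New in this version: one summary accumulated across all synchronizers
pferd.print_summary()
```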

PFERD/progress.py

@ -7,8 +7,7 @@ from types import TracebackType
from typing import Optional, Type
import requests
from rich.console import Console, ConsoleOptions, Control, RenderResult
from rich.live_render import LiveRender
from rich.console import Console
from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID,
TextColumn, TimeRemainingColumn,
TransferSpeedColumn)
@ -23,7 +22,8 @@ _progress: Progress = Progress(
TransferSpeedColumn(),
"",
TimeRemainingColumn(),
console=Console(file=sys.stdout)
console=Console(file=sys.stdout),
transient=True
)
@ -61,18 +61,6 @@ def progress_for(settings: Optional[ProgressSettings]) -> 'ProgressContextManage
return ProgressContextManager(settings)
class _OneLineUp(LiveRender):
"""
Render a control code for moving one line upwards.
"""
def __init__(self) -> None:
super().__init__("not rendered")
def __console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
yield Control(f"\r\x1b[1A")
class ProgressContextManager:
"""
A context manager used for displaying progress.
@ -113,9 +101,6 @@ class ProgressContextManager:
_progress.stop()
_progress.refresh()
# And we existed, so remove the line above (remove_task leaves one behind)
Console().print(_OneLineUp())
return None
def advance(self, amount: float) -> None:
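The `_OneLineUp` cursor hack is gone because rich can do this natively now: `transient=True` erases the rendered bar when the progress display stops. A minimal rich example:

```py
import time

from rich.progress import Progress

# transient=True removes the bar once the context exits, so no stray
# newline is left behind in the output, even on Windows.
with Progress(transient=True) as progress:
    task = progress.add_task("Downloading...", total=100)
    for _ in range(100):
        progress.advance(task)
        time.sleep(0.01)

print("done, and the bar above has vanished")
```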

198
README.md

@ -2,35 +2,48 @@
**P**rogramm zum **F**lotten, **E**infachen **R**unterladen von **D**ateien
- [Installation](#installation)
- [Upgrading from 2.0.0 to 2.1.0+](#upgrading-from-200-to-210)
- [Example setup](#example-setup)
- [Usage](#usage)
- [General concepts](#general-concepts)
- [Constructing transforms](#constructing-transforms)
- [Transform creators](#transform-creators)
- [Transform combinators](#transform-combinators)
- [A short, but commented example](#a-short-but-commented-example)
## Installation
Ensure that you have at least Python 3.8 installed.
To install PFERD or update your installation to the latest version, run this
wherever you want to install/have installed PFERD:
wherever you want to install or have already installed PFERD:
```
$ pip install git+https://github.com/Garmelon/PFERD@v2.0.0
$ pip install git+https://github.com/Garmelon/PFERD@v2.3.0
```
The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
The use of [venv] is recommended.
[venv]: https://docs.python.org/3/library/venv.html
### Upgrading from 2.0.0 to 2.1.0+
- The `IliasDirectoryType` type was renamed to `IliasElementType` and is now far more detailed.
The new values are: `REGULAR_FOLDER`, `VIDEO_FOLDER`, `EXERCISE_FOLDER`, `REGULAR_FILE`, `VIDEO_FILE`, `FORUM`, `EXTERNAL_LINK`.
- Forums and external links are skipped automatically if you use the `kit_ilias` helper.
## Example setup
In this example, `python3` refers to at least Python 3.8.
If you just want to get started and crawl *your entire ILIAS Desktop* instead
of a given set of courses, please replace `example_config.py` with
`example_config_personal_desktop.py` in all of the instructions below (`curl` call and
`python3` run command).
A full example setup and initial use could look like:
```
$ mkdir Vorlesungen
$ cd Vorlesungen
$ python3 -m venv .venv
$ .venv/bin/activate
$ pip install git+https://github.com/Garmelon/PFERD@v2.0.0
$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.0.0/example_config.py
$ pip install git+https://github.com/Garmelon/PFERD@v2.3.0
$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.3.0/example_config.py
$ python3 example_config.py
$ deactivate
```
@ -43,50 +56,93 @@ $ python3 example_config.py
$ deactivate
```
If you just want to get started and crawl *your entire ILIAS Desktop* instead
of a given set of courses, please replace `example_config.py` with
`example_config_personal_desktop.py` in all of the instructions below (`curl` call and
`python3` run command).
## Usage
### General concepts
A PFERD config is a normal python file that starts multiple *synchronizers*
which do all the heavy lifting. While you can create and wire them up manually,
you are encouraged to use the helper methods provided in `PFERD.Pferd`.
The synchronizers take some input arguments specific to their service and a
*transformer*. The transformer receives the computed path of an element in
ILIAS and can return either an output path (so you can rename files or move
them around as you wish) or `None` if you do not want to save the given file.
*transform*. The transform receives the computed path of an element in ILIAS and
can return either an output path (so you can rename files or move them around as
you wish) or `None` if you do not want to save the given file.
Additionally the ILIAS synchronizer allows you to define a *crawl filter*. This
filter also receives the computed path as the input, but is only called or
*directoties*. If you return `True`, the directory will be crawled and
filter also receives the computed path as the input, but is only called for
*directories*. If you return `True`, the directory will be crawled and
searched. If you return `False` the directory will be ignored and nothing in it
will be passed to the transformer.
will be passed to the transform.
In order to help you with writing your own transformers and filters, PFERD
ships with a few powerful building blocks:
### Constructing transforms
| Method | Description |
|--------|-------------|
| `glob` | Returns a transform that returns `None` if the glob does not match and the unmodified path otherwise. |
| `predicate` | Returns a transform that returns `None` if the predicate does not match the path and the unmodified path otherwise. |
| `move_dir(source, target)` | Returns a transform that moves all files from the `source` to the `target` dir. |
| `move(source, target)` | Returns a transform that moves the `source` file to `target`. |
| `rename(old, new)` | Renames a single file. |
| `re_move(regex, sub)` | Moves all files matching the given regular expression. The different captured groups are available under their index and can be used together with normal python format methods: `re_move(r"Blatt (\d+)\.pdf", "Blätter/Blatt_{1:0>2}.pdf"),`. |
| `re_rename(old, new)` | Same as `re_move` but operates on the path *names* instead of the full path. |
While transforms are just normal python functions, writing them by hand can
quickly become tedious. In order to help you with writing your own transforms
and filters, PFERD defines a few useful transform creators and combinators in
the `PFERD.transform` module:
And PFERD also offers a few combinator functions:
#### Transform creators
* **`keep`**
`keep` just returns the input path unchanged. It can be very useful as the
last argument in an `attempt` call, to leave everything not matching a rule
unchanged.
* **`optionally(transformer)`**
Wraps a given transformer and returns its result if it is not `None`.
These methods let you create a few basic transform building blocks:
- **`glob(glob)`**
Creates a transform that returns the unchanged path if the glob matches the path and `None` otherwise.
See also [Path.match].
Example: `glob("Übung/*.pdf")`
- **`predicate(pred)`**
Creates a transform that returns the unchanged path if `pred(path)` returns a truthy value.
Returns `None` otherwise.
Example: `predicate(lambda path: len(path.parts) == 3)`
- **`move_dir(source, target)`**
Creates a transform that moves all files from the `source` to the `target` directory.
Example: `move_dir("Übung/", "Blätter/")`
- **`move(source, target)`**
Creates a transform that moves the `source` file to `target`.
Example: `move("Vorlesung/VL02_Automten.pdf", "Vorlesung/VL02_Automaten.pdf")`
- **`rename(source, target)`**
Creates a transform that renames all files named `source` to `target`.
This transform works on the file names, not paths, and thus works no matter where the file is located.
Example: `rename("VL02_Automten.pdf", "VL02_Automaten.pdf")`
- **`re_move(regex, target)`**
Creates a transform that moves all files matching `regex` to `target`.
The transform calls `str.format` on the `target` string with the contents of the capturing groups before returning it.
The capturing groups can be accessed via their index.
See also [Match.group].
Example: `re_move(r"Übung/Blatt (\d+)\.pdf", "Blätter/Blatt_{1:0>2}.pdf")`
- **`re_rename(regex, target)`**
Creates a transform that renames all files matching `regex` to `target`.
This transform works on the file names, not paths, and thus works no matter where the file is located.
Example: `re_rename(r"VL(\d+)(.*)\.pdf", "Vorlesung_Nr_{1}__{2}.pdf")`
All movement or rename transforms above return `None` if a file doesn't match
their movement or renaming criteria. This enables them to be used as building
blocks to build up more complex transforms.
In addition, `PFERD.transform` also defines the `keep` transform which returns its input path unchanged.
This behaviour can be very useful when creating more complex transforms.
See below for example usage.
[Path.match]: https://docs.python.org/3/library/pathlib.html#pathlib.Path.match
[Match.group]: https://docs.python.org/3/library/re.html#re.Match.group
#### Transform combinators
These methods let you combine transforms into more complex transforms:
- **`optionally(transform)`**
Wraps a given transform and returns its result if it is not `None`.
Otherwise returns the input path unchanged.
* **`do(transformers)`**
`do` accepts a series of transformers and applies them in the given order to
the result of the previous one. If any transformer returns `None`, do
short-circuits and also returns `None`. This can be used to perform multiple
renames in a row:
See below for example usage.
* **`do(transforms)`**
Accepts a series of transforms and applies them in the given order to the result of the previous one.
If any transform returns `None`, `do` short-circuits and also returns `None`.
This can be used to perform multiple renames in a row:
```py
do(
# Move them
@ -95,13 +151,12 @@ And PFERD also offers a few combinator functions:
optionally(re_rename("(.*).m4v.mp4", "{1}.mp4")),
# Remove the 'dbs' prefix (if they have any)
optionally(re_rename("(?i)dbs-(.+)", "{1}")),
),
)
```
* **`attempt(transformers)`**
`attempt` applies the passed transformers in the given order until it finds
one that does not return `None`. If it does not find any, it returns `None`.
This can be used to give a list of possible transformations and it will
automatically pick the first one that fits:
- **`attempt(transforms)`**
Applies the passed transforms in the given order until it finds one that does not return `None`.
If it does not find any, it returns `None`.
This can be used to give a list of possible transformations and automatically pick the first one that fits:
```py
attempt(
# Move all videos. If a video is passed in, this `re_move` will succeed
@ -114,17 +169,26 @@ And PFERD also offers a few combinator functions:
)
```
All of these combinators are used in the provided example config, if you want
to see some more true-to-live usages.
All of these combinators are used in the provided example configs, if you want
to see some more real-life usages.
### A short, but commented example
```py
def filter_course(path: PurePath) -> bool:
# Note that glob returns a Transformer
# - a function from PurePath -> Optional[PurePath]
# So we need to apply the result of 'glob' to our input path.
# We need to crawl the 'Tutorien' folder as it contains the one we want.
from pathlib import Path, PurePath
from PFERD import Pferd
from PFERD.ilias import IliasElementType
from PFERD.transform import *
# This filter will later be used by the ILIAS crawler to decide whether it
# should crawl a directory (or directory-like structure).
def filter_course(path: PurePath, type: IliasElementType) -> bool:
# Note that glob returns a Transform, which is a function from PurePath ->
# Optional[PurePath]. Because of this, we need to apply the result of
# 'glob' to our input path. The returned value will be truthy (a Path) if
# the transform succeeded, or `None` if it failed.
# We need to crawl the 'Tutorien' folder as it contains one that we want.
if glob("Tutorien/")(path):
return True
# If we found 'Tutorium 10', keep it!
@ -137,21 +201,35 @@ def filter_course(path: PurePath) -> bool:
# All other dirs (including subdirs of 'Tutorium 10') should be searched :)
return True
enable_logging() # needed once before calling a Pferd method
# Create a Pferd instance rooted in the same directory as the script file
# This is not a test run, so files will be downloaded (default, can be omitted)
# This transform will later be used to rename a few files. It can also be used
# to ignore some files.
transform_course = attempt(
# We don't care about the other tuts and would instead prefer a cleaner
# directory structure.
move_dir("Tutorien/Tutorium 10/", "Tutorium/"),
# We don't want to modify any other files, so we're going to keep them
# exactly as they are.
keep
)
# Enable and configure the text output. Needs to be called before calling any
# other PFERD methods.
Pferd.enable_logging()
# Create a Pferd instance rooted in the same directory as the script file. This
# is not a test run, so files will be downloaded (default, can be omitted).
pferd = Pferd(Path(__file__).parent, test_run=False)
# Use the ilias_kit helper to synchronize an ILIAS course
pferd.ilias_kit(
# The folder all of the course's content should be placed in
Path("My cool course"),
# The directory that all of the downloaded files should be placed in
"My_cool_course/",
# The course ID (found in the URL when on the course page in ILIAS)
"course id",
# A path to a cookie jar. If you synchronize multiple ILIAS courses, setting this
# to a common value requires you to only login once.
# A path to a cookie jar. If you synchronize multiple ILIAS courses,
# setting this to a common value requires you to only log in once.
cookies=Path("ilias_cookies.txt"),
# A transform to apply to all found paths
# A transform can rename, move or filter out certain files
transform=transform_course,
# A crawl filter limits what paths the crawler searches
dir_filter=filter_course,

example_config.py

@ -2,7 +2,7 @@ import argparse
from pathlib import Path, PurePath
from PFERD import Pferd
from PFERD.ilias import IliasDirectoryType
from PFERD.ilias import IliasElementType
from PFERD.transform import (attempt, do, glob, keep, move, move_dir,
optionally, re_move, re_rename)
@ -49,7 +49,7 @@ tf_ss_2020_pg = attempt(
)
def df_ss_2020_or1(path: PurePath, _type: IliasDirectoryType) -> bool:
def df_ss_2020_or1(path: PurePath, _type: IliasElementType) -> bool:
if glob("Tutorien/")(path):
return True
if glob("Tutorien/Tutorium 10, dienstags 15:45 Uhr/")(path):
@ -124,6 +124,8 @@ def main() -> None:
cookies="ilias_cookies.txt",
)
# Prints a summary listing all new, modified or deleted files
pferd.print_summary()
if __name__ == "__main__":
main()

example_config_personal_desktop.py

@ -30,6 +30,9 @@ def main() -> None:
cookies="ilias_cookies.txt",
)
# Prints a summary listing all new, modified or deleted files
pferd.print_summary()
if __name__ == "__main__":
main()

setup.py

@ -2,12 +2,12 @@ from setuptools import find_packages, setup
setup(
name="PFERD",
version="2.0.0",
version="2.3.0",
packages=find_packages(),
install_requires=[
"requests>=2.21.0",
"beautifulsoup4>=4.7.1",
"rich>=1.0.0"
"rich>=2.1.0"
],
)