mirror of https://github.com/Garmelon/PFERD.git, synced 2025-10-31 04:42:42 +01:00
	Delete old files
I should've done this earlier
PFERD/authenticators.py
@@ -1,214 +0,0 @@
"""
General authenticators useful in many situations
"""

import getpass
import logging
from typing import Optional, Tuple

from .logging import PrettyLogger

LOGGER = logging.getLogger(__name__)
PRETTY = PrettyLogger(LOGGER)

# keyring is an optional dependency; it is only needed by KeyringAuthenticator
try:
    import keyring
except ImportError:
    pass


class TfaAuthenticator:
    # pylint: disable=too-few-public-methods
    """
    An authenticator for a TFA token. Always prompts the user, as the token cannot be cached.
    """

    def __init__(self, reason: str):
        """
        Create a new TFA authenticator.

        Arguments:
            reason {str} -- the reason for obtaining the credentials
        """
        self._reason = reason

    def get_token(self) -> str:
        # pylint: disable=no-self-use
        """
        Prompts the user for the token and returns it.
        """
        print(f"Enter credentials ({self._reason})")
        return getpass.getpass("TFA Token: ")


class UserPassAuthenticator:
    """
    An authenticator for username-password combinations that prompts the user
    for missing information.
    """

    def __init__(
            self,
            reason: str,
            username: Optional[str] = None,
            password: Optional[str] = None,
    ) -> None:
        """
        reason   - what the credentials are used for
        username - the username (if already known)
        password - the password (if already known)
        """

        self._reason = reason

        self._given_username = username
        self._given_password = password

        self._username = username
        self._password = password

    def get_credentials(self) -> Tuple[str, str]:
        """
        Returns a tuple (username, password). Prompts user for username or
        password when necessary.
        """

        if self._username is None and self._given_username is not None:
            self._username = self._given_username

        if self._password is None and self._given_password is not None:
            self._password = self._given_password

        if self._username is None or self._password is None:
            print(f"Enter credentials ({self._reason})")

        username: str
        if self._username is None:
            username = input("Username: ")
            self._username = username
        else:
            username = self._username

        password: str
        if self._password is None:
            password = getpass.getpass(prompt="Password: ")
            self._password = password
        else:
            password = self._password

        return (username, password)

    @property
    def username(self) -> str:
        """
        The username. Accessing this property may cause the authenticator to
        prompt the user.
        """

        (username, _) = self.get_credentials()
        return username

    @property
    def password(self) -> str:
        """
        The password. Accessing this property may cause the authenticator to
        prompt the user.
        """

        (_, password) = self.get_credentials()
        return password

    def invalidate_credentials(self) -> None:
        """
        Marks the credentials as invalid. If only a username was supplied in
        the constructor, assumes that the username is valid and only the
        password is invalid. If only a password was supplied in the
        constructor, assumes that the password is valid and only the username
        is invalid. Otherwise, assumes that username and password are both
        invalid.
        """

        self._username = None
        self._password = None

        if self._given_username is not None and self._given_password is not None:
            self._given_username = None
            self._given_password = None


class KeyringAuthenticator(UserPassAuthenticator):
    """
    An authenticator for username-password combinations that stores the
    password using the system keyring service and prompts the user for missing
    information.
    """

    def get_credentials(self) -> Tuple[str, str]:
        """
        Returns a tuple (username, password). Prompts user for username or
        password when necessary.
        """

        if self._username is None and self._given_username is not None:
            self._username = self._given_username

        if self._password is None and self._given_password is not None:
            self._password = self._given_password

        if self._username is not None and self._password is None:
            self._load_password()

        if self._username is None or self._password is None:
            print(f"Enter credentials ({self._reason})")

        username: str
        if self._username is None:
            username = input("Username: ")
            self._username = username
        else:
            username = self._username

        if self._password is None:
            self._load_password()

        password: str
        if self._password is None:
            password = getpass.getpass(prompt="Password: ")
            self._password = password
            self._save_password()
        else:
            password = self._password

        return (username, password)

    def _load_password(self) -> None:
        """
        Loads the saved password associated with self._username from the system
        keyring service (or None if no password has been saved yet) and stores
        it in self._password.
        """
        self._password = keyring.get_password("pferd-ilias", self._username)

    def _save_password(self) -> None:
        """
        Saves self._password to the system keyring service and associates it
        with self._username.
        """
        keyring.set_password("pferd-ilias", self._username, self._password)

    def invalidate_credentials(self) -> None:
        """
        Marks the credentials as invalid. If only a username was supplied in
        the constructor, assumes that the username is valid and only the
        password is invalid. If only a password was supplied in the
        constructor, assumes that the password is valid and only the username
        is invalid. Otherwise, assumes that username and password are both
        invalid.
        """

        try:
            keyring.delete_password("pferd-ilias", self._username)
        except keyring.errors.PasswordDeleteError:
            pass

        super().invalidate_credentials()
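
For context, a minimal usage sketch of the authenticators above (the module path and reason string are illustrative, not part of this commit):

    from PFERD.authenticators import UserPassAuthenticator

    auth = UserPassAuthenticator("example sync", username="jdoe")
    username, password = auth.get_credentials()  # prompts only for the missing password
    auth.invalidate_credentials()  # keeps the given username, discards the cached password
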
PFERD/cookie_jar.py
@@ -1,69 +0,0 @@
"""A helper for requests cookies."""

import logging
from http.cookiejar import LoadError, LWPCookieJar
from pathlib import Path
from typing import Optional

import requests

LOGGER = logging.getLogger(__name__)


class CookieJar:
    """A cookie jar that can be persisted."""

    def __init__(self, cookie_file: Optional[Path] = None) -> None:
        """Create a new cookie jar at the given path.

        If the path is None, the cookies will not be persisted.
        """
        self._cookies: LWPCookieJar
        if cookie_file is None:
            self._cookies = LWPCookieJar()
        else:
            self._cookies = LWPCookieJar(str(cookie_file.resolve()))

    @property
    def cookies(self) -> LWPCookieJar:
        """Return the requests cookie jar."""
        return self._cookies

    def load_cookies(self) -> None:
        """Load all cookies from the file given in the constructor."""
        if self._cookies.filename is None:
            return

        try:
            LOGGER.info("Loading old cookies from %s", self._cookies.filename)
            self._cookies.load(ignore_discard=True)
        except (FileNotFoundError, LoadError):
            LOGGER.warning(
                "No valid cookie file found at %s, continuing with no cookies",
                self._cookies.filename
            )

    def save_cookies(self, reason: Optional[str] = None) -> None:
        """Save the cookies in the file given in the constructor."""
        if self._cookies.filename is None:
            return

        if reason is None:
            LOGGER.info("Saving cookies")
        else:
            LOGGER.info("Saving cookies (%s)", reason)

        # TODO figure out why ignore_discard is set
        # TODO possibly catch a few more exceptions
        self._cookies.save(ignore_discard=True)

    def create_session(self) -> requests.Session:
        """Create a new session using the cookie jar."""
        sess = requests.Session()

        # From the requests docs: "All requests code should work out of the box
        # with externally provided instances of CookieJar, e.g. LWPCookieJar
        # and FileCookieJar."
        sess.cookies = self.cookies  # type: ignore

        return sess
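
A short sketch of how CookieJar plugs into requests (the file name is a placeholder):

    from pathlib import Path
    from PFERD.cookie_jar import CookieJar

    jar = CookieJar(Path("cookies.txt"))
    jar.load_cookies()              # tolerates a missing or unreadable file
    session = jar.create_session()  # a requests.Session backed by the LWPCookieJar
    # ... perform requests with `session` ...
    jar.save_cookies("after crawl")
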
PFERD/diva.py
@@ -1,169 +0,0 @@
| """ |  | ||||||
| Utility functions and a scraper/downloader for the KIT DIVA portal. |  | ||||||
| """ |  | ||||||
| import logging |  | ||||||
| import re |  | ||||||
| from dataclasses import dataclass |  | ||||||
| from pathlib import Path |  | ||||||
| from typing import Any, Callable, List, Optional |  | ||||||
|  |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from .errors import FatalException |  | ||||||
| from .logging import PrettyLogger |  | ||||||
| from .organizer import Organizer |  | ||||||
| from .tmp_dir import TmpDir |  | ||||||
| from .transform import Transformable |  | ||||||
| from .utils import stream_to_path |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @dataclass |  | ||||||
| class DivaDownloadInfo(Transformable): |  | ||||||
|     """ |  | ||||||
|     Information about a DIVA video |  | ||||||
|     """ |  | ||||||
|     url: str |  | ||||||
|  |  | ||||||
|  |  | ||||||
| DivaDownloadStrategy = Callable[[Organizer, DivaDownloadInfo], bool] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def diva_download_new(organizer: Organizer, info: DivaDownloadInfo) -> bool: |  | ||||||
|     """ |  | ||||||
|     Accepts only new files. |  | ||||||
|     """ |  | ||||||
|     resolved_file = organizer.resolve(info.path) |  | ||||||
|     if not resolved_file.exists(): |  | ||||||
|         return True |  | ||||||
|     PRETTY.ignored_file(info.path, "local file exists") |  | ||||||
|     return False |  | ||||||
|  |  | ||||||
|  |  | ||||||
class DivaPlaylistCrawler:
    # pylint: disable=too-few-public-methods
    """
    A crawler for DIVA playlists.
    """

    _PLAYLIST_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/detail/"
    _COLLECTION_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json"

    def __init__(self, playlist_id: str):
        self._id = playlist_id

    @classmethod
    def fetch_id(cls, playlist_link: str) -> str:
        """
        Fetches the ID for a playlist, given the base link
        (e.g. https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271).

        Raises a FatalException if the ID cannot be resolved.
        """
        match = re.match(r".+#/details/(.+)", playlist_link)
        if match is None:
            raise FatalException(
                "DIVA: Invalid playlist link format, could not extract details."
            )
        base_name = match.group(1)

        response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json")

        if response.status_code != 200:
            raise FatalException(
                f"DIVA: Got non-200 status code ({response.status_code}) "
                f"when requesting {response.url!r}!"
            )

        body = response.json()

        if body["error"]:
            raise FatalException(f"DIVA: Server returned error {body['error']!r}.")

        return body["result"]["collection"]["id"]

    def crawl(self) -> List[DivaDownloadInfo]:
        """
        Crawls the playlist given in the constructor.
        """
        response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id})
        if response.status_code != 200:
            raise FatalException(f"Server returned status {response.status_code}.")

        body = response.json()

        if body["error"]:
            raise FatalException(f"Server returned error {body['error']!r}.")

        result = body["result"]

        if result["resultCount"] > result["pageSize"]:
            PRETTY.warning("Did not receive all results, some will be missing")

        download_infos: List[DivaDownloadInfo] = []

        for video in result["resultList"]:
            title = video["title"]
            collection_title = self._follow_path(["collection", "title"], video)
            url = self._follow_path(
                ["resourceList", "derivateList", "mp4", "url"],
                video
            )

            if url and collection_title and title:
                path = Path(collection_title, title + ".mp4")
                download_infos.append(DivaDownloadInfo(path, url))
            else:
                PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}")

        return download_infos

    @staticmethod
    def _follow_path(path: List[str], obj: Any) -> Optional[Any]:
        """
        Follows a property path through an object, bailing at the first None.
        """
        current = obj
        for path_step in path:
            if path_step in current:
                current = current[path_step]
            else:
                return None
        return current


class DivaDownloader:
    """
    A downloader for DIVA videos.
    """

    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy):
        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._strategy = strategy
        self._session = requests.session()

    def download_all(self, infos: List[DivaDownloadInfo]) -> None:
        """
        Download multiple files one after the other.
        """
        for info in infos:
            self.download(info)

    def download(self, info: DivaDownloadInfo) -> None:
        """
        Download a single file.
        """
        if not self._strategy(self._organizer, info):
            self._organizer.mark(info.path)
            return

        with self._session.get(info.url, stream=True) as response:
            if response.status_code == 200:
                tmp_file = self._tmp_dir.new_path()
                stream_to_path(response, tmp_file, info.path.name)
                self._organizer.accept_file(tmp_file, info.path)
            else:
                PRETTY.warning(f"Could not download file, got response {response.status_code}")
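
How the DIVA pieces are meant to compose, as a hedged sketch (tmp_dir and organizer are assumed to be TmpDir and Organizer instances constructed elsewhere; the playlist link is the example from the fetch_id docstring above):

    playlist_id = DivaPlaylistCrawler.fetch_id(
        "https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271"
    )
    crawler = DivaPlaylistCrawler(playlist_id)
    downloader = DivaDownloader(tmp_dir, organizer, diva_download_new)
    downloader.download_all(crawler.crawl())
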
PFERD/download_summary.py
@@ -1,75 +0,0 @@
"""
Provides a summary that keeps track of new, modified, or deleted files.
"""
from pathlib import Path
from typing import List


def _mergeNoDuplicate(first: List[Path], second: List[Path]) -> List[Path]:
    tmp = list(set(first + second))
    tmp.sort(key=lambda x: str(x.resolve()))
    return tmp


class DownloadSummary:
    """
    Keeps track of all new, modified, or deleted files and provides a summary.
    """

    def __init__(self) -> None:
        self._new_files: List[Path] = []
        self._modified_files: List[Path] = []
        self._deleted_files: List[Path] = []

    @property
    def new_files(self) -> List[Path]:
        """
        Returns all new files.
        """
        return self._new_files.copy()

    @property
    def modified_files(self) -> List[Path]:
        """
        Returns all modified files.
        """
        return self._modified_files.copy()

    @property
    def deleted_files(self) -> List[Path]:
        """
        Returns all deleted files.
        """
        return self._deleted_files.copy()

    def merge(self, summary: 'DownloadSummary') -> None:
        """
        Merges ourselves with the passed summary. Modifies this object, but not the passed one.
        """
        self._new_files = _mergeNoDuplicate(self._new_files, summary.new_files)
        self._modified_files = _mergeNoDuplicate(self._modified_files, summary.modified_files)
        self._deleted_files = _mergeNoDuplicate(self._deleted_files, summary.deleted_files)

    def add_deleted_file(self, path: Path) -> None:
        """
        Registers a file as deleted.
        """
        self._deleted_files.append(path)

    def add_modified_file(self, path: Path) -> None:
        """
        Registers a file as changed.
        """
        self._modified_files.append(path)

    def add_new_file(self, path: Path) -> None:
        """
        Registers a file as new.
        """
        self._new_files.append(path)

    def has_updates(self) -> bool:
        """
        Returns whether this summary has any updates.
        """
        return bool(self._new_files or self._modified_files or self._deleted_files)
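
DownloadSummary in action, a minimal sketch (paths are placeholders):

    from pathlib import Path

    summary = DownloadSummary()
    summary.add_new_file(Path("Course/lecture01.pdf"))

    other = DownloadSummary()
    other.add_modified_file(Path("Course/sheet01.pdf"))

    summary.merge(other)          # per-category union, deduplicated and sorted
    assert summary.has_updates()
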
PFERD/downloaders.py
@@ -1,72 +0,0 @@
"""
General downloaders useful in many situations
"""

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

import requests
import requests.auth

from .organizer import Organizer
from .tmp_dir import TmpDir
from .transform import Transformable
from .utils import stream_to_path


@dataclass
class HttpDownloadInfo(Transformable):
    """
    This class describes a single file to be downloaded.
    """

    url: str
    parameters: Dict[str, Any] = field(default_factory=dict)


class HttpDownloader:
    """An HTTP downloader that can handle HTTP basic auth."""

    def __init__(
            self,
            tmp_dir: TmpDir,
            organizer: Organizer,
            username: Optional[str],
            password: Optional[str],
    ):
        """Create a new http downloader."""
        self._organizer = organizer
        self._tmp_dir = tmp_dir
        self._username = username
        self._password = password
        self._session = self._build_session()

    def _build_session(self) -> requests.Session:
        session = requests.Session()
        if self._username and self._password:
            session.auth = requests.auth.HTTPBasicAuth(
                self._username, self._password
            )
        return session

    def download_all(self, infos: List[HttpDownloadInfo]) -> None:
        """
        Download multiple files one after the other.
        """

        for info in infos:
            self.download(info)

    def download(self, info: HttpDownloadInfo) -> None:
        """
        Download a single file.
        """

        with self._session.get(info.url, params=info.parameters, stream=True) as response:
            if response.status_code == 200:
                tmp_file = self._tmp_dir.new_path()
                stream_to_path(response, tmp_file, info.path.name)
                self._organizer.accept_file(tmp_file, info.path)
            else:
                # TODO use proper exception
                raise Exception(f"Could not download file, got response {response.status_code}")
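
A sketch of HttpDownloader usage. The URL and path are placeholders, and the positional path argument is assumed to come from the inherited Transformable dataclass, as the info.path accesses above suggest:

    downloader = HttpDownloader(tmp_dir, organizer, username=None, password=None)
    downloader.download_all([
        HttpDownloadInfo(Path("slides/week1.pdf"), "https://example.com/week1.pdf"),
    ])
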
PFERD/errors.py
@@ -1,57 +0,0 @@
"""
An error logging decorator.
"""

import logging
from typing import Any, Callable, TypeVar, cast

from rich.console import Console

from .logging import PrettyLogger

LOGGER = logging.getLogger(__name__)
PRETTY = PrettyLogger(LOGGER)


class FatalException(Exception):
    """
    A fatal exception occurred. Recovery is not possible.
    """


TFun = TypeVar('TFun', bound=Callable[..., Any])


def swallow_and_print_errors(function: TFun) -> TFun:
    """
    Decorates a function, swallows all errors, logs them and returns None if one occurred.
    """
    def inner(*args: Any, **kwargs: Any) -> Any:
        # pylint: disable=broad-except
        try:
            return function(*args, **kwargs)
        except FatalException as error:
            PRETTY.error(str(error))
            return None
        except Exception:
            Console().print_exception()
            return None
    return cast(TFun, inner)


def retry_on_io_exception(max_retries: int, message: str) -> Callable[[TFun], TFun]:
    """
    Decorates a function and retries it on any IOError until the max retry count is hit.
    """
    def retry(function: TFun) -> TFun:
        def inner(*args: Any, **kwargs: Any) -> Any:
            for i in range(0, max_retries):
                # pylint: disable=broad-except
                try:
                    return function(*args, **kwargs)
                except IOError as error:
                    PRETTY.warning(f"Error during operation '{message}': {error}")
                    PRETTY.warning(
                        f"Retrying operation '{message}'. Remaining retries: {max_retries - 1 - i}")
        return cast(TFun, inner)
    return retry
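
The two decorators are designed to stack, with retries innermost and the catch-all outermost. A sketch:

    @swallow_and_print_errors
    @retry_on_io_exception(max_retries=3, message="downloading file")
    def fetch_files() -> None:
        ...  # work that may raise IOError (retried) or FatalException (logged)
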
PFERD/ilias/__init__.py
@@ -1,10 +0,0 @@
"""
Synchronizing files from ILIAS instances (https://www.ilias.de/).
"""

from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator
from .crawler import (IliasCrawler, IliasCrawlerEntry, IliasDirectoryFilter,
                      IliasElementType)
from .downloader import (IliasDownloader, IliasDownloadInfo,
                         IliasDownloadStrategy, download_everything,
                         download_modified_or_new)
PFERD/ilias/authenticators.py
@@ -1,138 +0,0 @@
"""
Authenticators that can obtain proper ILIAS session cookies.
"""

import abc
import logging
from typing import Optional

import bs4
import requests

from ..authenticators import TfaAuthenticator, UserPassAuthenticator
from ..utils import soupify

LOGGER = logging.getLogger(__name__)


class IliasAuthenticator(abc.ABC):
    # pylint: disable=too-few-public-methods

    """
    An authenticator that logs an existing requests session into an ILIAS
    account.
    """

    @abc.abstractmethod
    def authenticate(self, sess: requests.Session) -> None:
        """
        Log a requests session into this authenticator's ILIAS account.
        """


class KitShibbolethAuthenticator(IliasAuthenticator):
    # pylint: disable=too-few-public-methods

    """
    Authenticate via KIT's shibboleth system.
    """

    def __init__(self, authenticator: Optional[UserPassAuthenticator] = None) -> None:
        if authenticator:
            self._auth = authenticator
        else:
            self._auth = UserPassAuthenticator("KIT ILIAS Shibboleth")

        self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth")

    def authenticate(self, sess: requests.Session) -> None:
        """
        Performs the ILIAS Shibboleth authentication dance and saves the login
        cookies it receives.

        This function should only be called when it is detected that you're
        not logged in. The cookies obtained should be good for a few minutes,
        maybe even an hour or two.
        """

        # Equivalent: Click on "Mit KIT-Account anmelden" button in
        # https://ilias.studium.kit.edu/login.php
        LOGGER.debug("Begin authentication process with ILIAS")
        url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
        data = {
            "sendLogin": "1",
            "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
            "target": "/shib_login.php",
            "home_organization_selection": "Mit KIT-Account anmelden",
        }
        soup = soupify(sess.post(url, data=data))

        # Attempt to login using credentials, if necessary
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
            form = soup.find("form", {"class": "full content", "method": "post"})
            action = form["action"]

            csrf_token = form.find("input", {"name": "csrf_token"})["value"]

            # Equivalent: Enter credentials in
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
            LOGGER.debug("Attempt to log in to Shibboleth using credentials")
            url = "https://idp.scc.kit.edu" + action
            data = {
                "_eventId_proceed": "",
                "j_username": self._auth.username,
                "j_password": self._auth.password,
                "csrf_token": csrf_token
            }
            soup = soupify(sess.post(url, data=data))

            if self._tfa_required(soup):
                soup = self._authenticate_tfa(sess, soup)

            if not self._login_successful(soup):
                print("Incorrect credentials.")
                self._auth.invalidate_credentials()

        # Equivalent: Being redirected via JS automatically
        # (or clicking "Continue" if you have JS disabled)
        LOGGER.debug("Redirect back to ILIAS with login information")
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
        data = {  # using the info obtained in the while loop above
            "RelayState": relay_state["value"],
            "SAMLResponse": saml_response["value"],
        }
        sess.post(url, data=data)

    def _authenticate_tfa(
            self,
            session: requests.Session,
            soup: bs4.BeautifulSoup
    ) -> bs4.BeautifulSoup:
        # Searching the form here so that this fails before asking for
        # credentials rather than after asking.
        form = soup.find("form", {"method": "post"})
        action = form["action"]

        # Equivalent: Enter token in
        # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
        LOGGER.debug("Attempt to log in to Shibboleth with TFA token")
        url = "https://idp.scc.kit.edu" + action
        data = {
            "_eventId_proceed": "",
            "j_tokenNumber": self._tfa_auth.get_token()
        }
        return soupify(session.post(url, data=data))

    @staticmethod
    def _login_successful(soup: bs4.BeautifulSoup) -> bool:
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        return relay_state is not None and saml_response is not None

    @staticmethod
    def _tfa_required(soup: bs4.BeautifulSoup) -> bool:
        return soup.find(id="j_tokenNumber") is not None
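
Tying it together, a hedged sketch of logging a session in (credentials and TFA token are prompted interactively):

    import requests

    session = requests.Session()
    authenticator = KitShibbolethAuthenticator()
    authenticator.authenticate(session)  # session now carries ILIAS login cookies
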
PFERD/ilias/crawler.py
@@ -1,684 +0,0 @@
"""
Contains an ILIAS crawler alongside helper functions.
"""

import datetime
import json
import logging
import re
from enum import Enum
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
                          urlunsplit)

import bs4
import requests

from ..errors import FatalException, retry_on_io_exception
from ..logging import PrettyLogger
from ..utils import soupify
from .authenticators import IliasAuthenticator
from .date_demangler import demangle_date
from .downloader import IliasDownloadInfo

LOGGER = logging.getLogger(__name__)
PRETTY = PrettyLogger(LOGGER)


def _sanitize_path_name(name: str) -> str:
    return name.replace("/", "-").replace("\\", "-")


class IliasElementType(Enum):
    """
    The type of an ILIAS element.
    """
    REGULAR_FOLDER = "REGULAR_FOLDER"
    VIDEO_FOLDER = "VIDEO_FOLDER"
    EXERCISE_FOLDER = "EXERCISE_FOLDER"
    REGULAR_FILE = "REGULAR_FILE"
    VIDEO_FILE = "VIDEO_FILE"
    FORUM = "FORUM"
    MEETING = "MEETING"
    EXTERNAL_LINK = "EXTERNAL_LINK"

    def is_folder(self) -> bool:
        """
        Returns whether this type is some kind of folder.
        """
        return "FOLDER" in str(self.name)


IliasDirectoryFilter = Callable[[Path, IliasElementType], bool]


class IliasCrawlerEntry:
    # pylint: disable=too-few-public-methods
    """
    An ILIAS crawler entry used internally to find, catalogue and recursively crawl elements.
    """

    def __init__(
            self,
            path: Path,
            url: Union[str, Callable[[], Optional[str]]],
            entry_type: IliasElementType,
            modification_date: Optional[datetime.datetime]
    ):
        self.path = path
        if isinstance(url, str):
            str_url = url
            self.url: Callable[[], Optional[str]] = lambda: str_url
        else:
            self.url = url
        self.entry_type = entry_type
        self.modification_date = modification_date

    def to_download_info(self) -> Optional[IliasDownloadInfo]:
        """
        Converts this crawler entry to an IliasDownloadInfo, if possible.
        This method will only succeed for *File* types.
        """
        if self.entry_type in [IliasElementType.REGULAR_FILE, IliasElementType.VIDEO_FILE]:
            return IliasDownloadInfo(self.path, self.url, self.modification_date)
        return None


class IliasCrawler:
    # pylint: disable=too-few-public-methods

    """
    A crawler for ILIAS.
    """

    # pylint: disable=too-many-arguments
    def __init__(
            self,
            base_url: str,
            session: requests.Session,
            authenticator: IliasAuthenticator,
            dir_filter: IliasDirectoryFilter
    ):
        """
        Create a new ILIAS crawler.
        """

        self._base_url = base_url
        self._session = session
        self._authenticator = authenticator
        self.dir_filter = dir_filter

    @staticmethod
    def _url_set_query_param(url: str, param: str, value: str) -> str:
        """
        Set a query parameter in a URL, overwriting existing ones with the same name.
        """
        scheme, netloc, path, query, fragment = urlsplit(url)
        query_parameters = parse_qs(query)
        query_parameters[param] = [value]
        new_query_string = urlencode(query_parameters, doseq=True)

        return urlunsplit((scheme, netloc, path, new_query_string, fragment))
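
    # Illustration of _url_set_query_param with hypothetical values:
    #   _url_set_query_param("https://ilias.example/goto.php?target=crs_1", "target", "crs_2")
    #   == "https://ilias.example/goto.php?target=crs_2"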
|  |  | ||||||
|     def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]: |  | ||||||
|         """ |  | ||||||
|         Crawls a given url *and all reachable elements in it*. |  | ||||||
|  |  | ||||||
|         Args: |  | ||||||
|             url {str} -- the *full* url to crawl |  | ||||||
|         """ |  | ||||||
|         start_entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), url) |  | ||||||
|         return self._iterate_entries_to_download_infos(start_entries) |  | ||||||
|  |  | ||||||
|     def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]: |  | ||||||
|         """ |  | ||||||
|         Starts the crawl process for a course, yielding a list of elements to (potentially) |  | ||||||
|         download. |  | ||||||
|  |  | ||||||
|         Arguments: |  | ||||||
|             course_id {str} -- the course id |  | ||||||
|  |  | ||||||
|         Raises: |  | ||||||
|             FatalException: if an unrecoverable error occurs or the course id is not valid |  | ||||||
|         """ |  | ||||||
|         # Start crawling at the given course |  | ||||||
|         root_url = self._url_set_query_param( |  | ||||||
|             self._base_url + "/goto.php", "target", f"crs_{course_id}" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         if not self._is_course_id_valid(root_url, course_id): |  | ||||||
|             raise FatalException( |  | ||||||
|                 "Invalid course id? I didn't find anything looking like a course!" |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         # And treat it as a folder |  | ||||||
|         entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), root_url) |  | ||||||
|         return self._iterate_entries_to_download_infos(entries) |  | ||||||
|  |  | ||||||
|     def _is_course_id_valid(self, root_url: str, course_id: str) -> bool: |  | ||||||
|         response: requests.Response = self._session.get(root_url) |  | ||||||
|         # We were redirected ==> Non-existant ID |  | ||||||
|         if course_id not in response.url: |  | ||||||
|             return False |  | ||||||
|  |  | ||||||
|         link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link") |  | ||||||
|         if not link_element: |  | ||||||
|             return False |  | ||||||
|         # It wasn't a course but a category list, forum, etc. |  | ||||||
|         return "crs_" in link_element.get("value") |  | ||||||
|  |  | ||||||
|     def find_course_name(self, course_id: str) -> Optional[str]: |  | ||||||
|         """ |  | ||||||
|         Returns the name of a given course. None if it is not a valid course |  | ||||||
|         or it could not be found. |  | ||||||
|         """ |  | ||||||
|         course_url = self._url_set_query_param( |  | ||||||
|             self._base_url + "/goto.php", "target", f"crs_{course_id}" |  | ||||||
|         ) |  | ||||||
|         return self.find_element_name(course_url) |  | ||||||
|  |  | ||||||
|     def find_element_name(self, url: str) -> Optional[str]: |  | ||||||
|         """ |  | ||||||
|         Returns the name of the element at the given URL, if it can find one. |  | ||||||
|         """ |  | ||||||
|         focus_element: bs4.Tag = self._get_page(url, {}).find(id="il_mhead_t_focus") |  | ||||||
|         if not focus_element: |  | ||||||
|             return None |  | ||||||
|         return focus_element.text |  | ||||||
|  |  | ||||||
|     def crawl_personal_desktop(self) -> List[IliasDownloadInfo]: |  | ||||||
|         """ |  | ||||||
|         Crawls the ILIAS personal desktop (and every subelements that can be reached from there). |  | ||||||
|  |  | ||||||
|         Raises: |  | ||||||
|             FatalException: if an unrecoverable error occurs |  | ||||||
|         """ |  | ||||||
|         entries: List[IliasCrawlerEntry] = self._crawl_folder( |  | ||||||
|             Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI" |  | ||||||
|         ) |  | ||||||
|         return self._iterate_entries_to_download_infos(entries) |  | ||||||
|  |  | ||||||
    def _iterate_entries_to_download_infos(
            self,
            entries: List[IliasCrawlerEntry]
    ) -> List[IliasDownloadInfo]:
        result: List[IliasDownloadInfo] = []
        entries_to_process: List[IliasCrawlerEntry] = entries.copy()
        while len(entries_to_process) > 0:
            entry = entries_to_process.pop()

            if entry.entry_type == IliasElementType.EXTERNAL_LINK:
                PRETTY.not_searching(entry.path, "external link")
                continue
            if entry.entry_type == IliasElementType.FORUM:
                PRETTY.not_searching(entry.path, "forum")
                continue

            if entry.entry_type.is_folder() and not self.dir_filter(entry.path, entry.entry_type):
                PRETTY.not_searching(entry.path, "user filter")
                continue

            download_info = entry.to_download_info()
            if download_info is not None:
                result.append(download_info)
                continue

            url = entry.url()

            if url is None:
                PRETTY.warning(f"Could not find url for {str(entry.path)!r}, skipping it")
                continue

            PRETTY.searching(entry.path)

            if entry.entry_type == IliasElementType.EXERCISE_FOLDER:
                entries_to_process += self._crawl_exercises(entry.path, url)
                continue
            if entry.entry_type == IliasElementType.REGULAR_FOLDER:
                entries_to_process += self._crawl_folder(entry.path, url)
                continue
            if entry.entry_type == IliasElementType.VIDEO_FOLDER:
                entries_to_process += self._crawl_video_directory(entry.path, url)
                continue

            PRETTY.warning(f"Unknown type: {entry.entry_type}!")

        return result

    def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]:
        """
        Crawl all files in a folder-like element.
        """
        soup = self._get_page(url, {})

        if soup.find(id="headerimage"):
            element: bs4.Tag = soup.find(id="headerimage")
            if "opencast" in element.attrs["src"].lower():
                PRETTY.warning(f"Switched to crawling a video at {folder_path}")
                if not self.dir_filter(folder_path, IliasElementType.VIDEO_FOLDER):
                    PRETTY.not_searching(folder_path, "user filter")
                    return []
                return self._crawl_video_directory(folder_path, url)

        result: List[IliasCrawlerEntry] = []

        # Fetch all links and throw them to the general interpreter
        links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle")
        for link in links:
            abs_url = self._abs_url_from_link(link)
            element_path = Path(folder_path, _sanitize_path_name(link.getText().strip()))
            element_type = self._find_type_from_link(element_path, link, abs_url)

            if element_type == IliasElementType.REGULAR_FILE:
                result += self._crawl_file(folder_path, link, abs_url)
            elif element_type == IliasElementType.MEETING:
                meeting_name = str(element_path.name)
                date_portion_str = meeting_name.split(" - ")[0]
                date_portion = demangle_date(date_portion_str)

                if not date_portion:
                    result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)]
                    continue

                rest_of_name = meeting_name
                if rest_of_name.startswith(date_portion_str):
                    rest_of_name = rest_of_name[len(date_portion_str):]

                new_name = datetime.datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") \
                    + rest_of_name
                new_path = Path(folder_path, _sanitize_path_name(new_name))
                result += [
                    IliasCrawlerEntry(new_path, abs_url, IliasElementType.REGULAR_FOLDER, None)
                ]
            elif element_type is not None:
                result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)]
            else:
                PRETTY.warning(f"Found element without a type at {str(element_path)!r}")

        return result

    def _abs_url_from_link(self, link_tag: bs4.Tag) -> str:
        """
        Create an absolute url from an <a> tag.
        """
        return urljoin(self._base_url, link_tag.get("href"))

    @staticmethod
    def _find_type_from_link(
            path: Path,
            link_element: bs4.Tag,
            url: str
    ) -> Optional[IliasElementType]:
        """
        Decides which sub crawler to use for a given top level element.
        """
        parsed_url = urlparse(url)
        LOGGER.debug("Parsed url: %r", parsed_url)

        # File URLs contain "target=file_"
        if "target=file_" in parsed_url.query:
            return IliasElementType.REGULAR_FILE

        # Skip forums
        if "cmd=showThreads" in parsed_url.query:
            return IliasElementType.FORUM

        # Everything with a ref_id can *probably* be opened to reveal nested things:
        # video groups, directories, exercises, etc.
        if "ref_id=" in parsed_url.query:
            return IliasCrawler._find_type_from_folder_like(link_element, url)

        PRETTY.warning(
            "Got unknown element type in switch. I am not sure what horror I found on the"
            f" ILIAS page. The element was at {str(path)!r} and it is {link_element!r})"
        )
        return None

    @staticmethod
    def _find_type_from_folder_like(link_element: bs4.Tag, url: str) -> Optional[IliasElementType]:
        """
        Try crawling something that looks like a folder.
        """
        # pylint: disable=too-many-return-statements

        found_parent: Optional[bs4.Tag] = None

        # We look for the outer div of our inner link, to find information around it
        # (mostly the icon)
        for parent in link_element.parents:
            if "ilContainerListItemOuter" in parent["class"]:
                found_parent = parent
                break

        if found_parent is None:
            PRETTY.warning(f"Could not find element icon for {url!r}")
            return None

        # Find the small descriptive icon to figure out the type
        img_tag: Optional[bs4.Tag] = found_parent.select_one("img.ilListItemIcon")

        if img_tag is None:
            PRETTY.warning(f"Could not find image tag for {url!r}")
            return None

        if "opencast" in str(img_tag["alt"]).lower():
            return IliasElementType.VIDEO_FOLDER

        if str(img_tag["src"]).endswith("icon_exc.svg"):
            return IliasElementType.EXERCISE_FOLDER

        if str(img_tag["src"]).endswith("icon_webr.svg"):
            return IliasElementType.EXTERNAL_LINK

        if str(img_tag["src"]).endswith("frm.svg"):
            return IliasElementType.FORUM

        if str(img_tag["src"]).endswith("sess.svg"):
            return IliasElementType.MEETING

        return IliasElementType.REGULAR_FOLDER

|     @staticmethod |  | ||||||
|     def _crawl_file(path: Path, link_element: bs4.Tag, url: str) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawls a file. |  | ||||||
|         """ |  | ||||||
|         # Files have a list of properties (type, modification date, size, etc.) |  | ||||||
|         # In a series of divs. |  | ||||||
|         # Find the parent containing all those divs, so we can filter our what we need |  | ||||||
|         properties_parent: bs4.Tag = link_element.findParent( |  | ||||||
|             "div", {"class": lambda x: "il_ContainerListItem" in x} |  | ||||||
|         ).select_one(".il_ItemProperties") |  | ||||||
|         # The first one is always the filetype |  | ||||||
|         file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() |  | ||||||
|  |  | ||||||
|         # The rest does not have a stable order. Grab the whole text and reg-ex the date |  | ||||||
|         # out of it |  | ||||||
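|         # Illustrative strings the regex below should match (not exhaustive): |  | ||||||
|         #   "07. Jun 2020, 13:46", "Gestern, 08:15", "Today, 19:00" |  | ||||||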
|         all_properties_text = properties_parent.getText().strip() |  | ||||||
|         modification_date_match = re.search( |  | ||||||
|             r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)", |  | ||||||
|             all_properties_text |  | ||||||
|         ) |  | ||||||
|         if modification_date_match is None: |  | ||||||
|             modification_date = None |  | ||||||
|             PRETTY.warning(f"Could not extract start date from {all_properties_text!r}") |  | ||||||
|         else: |  | ||||||
|             modification_date_str = modification_date_match.group(1) |  | ||||||
|             modification_date = demangle_date(modification_date_str) |  | ||||||
|  |  | ||||||
|         # Grab the name from the link text |  | ||||||
|         name = _sanitize_path_name(link_element.getText()) |  | ||||||
|         full_path = Path(path, name + "." + file_type) |  | ||||||
|  |  | ||||||
|         return [ |  | ||||||
|             IliasCrawlerEntry(full_path, url, IliasElementType.REGULAR_FILE, modification_date) |  | ||||||
|         ] |  | ||||||
|  |  | ||||||
|     def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawl the video overview site. |  | ||||||
|         """ |  | ||||||
|         initial_soup = self._get_page(url, {}) |  | ||||||
|  |  | ||||||
|         # The page is actually empty but contains a much-needed token in the link below. |  | ||||||
|         # That token can be used to fetch the *actual* video listing |  | ||||||
|         content_link: bs4.Tag = initial_soup.select_one("#tab_series a") |  | ||||||
|         # Fetch the actual video listing. The given parameters return all videos (max. 800) |  | ||||||
|         # in a standalone HTML page |  | ||||||
|         video_list_soup = self._get_page( |  | ||||||
|             self._abs_url_from_link(content_link), |  | ||||||
|             {"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         # If we find a page selection tab, we probably need to respect pagination |  | ||||||
|         if self._is_paginated_video_page(video_list_soup): |  | ||||||
|             second_stage_url = self._abs_url_from_link(content_link) |  | ||||||
|  |  | ||||||
|             return self._crawl_paginated_video_directory( |  | ||||||
|                 video_dir_path, video_list_soup, second_stage_url |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         return self._crawl_video_directory_second_stage(video_dir_path, video_list_soup) |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _is_paginated_video_page(soup: bs4.BeautifulSoup) -> bool: |  | ||||||
|         return soup.find(id=re.compile(r"tab_page_sel.+")) is not None |  | ||||||
|  |  | ||||||
|     def _crawl_paginated_video_directory( |  | ||||||
|             self, |  | ||||||
|             video_dir_path: Path, |  | ||||||
|             paged_video_list_soup: bs4.BeautifulSoup, |  | ||||||
|             second_stage_url: str |  | ||||||
|     ) -> List[IliasCrawlerEntry]: |  | ||||||
|         LOGGER.info("Found paginated video page, trying 800 elements") |  | ||||||
|  |  | ||||||
|         # Try to find the table id. This can be used to build the query parameter indicating |  | ||||||
|         # you want 800 elements |  | ||||||
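|         # For a (hypothetical) table id "tbl_xoct_abc123", the query parameter built |  | ||||||
|         # below becomes "tbl_xoct_abc123_trows": 800 |  | ||||||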
|  |  | ||||||
|         table_element: bs4.Tag = paged_video_list_soup.find( |  | ||||||
|             name="table", id=re.compile(r"tbl_xoct_.+") |  | ||||||
|         ) |  | ||||||
|         if table_element is None: |  | ||||||
|             PRETTY.warning( |  | ||||||
|                 "Could not increase elements per page (table not found)." |  | ||||||
|                 " Some might not be crawled!" |  | ||||||
|             ) |  | ||||||
|             return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup) |  | ||||||
|  |  | ||||||
|         match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) |  | ||||||
|         if match is None: |  | ||||||
|             PRETTY.warning( |  | ||||||
|                 "Could not increase elements per page (table id not found)." |  | ||||||
|                 " Some might not be crawled!" |  | ||||||
|             ) |  | ||||||
|             return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup) |  | ||||||
|         table_id = match.group(1) |  | ||||||
|  |  | ||||||
|         extended_video_page = self._get_page( |  | ||||||
|             second_stage_url, |  | ||||||
|             {f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         if self._is_paginated_video_page(extended_video_page): |  | ||||||
|             PRETTY.warning( |  | ||||||
|                 "800 elements do not seem to be enough (or I failed to fetch that many)." |  | ||||||
|                 " I will miss elements." |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         return self._crawl_video_directory_second_stage(video_dir_path, extended_video_page) |  | ||||||
|  |  | ||||||
|     def _crawl_video_directory_second_stage( |  | ||||||
|             self, |  | ||||||
|             video_dir_path: Path, |  | ||||||
|             video_list_soup: bs4.BeautifulSoup |  | ||||||
|     ) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawls the "second stage" video page. This page contains the actual video urls. |  | ||||||
|         """ |  | ||||||
|         direct_download_links: List[bs4.Tag] = video_list_soup.findAll( |  | ||||||
|             name="a", text=re.compile(r"\s*Download\s*") |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         # Video start links are marked with an "Abspielen" link |  | ||||||
|         video_links: List[bs4.Tag] = video_list_soup.findAll( |  | ||||||
|             name="a", text=re.compile(r"\s*Abspielen\s*") |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         results: List[IliasCrawlerEntry] = [] |  | ||||||
|  |  | ||||||
|         # We can download everything directly! |  | ||||||
|         # FIXME: Sadly the download button is currently broken, so never do that |  | ||||||
|         if False and len(direct_download_links) == len(video_links): |  | ||||||
|             for link in direct_download_links: |  | ||||||
|                 results += self._crawl_single_video(video_dir_path, link, True) |  | ||||||
|         else: |  | ||||||
|             for link in video_links: |  | ||||||
|                 results += self._crawl_single_video(video_dir_path, link, False) |  | ||||||
|  |  | ||||||
|         return results |  | ||||||
|  |  | ||||||
|     def _crawl_single_video( |  | ||||||
|             self, |  | ||||||
|             parent_path: Path, |  | ||||||
|             link: bs4.Tag, |  | ||||||
|             direct_download: bool |  | ||||||
|     ) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawl a single video based on its "Abspielen" link from the video listing. |  | ||||||
|         """ |  | ||||||
|         # The link is part of a table with multiple columns, describing metadata. |  | ||||||
|         # 6th child (1 indexed) is the modification time string |  | ||||||
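|         # e.g. "10.06.2020 - 13:46" (illustrative; it must match the strptime format below) |  | ||||||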
|         modification_string = link.parent.parent.parent.select_one( |  | ||||||
|             "td.std:nth-child(6)" |  | ||||||
|         ).getText().strip() |  | ||||||
|         modification_time = datetime.datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") |  | ||||||
|  |  | ||||||
|         title = link.parent.parent.parent.select_one( |  | ||||||
|             "td.std:nth-child(3)" |  | ||||||
|         ).getText().strip() |  | ||||||
|         title += ".mp4" |  | ||||||
|  |  | ||||||
|         video_path: Path = Path(parent_path, _sanitize_path_name(title)) |  | ||||||
|  |  | ||||||
|         video_url = self._abs_url_from_link(link) |  | ||||||
|  |  | ||||||
|         # The video has a direct download button we can use instead |  | ||||||
|         if direct_download: |  | ||||||
|             LOGGER.debug("Using direct download for video %r", str(video_path)) |  | ||||||
|             return [IliasCrawlerEntry( |  | ||||||
|                 video_path, video_url, IliasElementType.VIDEO_FILE, modification_time |  | ||||||
|             )] |  | ||||||
|  |  | ||||||
|         return [IliasCrawlerEntry( |  | ||||||
|             video_path, |  | ||||||
|             self._crawl_video_url_from_play_link(video_url), |  | ||||||
|             IliasElementType.VIDEO_FILE, |  | ||||||
|             modification_time |  | ||||||
|         )] |  | ||||||
|  |  | ||||||
|     def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Optional[str]]: |  | ||||||
|         def inner() -> Optional[str]: |  | ||||||
|             # Fetch the actual video page. This is a small wrapper page initializing a javascript |  | ||||||
|             # player. Sadly we cannot execute that JS. The actual video stream url is nowhere |  | ||||||
|             # on the page, but defined in a JS object inside a script tag, passed to the player |  | ||||||
|             # library. |  | ||||||
|             # We do the impossible and RegEx the stream JSON object out of the page's HTML source |  | ||||||
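|             # The matched JSON is assumed to look roughly like this (trimmed): |  | ||||||
|             #   {"streams": [{"sources": {"mp4": [{"src": "https://...", ...}, ...]}}], ...} |  | ||||||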
|             video_page_soup = soupify(self._session.get(play_url)) |  | ||||||
|             regex: re.Pattern = re.compile( |  | ||||||
|                 r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE |  | ||||||
|             ) |  | ||||||
|             json_match = regex.search(str(video_page_soup)) |  | ||||||
|  |  | ||||||
|             if json_match is None: |  | ||||||
|                 PRETTY.warning(f"Could not find json stream info for {play_url!r}") |  | ||||||
|                 return None |  | ||||||
|             json_str = json_match.group(1) |  | ||||||
|  |  | ||||||
|             # parse it |  | ||||||
|             json_object = json.loads(json_str) |  | ||||||
|             # and fetch the video url! |  | ||||||
|             video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"] |  | ||||||
|             return video_url |  | ||||||
|         return inner |  | ||||||
|  |  | ||||||
|     def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasCrawlerEntry]: |  | ||||||
|         """ |  | ||||||
|         Crawl files offered for download in exercises. |  | ||||||
|         """ |  | ||||||
|         soup = self._get_page(url, {}) |  | ||||||
|  |  | ||||||
|         results: List[IliasCrawlerEntry] = [] |  | ||||||
|  |  | ||||||
|         # Each assignment is in an accordion container |  | ||||||
|         assignment_containers: List[bs4.Tag] = soup.select(".il_VAccordionInnerContainer") |  | ||||||
|  |  | ||||||
|         for container in assignment_containers: |  | ||||||
|             # Fetch the container name out of the header to use it in the path |  | ||||||
|             container_name = container.select_one(".ilAssignmentHeader").getText().strip() |  | ||||||
|             # Find all download links in the container (this will contain all the files) |  | ||||||
|             files: List[bs4.Tag] = container.findAll( |  | ||||||
|                 name="a", |  | ||||||
|                 # download links contain the given command class |  | ||||||
|                 attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, |  | ||||||
|                 text="Download" |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|             LOGGER.debug("Found exercise container %r", container_name) |  | ||||||
|  |  | ||||||
|             # Grab each file now that we have the links |  | ||||||
|             for file_link in files: |  | ||||||
|                 # Two divs, side by side. Left is the name, right is the link ==> get left |  | ||||||
|                 # sibling |  | ||||||
|                 file_name = file_link.parent.findPrevious(name="div").getText().strip() |  | ||||||
|                 file_name = _sanitize_path_name(file_name) |  | ||||||
|                 url = self._abs_url_from_link(file_link) |  | ||||||
|  |  | ||||||
|                 LOGGER.debug("Found file %r at %r", file_name, url) |  | ||||||
|  |  | ||||||
|                 results.append(IliasCrawlerEntry( |  | ||||||
|                     Path(element_path, container_name, file_name), |  | ||||||
|                     url, |  | ||||||
|                     IliasElementType.REGULAR_FILE, |  | ||||||
|                     None  # We do not have any timestamp |  | ||||||
|                 )) |  | ||||||
|  |  | ||||||
|         return results |  | ||||||
|  |  | ||||||
|     @retry_on_io_exception(3, "fetching webpage") |  | ||||||
|     def _get_page(self, url: str, params: Dict[str, Any], |  | ||||||
|                   retry_count: int = 0) -> bs4.BeautifulSoup: |  | ||||||
|         """ |  | ||||||
|         Fetches a page from ILIAS, authenticating when needed. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         if retry_count >= 4: |  | ||||||
|             raise FatalException("Could not get a proper page after 4 tries. " |  | ||||||
|                                  "Maybe your URL is wrong, authentication fails continuously, " |  | ||||||
|                                  "your ILIAS connection is spotty or ILIAS is not well.") |  | ||||||
|  |  | ||||||
|         LOGGER.debug("Fetching %r", url) |  | ||||||
|  |  | ||||||
|         response = self._session.get(url, params=params) |  | ||||||
|         content_type = response.headers["content-type"] |  | ||||||
|  |  | ||||||
|         if not content_type.startswith("text/html"): |  | ||||||
|             raise FatalException( |  | ||||||
|                 f"Invalid content type {content_type} when crawling ilias page" |  | ||||||
|                 " {url!r} with {params!r}" |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         soup = soupify(response) |  | ||||||
|  |  | ||||||
|         if self._is_logged_in(soup): |  | ||||||
|             return soup |  | ||||||
|  |  | ||||||
|         LOGGER.info("Not authenticated, changing that...") |  | ||||||
|  |  | ||||||
|         self._authenticator.authenticate(self._session) |  | ||||||
|  |  | ||||||
|         return self._get_page(url, params, retry_count + 1) |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _is_logged_in(soup: bs4.BeautifulSoup) -> bool: |  | ||||||
|         # Normal ILIAS pages |  | ||||||
|         userlog = soup.find("li", {"id": "userlog"}) |  | ||||||
|         if userlog is not None: |  | ||||||
|             LOGGER.debug("Auth: Found #userlog") |  | ||||||
|             return True |  | ||||||
|         # Video listing embeds do not have complete ILIAS html. Try to match them by |  | ||||||
|         # their video listing table |  | ||||||
|         video_table = soup.find( |  | ||||||
|             recursive=True, |  | ||||||
|             name="table", |  | ||||||
|             attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} |  | ||||||
|         ) |  | ||||||
|         if video_table is not None: |  | ||||||
|             LOGGER.debug("Auth: Found #tbl_xoct.+") |  | ||||||
|             return True |  | ||||||
|         # The individual video player wrapper page has nothing of the above. |  | ||||||
|         # Match it by its playerContainer. |  | ||||||
|         if soup.select_one("#playerContainer") is not None: |  | ||||||
|             LOGGER.debug("Auth: Found #playerContainer") |  | ||||||
|             return True |  | ||||||
|         return False |  | ||||||
| @@ -1,51 +0,0 @@ | |||||||
| """ |  | ||||||
| Helper methods to demangle an ILIAS date. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import datetime |  | ||||||
| import locale |  | ||||||
| import logging |  | ||||||
| import re |  | ||||||
| from typing import Optional |  | ||||||
|  |  | ||||||
| from ..logging import PrettyLogger |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def demangle_date(date: str) -> Optional[datetime.datetime]: |  | ||||||
|     """ |  | ||||||
|     Demangle a given date in one of the following formats: |  | ||||||
|     "Gestern, HH:MM" |  | ||||||
|     "Heute, HH:MM" |  | ||||||
|     "Morgen, HH:MM" |  | ||||||
|     "dd. mon yyyy, HH:MM |  | ||||||
|     """ |  | ||||||
|     saved = locale.setlocale(locale.LC_ALL) |  | ||||||
|     try: |  | ||||||
|         try: |  | ||||||
|             locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8') |  | ||||||
|         except locale.Error: |  | ||||||
|             PRETTY.warning( |  | ||||||
|                 "Could not set language to german. Assuming you use english everywhere." |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         date = re.sub(r"\s+", " ", date) |  | ||||||
|         date = re.sub("Gestern|Yesterday", _yesterday().strftime("%d. %b %Y"), date, re.I) |  | ||||||
|         date = re.sub("Heute|Today", datetime.date.today().strftime("%d. %b %Y"), date, re.I) |  | ||||||
|         date = re.sub("Morgen|Tomorrow", _tomorrow().strftime("%d. %b %Y"), date, re.I) |  | ||||||
|         return datetime.datetime.strptime(date, "%d. %b %Y, %H:%M") |  | ||||||
|     except ValueError: |  | ||||||
|         PRETTY.warning(f"Could not parse date {date!r}") |  | ||||||
|         return None |  | ||||||
|     finally: |  | ||||||
|         locale.setlocale(locale.LC_ALL, saved) |  | ||||||
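|  |  | ||||||
|  |  | ||||||
| # Illustrative behaviour (assuming today is 2020-06-10 and a German locale is available): |  | ||||||
| #   demangle_date("Heute, 13:37")        -> datetime.datetime(2020, 6, 10, 13, 37) |  | ||||||
| #   demangle_date("07. Jun 2020, 13:37") -> datetime.datetime(2020, 6, 7, 13, 37) |  | ||||||
| #   demangle_date("gibberish")           -> None (with a warning) |  | ||||||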
|  |  | ||||||
|  |  | ||||||
| def _yesterday() -> datetime.date: |  | ||||||
|     return datetime.date.today() - datetime.timedelta(days=1) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _tomorrow() -> datetime.date: |  | ||||||
|     return datetime.date.today() + datetime.timedelta(days=1) |  | ||||||
| @@ -1,173 +0,0 @@ | |||||||
| """Contains a downloader for ILIAS.""" |  | ||||||
|  |  | ||||||
| import datetime |  | ||||||
| import logging |  | ||||||
| import math |  | ||||||
| import os |  | ||||||
| from pathlib import Path, PurePath |  | ||||||
| from typing import Callable, List, Optional, Union |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from ..errors import retry_on_io_exception |  | ||||||
| from ..logging import PrettyLogger |  | ||||||
| from ..organizer import Organizer |  | ||||||
| from ..tmp_dir import TmpDir |  | ||||||
| from ..transform import Transformable |  | ||||||
| from ..utils import soupify, stream_to_path |  | ||||||
| from .authenticators import IliasAuthenticator |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ContentTypeException(Exception): |  | ||||||
|     """Thrown when the content type of the ilias element can not be handled.""" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class IliasDownloadInfo(Transformable): |  | ||||||
|     """ |  | ||||||
|     This class describes a single file to be downloaded. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__( |  | ||||||
|             self, |  | ||||||
|             path: PurePath, |  | ||||||
|             url: Union[str, Callable[[], Optional[str]]], |  | ||||||
|             modification_date: Optional[datetime.datetime] |  | ||||||
|     ): |  | ||||||
|         super().__init__(path) |  | ||||||
|         if isinstance(url, str): |  | ||||||
|             string_url = url |  | ||||||
|             self.url: Callable[[], Optional[str]] = lambda: string_url |  | ||||||
|         else: |  | ||||||
|             self.url = url |  | ||||||
|         self.modification_date = modification_date |  | ||||||
|  |  | ||||||
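| # Both forms are equivalent from the caller's perspective (illustrative): |  | ||||||
| #   IliasDownloadInfo(PurePath("a.pdf"), "https://...", None) |  | ||||||
| #   IliasDownloadInfo(PurePath("a.pdf"), lambda: "https://...", None) |  | ||||||
| # The callable variant lets crawlers defer resolving the real URL until download time. |  | ||||||
|  |  | ||||||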
|  |  | ||||||
| IliasDownloadStrategy = Callable[[Organizer, IliasDownloadInfo], bool] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def download_everything(organizer: Organizer, info: IliasDownloadInfo) -> bool: |  | ||||||
|     # pylint: disable=unused-argument |  | ||||||
|     """ |  | ||||||
|     Accepts everything. |  | ||||||
|     """ |  | ||||||
|     return True |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def download_modified_or_new(organizer: Organizer, info: IliasDownloadInfo) -> bool: |  | ||||||
|     """ |  | ||||||
|     Accepts new files or files with a more recent modification date. |  | ||||||
|     """ |  | ||||||
|     resolved_file = organizer.resolve(info.path) |  | ||||||
|     if not resolved_file.exists() or info.modification_date is None: |  | ||||||
|         return True |  | ||||||
|     resolved_mod_time_seconds = resolved_file.stat().st_mtime |  | ||||||
|  |  | ||||||
|     # Download if the info is newer |  | ||||||
|     if info.modification_date.timestamp() > resolved_mod_time_seconds: |  | ||||||
|         return True |  | ||||||
|  |  | ||||||
|     PRETTY.ignored_file(info.path, "local file has newer or equal modification time") |  | ||||||
|     return False |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class IliasDownloader: |  | ||||||
|     # pylint: disable=too-many-arguments |  | ||||||
|     """A downloader for ILIAS.""" |  | ||||||
|  |  | ||||||
|     def __init__( |  | ||||||
|             self, |  | ||||||
|             tmp_dir: TmpDir, |  | ||||||
|             organizer: Organizer, |  | ||||||
|             session: requests.Session, |  | ||||||
|             authenticator: IliasAuthenticator, |  | ||||||
|             strategy: IliasDownloadStrategy, |  | ||||||
|             timeout: int = 5 |  | ||||||
|     ): |  | ||||||
|         """ |  | ||||||
|         Create a new IliasDownloader. |  | ||||||
|  |  | ||||||
|         The timeout applies to the download request only, as bwcloud uses IPv6 |  | ||||||
|         and requests has a problem with that: https://github.com/psf/requests/issues/5522 |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self._tmp_dir = tmp_dir |  | ||||||
|         self._organizer = organizer |  | ||||||
|         self._session = session |  | ||||||
|         self._authenticator = authenticator |  | ||||||
|         self._strategy = strategy |  | ||||||
|         self._timeout = timeout |  | ||||||
|  |  | ||||||
|     def download_all(self, infos: List[IliasDownloadInfo]) -> None: |  | ||||||
|         """ |  | ||||||
|         Download multiple files one after the other. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         for info in infos: |  | ||||||
|             self.download(info) |  | ||||||
|  |  | ||||||
|     def download(self, info: IliasDownloadInfo) -> None: |  | ||||||
|         """ |  | ||||||
|         Download a file from ILIAS. |  | ||||||
|  |  | ||||||
|         Retries the download up to three times, re-authenticating after each failure. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         LOGGER.debug("Downloading %r", info) |  | ||||||
|  |  | ||||||
|         if not self._strategy(self._organizer, info): |  | ||||||
|             self._organizer.mark(info.path) |  | ||||||
|             return |  | ||||||
|  |  | ||||||
|         tmp_file = self._tmp_dir.new_path() |  | ||||||
|  |  | ||||||
|         @retry_on_io_exception(3, "downloading file") |  | ||||||
|         def download_impl() -> bool: |  | ||||||
|             if not self._try_download(info, tmp_file): |  | ||||||
|                 LOGGER.info("Re-Authenticating due to download failure: %r", info) |  | ||||||
|                 self._authenticator.authenticate(self._session) |  | ||||||
|                 raise IOError("Scheduled retry") |  | ||||||
|             else: |  | ||||||
|                 return True |  | ||||||
|  |  | ||||||
|         if not download_impl(): |  | ||||||
|             PRETTY.error(f"Download of file {info.path} failed too often! Skipping it...") |  | ||||||
|             return |  | ||||||
|  |  | ||||||
|         dst_path = self._organizer.accept_file(tmp_file, info.path) |  | ||||||
|         if dst_path and info.modification_date: |  | ||||||
|             os.utime( |  | ||||||
|                 dst_path, |  | ||||||
|                 times=( |  | ||||||
|                     math.ceil(info.modification_date.timestamp()), |  | ||||||
|                     math.ceil(info.modification_date.timestamp()) |  | ||||||
|                 ) |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|     def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool: |  | ||||||
|         url = info.url() |  | ||||||
|         if url is None: |  | ||||||
|             PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/") |  | ||||||
|             return True |  | ||||||
|  |  | ||||||
|         with self._session.get(url, stream=True, timeout=self._timeout) as response: |  | ||||||
|             content_type = response.headers["content-type"] |  | ||||||
|             has_content_disposition = "content-disposition" in response.headers |  | ||||||
|  |  | ||||||
|             if content_type.startswith("text/html") and not has_content_disposition: |  | ||||||
|                 if self._is_logged_in(soupify(response)): |  | ||||||
|                     raise ContentTypeException("Attempting to download a web page, not a file") |  | ||||||
|  |  | ||||||
|                 return False |  | ||||||
|  |  | ||||||
|             # Yay, we got the file :) |  | ||||||
|             stream_to_path(response, target, info.path.name) |  | ||||||
|             return True |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _is_logged_in(soup: bs4.BeautifulSoup) -> bool: |  | ||||||
|         userlog = soup.find("li", {"id": "userlog"}) |  | ||||||
|         return userlog is not None |  | ||||||
PFERD/ipd.py
							| @@ -1,154 +0,0 @@ | |||||||
| """ |  | ||||||
| Utility functions and a scraper/downloader for the IPD pages. |  | ||||||
| """ |  | ||||||
| import datetime |  | ||||||
| import logging |  | ||||||
| import math |  | ||||||
| import os |  | ||||||
| from dataclasses import dataclass |  | ||||||
| from pathlib import Path |  | ||||||
| from typing import Callable, List, Optional |  | ||||||
| from urllib.parse import urljoin |  | ||||||
|  |  | ||||||
| import bs4 |  | ||||||
| import requests |  | ||||||
|  |  | ||||||
| from PFERD.errors import FatalException |  | ||||||
| from PFERD.utils import soupify |  | ||||||
|  |  | ||||||
| from .logging import PrettyLogger |  | ||||||
| from .organizer import Organizer |  | ||||||
| from .tmp_dir import TmpDir |  | ||||||
| from .transform import Transformable |  | ||||||
| from .utils import stream_to_path |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @dataclass |  | ||||||
| class IpdDownloadInfo(Transformable): |  | ||||||
|     """ |  | ||||||
|     Information about an ipd entry. |  | ||||||
|     """ |  | ||||||
|     url: str |  | ||||||
|     modification_date: Optional[datetime.datetime] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| IpdDownloadStrategy = Callable[[Organizer, IpdDownloadInfo], bool] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def ipd_download_new_or_modified(organizer: Organizer, info: IpdDownloadInfo) -> bool: |  | ||||||
|     """ |  | ||||||
|     Accepts new files or files with a more recent modification date. |  | ||||||
|     """ |  | ||||||
|     resolved_file = organizer.resolve(info.path) |  | ||||||
|     if not resolved_file.exists(): |  | ||||||
|         return True |  | ||||||
|     if not info.modification_date: |  | ||||||
|         PRETTY.ignored_file(info.path, "could not find modification time, file exists") |  | ||||||
|         return False |  | ||||||
|  |  | ||||||
|     resolved_mod_time_seconds = resolved_file.stat().st_mtime |  | ||||||
|  |  | ||||||
|     # Download if the info is newer |  | ||||||
|     if info.modification_date.timestamp() > resolved_mod_time_seconds: |  | ||||||
|         return True |  | ||||||
|  |  | ||||||
|     PRETTY.ignored_file(info.path, "local file has newer or equal modification time") |  | ||||||
|     return False |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class IpdCrawler: |  | ||||||
|     # pylint: disable=too-few-public-methods |  | ||||||
|     """ |  | ||||||
|     A crawler for IPD pages. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, base_url: str): |  | ||||||
|         self._base_url = base_url |  | ||||||
|  |  | ||||||
|     def _abs_url_from_link(self, link_tag: bs4.Tag) -> str: |  | ||||||
|         """ |  | ||||||
|         Create an absolute url from an <a> tag. |  | ||||||
|         """ |  | ||||||
|         return urljoin(self._base_url, link_tag.get("href")) |  | ||||||
|  |  | ||||||
|     def crawl(self) -> List[IpdDownloadInfo]: |  | ||||||
|         """ |  | ||||||
|         Crawls the IPD page given in the constructor. |  | ||||||
|         """ |  | ||||||
|         page = soupify(requests.get(self._base_url)) |  | ||||||
|  |  | ||||||
|         items: List[IpdDownloadInfo] = [] |  | ||||||
|  |  | ||||||
|         def is_relevant_url(x: str) -> bool: |  | ||||||
|             return x.endswith(".pdf") or x.endswith(".c") or x.endswith(".java") or x.endswith(".zip") |  | ||||||
|  |  | ||||||
|         for link in page.findAll(name="a", attrs={"href": lambda x: x and is_relevant_url(x)}): |  | ||||||
|             href: str = link.attrs.get("href") |  | ||||||
|             name = href.split("/")[-1] |  | ||||||
|  |  | ||||||
|             modification_date: Optional[datetime.datetime] = None |  | ||||||
|             try: |  | ||||||
|                 enclosing_row: bs4.Tag = link.findParent(name="tr") |  | ||||||
|                 if enclosing_row: |  | ||||||
|                     date_text = enclosing_row.find(name="td").text |  | ||||||
|                     modification_date = datetime.datetime.strptime(date_text, "%d.%m.%Y") |  | ||||||
|             except (ValueError, AttributeError):  # row without a date cell or unparseable date |  | ||||||
|                 modification_date = None |  | ||||||
|  |  | ||||||
|             items.append(IpdDownloadInfo( |  | ||||||
|                 Path(name), |  | ||||||
|                 url=self._abs_url_from_link(link), |  | ||||||
|                 modification_date=modification_date |  | ||||||
|             )) |  | ||||||
|  |  | ||||||
|         return items |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class IpdDownloader: |  | ||||||
|     """ |  | ||||||
|     A downloader for ipd files. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: IpdDownloadStrategy): |  | ||||||
|         self._tmp_dir = tmp_dir |  | ||||||
|         self._organizer = organizer |  | ||||||
|         self._strategy = strategy |  | ||||||
|         self._session = requests.session() |  | ||||||
|  |  | ||||||
|     def download_all(self, infos: List[IpdDownloadInfo]) -> None: |  | ||||||
|         """ |  | ||||||
|         Download multiple files one after the other. |  | ||||||
|         """ |  | ||||||
|         for info in infos: |  | ||||||
|             self.download(info) |  | ||||||
|  |  | ||||||
|     def download(self, info: IpdDownloadInfo) -> None: |  | ||||||
|         """ |  | ||||||
|         Download a single file. |  | ||||||
|         """ |  | ||||||
|         if not self._strategy(self._organizer, info): |  | ||||||
|             self._organizer.mark(info.path) |  | ||||||
|             return |  | ||||||
|  |  | ||||||
|         with self._session.get(info.url, stream=True) as response: |  | ||||||
|             if response.status_code == 200: |  | ||||||
|                 tmp_file = self._tmp_dir.new_path() |  | ||||||
|                 stream_to_path(response, tmp_file, info.path.name) |  | ||||||
|                 dst_path = self._organizer.accept_file(tmp_file, info.path) |  | ||||||
|  |  | ||||||
|                 if dst_path and info.modification_date: |  | ||||||
|                     os.utime( |  | ||||||
|                         dst_path, |  | ||||||
|                         times=( |  | ||||||
|                             math.ceil(info.modification_date.timestamp()), |  | ||||||
|                             math.ceil(info.modification_date.timestamp()) |  | ||||||
|                         ) |  | ||||||
|                     ) |  | ||||||
|  |  | ||||||
|             elif response.status_code == 403: |  | ||||||
|                 raise FatalException("Received 403. Are you not using the KIT VPN?") |  | ||||||
|             else: |  | ||||||
|                 PRETTY.warning(f"Could not download file, got response {response.status_code}") |  | ||||||
| @@ -1,41 +0,0 @@ | |||||||
| """ |  | ||||||
| Contains a Location class for objects with an inherent path. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| from pathlib import Path, PurePath |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ResolveException(Exception): |  | ||||||
|     """An exception while resolving a file.""" |  | ||||||
|     # TODO take care of this when doing exception handling |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class Location: |  | ||||||
|     """ |  | ||||||
|     An object that has an inherent path. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, path: Path): |  | ||||||
|         self._path = path.resolve() |  | ||||||
|  |  | ||||||
|     @property |  | ||||||
|     def path(self) -> Path: |  | ||||||
|         """ |  | ||||||
|         This object's location. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         return self._path |  | ||||||
|  |  | ||||||
|     def resolve(self, target: PurePath) -> Path: |  | ||||||
|         """ |  | ||||||
|         Resolve a file relative to the path of this location. |  | ||||||
|  |  | ||||||
|         Raises a [ResolveException] if the file is outside the given directory. |  | ||||||
|         """ |  | ||||||
|         absolute_path = self.path.joinpath(target).resolve() |  | ||||||
|  |  | ||||||
|         # TODO Make this less inefficient |  | ||||||
|         if self.path not in absolute_path.parents: |  | ||||||
|             raise ResolveException(f"Path {target} is not inside directory {self.path}") |  | ||||||
|  |  | ||||||
|         return absolute_path |  | ||||||
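|  |  | ||||||
|  |  | ||||||
| # Illustrative: Location(Path("/sync")).resolve(PurePath("a/b.pdf")) yields |  | ||||||
| # /sync/a/b.pdf, while resolve(PurePath("../escape")) raises a ResolveException. |  | ||||||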
PFERD/logging.py
							| @@ -1,184 +0,0 @@ | |||||||
| """ |  | ||||||
| Contains a few logger utility functions and implementations. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import logging |  | ||||||
| from typing import Optional |  | ||||||
|  |  | ||||||
| from rich._log_render import LogRender |  | ||||||
| from rich.console import Console |  | ||||||
| from rich.style import Style |  | ||||||
| from rich.text import Text |  | ||||||
| from rich.theme import Theme |  | ||||||
|  |  | ||||||
| from .download_summary import DownloadSummary |  | ||||||
| from .utils import PathLike, to_path |  | ||||||
|  |  | ||||||
| STYLE = "{" |  | ||||||
| FORMAT = "[{levelname:<7}] {message}" |  | ||||||
| DATE_FORMAT = "%F %T" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def enable_logging(name: str = "PFERD", level: int = logging.INFO) -> None: |  | ||||||
|     """ |  | ||||||
|     Enable and configure logging via the logging module. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     logger = logging.getLogger(name) |  | ||||||
|     logger.setLevel(level) |  | ||||||
|     logger.addHandler(RichLoggingHandler(level=level)) |  | ||||||
|  |  | ||||||
|     # This should be logged by our own handler, and not the root logger's |  | ||||||
|     # default handler, so we don't pass it on to the root logger. |  | ||||||
|     logger.propagate = False |  | ||||||
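|  |  | ||||||
|  |  | ||||||
| # Typical use (illustrative): |  | ||||||
| #   enable_logging() |  | ||||||
| #   logging.getLogger("PFERD").info("Hello from [bold]rich[/bold] markup") |  | ||||||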
|  |  | ||||||
|  |  | ||||||
| class RichLoggingHandler(logging.Handler): |  | ||||||
|     """ |  | ||||||
|     A logging handler that uses rich for highlighting |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, level: int) -> None: |  | ||||||
|         super().__init__(level=level) |  | ||||||
|         self.console = Console(theme=Theme({ |  | ||||||
|             "logging.level.warning": Style(color="yellow") |  | ||||||
|         })) |  | ||||||
|         self._log_render = LogRender(show_level=True, show_time=False, show_path=False) |  | ||||||
|  |  | ||||||
|     def emit(self, record: logging.LogRecord) -> None: |  | ||||||
|         """ |  | ||||||
|         Invoked by logging. |  | ||||||
|         """ |  | ||||||
|         log_style = f"logging.level.{record.levelname.lower()}" |  | ||||||
|         message = self.format(record) |  | ||||||
|  |  | ||||||
|         level = Text() |  | ||||||
|         level.append(record.levelname, log_style) |  | ||||||
|         message_text = Text.from_markup(message) |  | ||||||
|  |  | ||||||
|         self.console.print( |  | ||||||
|             self._log_render( |  | ||||||
|                 self.console, |  | ||||||
|                 [message_text], |  | ||||||
|                 level=level, |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class PrettyLogger: |  | ||||||
|     """ |  | ||||||
|     A logger that prints some specially formatted log messages in color. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, logger: logging.Logger) -> None: |  | ||||||
|         self.logger = logger |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def _format_path(path: PathLike) -> str: |  | ||||||
|         return repr(str(to_path(path))) |  | ||||||
|  |  | ||||||
|     def error(self, message: str) -> None: |  | ||||||
|         """ |  | ||||||
|         Print an error message indicating some operation fatally failed. |  | ||||||
|         """ |  | ||||||
|         self.logger.error( |  | ||||||
|             f"[bold red]{message}[/bold red]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def warning(self, message: str) -> None: |  | ||||||
|         """ |  | ||||||
|         Print a warning message indicating some operation failed, but the error can be recovered |  | ||||||
|         or ignored. |  | ||||||
|         """ |  | ||||||
|         self.logger.warning( |  | ||||||
|             f"[bold yellow]{message}[/bold yellow]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def modified_file(self, path: PathLike) -> None: |  | ||||||
|         """ |  | ||||||
|         An existing file has changed. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self.logger.info( |  | ||||||
|             f"[bold magenta]Modified {self._format_path(path)}.[/bold magenta]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def new_file(self, path: PathLike) -> None: |  | ||||||
|         """ |  | ||||||
|         A new file has been downloaded. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self.logger.info( |  | ||||||
|             f"[bold green]Created {self._format_path(path)}.[/bold green]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def deleted_file(self, path: PathLike) -> None: |  | ||||||
|         """ |  | ||||||
|         A file has been deleted. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self.logger.info( |  | ||||||
|             f"[bold red]Deleted {self._format_path(path)}.[/bold red]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def ignored_file(self, path: PathLike, reason: str) -> None: |  | ||||||
|         """ |  | ||||||
|         File was not downloaded or modified. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self.logger.info( |  | ||||||
|             f"[dim]Ignored {self._format_path(path)} " |  | ||||||
|             f"([/dim]{reason}[dim]).[/dim]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def searching(self, path: PathLike) -> None: |  | ||||||
|         """ |  | ||||||
|         A crawler searches a particular object. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self.logger.info(f"Searching {self._format_path(path)}") |  | ||||||
|  |  | ||||||
|     def not_searching(self, path: PathLike, reason: str) -> None: |  | ||||||
|         """ |  | ||||||
|         A crawler does not search a particular object. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self.logger.info( |  | ||||||
|             f"[dim]Not searching {self._format_path(path)} " |  | ||||||
|             f"([/dim]{reason}[dim]).[/dim]" |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def summary(self, download_summary: DownloadSummary) -> None: |  | ||||||
|         """ |  | ||||||
|         Prints a download summary. |  | ||||||
|         """ |  | ||||||
|         self.logger.info("") |  | ||||||
|         self.logger.info("[bold cyan]Download Summary[/bold cyan]") |  | ||||||
|         if not download_summary.has_updates(): |  | ||||||
|             self.logger.info("[bold dim]Nothing changed![/bold dim]") |  | ||||||
|             return |  | ||||||
|  |  | ||||||
|         for new_file in download_summary.new_files: |  | ||||||
|             self.new_file(new_file) |  | ||||||
|         for modified_file in download_summary.modified_files: |  | ||||||
|             self.modified_file(modified_file) |  | ||||||
|         for deleted_file in download_summary.deleted_files: |  | ||||||
|             self.deleted_file(deleted_file) |  | ||||||
|  |  | ||||||
|     def starting_synchronizer( |  | ||||||
|             self, |  | ||||||
|             target_directory: PathLike, |  | ||||||
|             synchronizer_name: str, |  | ||||||
|             subject: Optional[str] = None, |  | ||||||
|     ) -> None: |  | ||||||
|         """ |  | ||||||
|         A special message marking that a synchronizer has been started. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         subject_str = f"{subject} " if subject else "" |  | ||||||
|         self.logger.info("") |  | ||||||
|         self.logger.info(( |  | ||||||
|             f"[bold cyan]Synchronizing " |  | ||||||
|             f"{subject_str}to {self._format_path(target_directory)} " |  | ||||||
|             f"using the {synchronizer_name} synchronizer.[/bold cyan]" |  | ||||||
|         )) |  | ||||||
| @@ -1,224 +0,0 @@ | |||||||
| """A simple helper for managing downloaded files. |  | ||||||
|  |  | ||||||
| An organizer is bound to a single directory. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import filecmp |  | ||||||
| import logging |  | ||||||
| import os |  | ||||||
| import shutil |  | ||||||
| from enum import Enum |  | ||||||
| from pathlib import Path, PurePath |  | ||||||
| from typing import Callable, List, Optional, Set |  | ||||||
|  |  | ||||||
| from .download_summary import DownloadSummary |  | ||||||
| from .location import Location |  | ||||||
| from .logging import PrettyLogger |  | ||||||
| from .utils import prompt_yes_no |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
| PRETTY = PrettyLogger(LOGGER) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ConflictType(Enum): |  | ||||||
|     """ |  | ||||||
|     The type of the conflict. A file might not exist anymore and will be deleted |  | ||||||
|     or it might be overwritten with a newer version. |  | ||||||
|  |  | ||||||
|     FILE_OVERWRITTEN: An existing file will be updated |  | ||||||
|     MARKED_FILE_OVERWRITTEN: A file is written more than once in this run |  | ||||||
|     FILE_DELETED: The file was deleted |  | ||||||
|     """ |  | ||||||
|     FILE_OVERWRITTEN = "overwritten" |  | ||||||
|     MARKED_FILE_OVERWRITTEN = "marked_file_overwritten" |  | ||||||
|     FILE_DELETED = "deleted" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FileConflictResolution(Enum): |  | ||||||
|     """ |  | ||||||
|     The reaction when confronted with a file conflict: |  | ||||||
|  |  | ||||||
|     DESTROY_EXISTING: Delete/overwrite the current file |  | ||||||
|     KEEP_EXISTING: Keep the current file |  | ||||||
|     DEFAULT: Do whatever the PFERD authors thought was sensible |  | ||||||
|     PROMPT: Interactively ask the user |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     DESTROY_EXISTING = "destroy" |  | ||||||
|  |  | ||||||
|     KEEP_EXISTING = "keep" |  | ||||||
|  |  | ||||||
|     DEFAULT = "default" |  | ||||||
|  |  | ||||||
|     PROMPT = "prompt" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| FileConflictResolver = Callable[[PurePath, ConflictType], FileConflictResolution] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def resolve_prompt_user(_path: PurePath, conflict: ConflictType) -> FileConflictResolution: |  | ||||||
|     """ |  | ||||||
|     Resolves conflicts by asking the user if a file was written twice or will be deleted. |  | ||||||
|     """ |  | ||||||
|     if conflict == ConflictType.FILE_OVERWRITTEN: |  | ||||||
|         return FileConflictResolution.DESTROY_EXISTING |  | ||||||
|     return FileConflictResolution.PROMPT |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FileAcceptException(Exception): |  | ||||||
|     """An exception while accepting a file.""" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class Organizer(Location): |  | ||||||
|     """A helper for managing downloaded files.""" |  | ||||||
|  |  | ||||||
|     def __init__(self, path: Path, conflict_resolver: FileConflictResolver = resolve_prompt_user): |  | ||||||
|         """Create a new organizer for a given path.""" |  | ||||||
|         super().__init__(path) |  | ||||||
|         self._known_files: Set[Path] = set() |  | ||||||
|  |  | ||||||
|         # Keep the root dir |  | ||||||
|         self._known_files.add(path.resolve()) |  | ||||||
|  |  | ||||||
|         self.download_summary = DownloadSummary() |  | ||||||
|  |  | ||||||
|         self.conflict_resolver = conflict_resolver |  | ||||||
|  |  | ||||||
|     def accept_file(self, src: Path, dst: PurePath) -> Optional[Path]: |  | ||||||
|         """ |  | ||||||
|         Move a file to this organizer and mark it. |  | ||||||
|  |  | ||||||
|         Returns the path the file was moved to, to allow the caller to adjust the metadata. |  | ||||||
|         As you might still need to adjust the metadata when the file was identical |  | ||||||
|         (e.g. update the timestamp), the path is also returned in this case. |  | ||||||
|         In all other cases (ignored, not overwritten, etc.) this method returns None. |  | ||||||
|         """ |  | ||||||
|         # Windows limits the path length to 260 for *some* historical reason |  | ||||||
|         # If you want longer paths, you will have to add the "\\?\" prefix in front of |  | ||||||
|         # your path... |  | ||||||
|         # See: |  | ||||||
|         # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation |  | ||||||
|         if os.name == 'nt': |  | ||||||
|             src_absolute = Path("\\\\?\\" + str(src.resolve())) |  | ||||||
|             dst_absolute = Path("\\\\?\\" + str(self.resolve(dst))) |  | ||||||
|         else: |  | ||||||
|             src_absolute = src.resolve() |  | ||||||
|             dst_absolute = self.resolve(dst) |  | ||||||
|  |  | ||||||
|         if not src_absolute.exists(): |  | ||||||
|             raise FileAcceptException("Source file does not exist") |  | ||||||
|  |  | ||||||
|         if not src_absolute.is_file(): |  | ||||||
|             raise FileAcceptException("Source is a directory") |  | ||||||
|  |  | ||||||
|         LOGGER.debug("Copying %s to %s", src_absolute, dst_absolute) |  | ||||||
|  |  | ||||||
|         if self._is_marked(dst): |  | ||||||
|             PRETTY.warning(f"File {str(dst_absolute)!r} was already written!") |  | ||||||
|             conflict = ConflictType.MARKED_FILE_OVERWRITTEN |  | ||||||
|             if self._resolve_conflict("Overwrite file?", dst_absolute, conflict, default=False): |  | ||||||
|                 PRETTY.ignored_file(dst_absolute, "file was written previously") |  | ||||||
|                 return None |  | ||||||
|  |  | ||||||
|         # Destination file is directory |  | ||||||
|         if dst_absolute.exists() and dst_absolute.is_dir(): |  | ||||||
|             prompt = f"Overwrite folder {dst_absolute} with file?" |  | ||||||
|             conflict = ConflictType.FILE_OVERWRITTEN |  | ||||||
|             if self._resolve_conflict(prompt, dst_absolute, conflict, default=False): |  | ||||||
|                 shutil.rmtree(dst_absolute) |  | ||||||
|             else: |  | ||||||
|                 PRETTY.warning(f"Could not add file {str(dst_absolute)!r}") |  | ||||||
|                 return None |  | ||||||
|  |  | ||||||
|         # Destination file exists |  | ||||||
|         if dst_absolute.exists() and dst_absolute.is_file(): |  | ||||||
|             if filecmp.cmp(str(src_absolute), str(dst_absolute), shallow=False): |  | ||||||
|                 # Bail out, nothing more to do |  | ||||||
|                 PRETTY.ignored_file(dst_absolute, "same file contents") |  | ||||||
|                 self.mark(dst) |  | ||||||
|                 return dst_absolute |  | ||||||
|  |  | ||||||
|             prompt = f"Overwrite file {dst_absolute}?" |  | ||||||
|             conflict = ConflictType.FILE_OVERWRITTEN |  | ||||||
|             if not self._resolve_conflict(prompt, dst_absolute, conflict, default=True): |  | ||||||
|                 PRETTY.ignored_file(dst_absolute, "user conflict resolution") |  | ||||||
|                 return None |  | ||||||
|  |  | ||||||
|             self.download_summary.add_modified_file(dst_absolute) |  | ||||||
|             PRETTY.modified_file(dst_absolute) |  | ||||||
|         else: |  | ||||||
|             self.download_summary.add_new_file(dst_absolute) |  | ||||||
|             PRETTY.new_file(dst_absolute) |  | ||||||
|  |  | ||||||
|         # Create parent dir if needed |  | ||||||
|         dst_parent_dir: Path = dst_absolute.parent |  | ||||||
|         dst_parent_dir.mkdir(exist_ok=True, parents=True) |  | ||||||
|  |  | ||||||
|         # Move file |  | ||||||
|         shutil.move(str(src_absolute), str(dst_absolute)) |  | ||||||
|  |  | ||||||
|         self.mark(dst) |  | ||||||
|  |  | ||||||
|         return dst_absolute |  | ||||||
|  |  | ||||||
|     def mark(self, path: PurePath) -> None: |  | ||||||
|         """Mark a file as used so it will not get cleaned up.""" |  | ||||||
|         absolute_path = self.resolve(path) |  | ||||||
|         self._known_files.add(absolute_path) |  | ||||||
|         LOGGER.debug("Tracked %s", absolute_path) |  | ||||||
|  |  | ||||||
|     def _is_marked(self, path: PurePath) -> bool: |  | ||||||
|         """ |  | ||||||
|         Checks whether a file is marked. |  | ||||||
|         """ |  | ||||||
|         absolute_path = self.resolve(path) |  | ||||||
|         return absolute_path in self._known_files |  | ||||||
|  |  | ||||||
|     def cleanup(self) -> None: |  | ||||||
|         """Remove all untracked files in the organizer's dir.""" |  | ||||||
|         LOGGER.debug("Deleting all untracked files...") |  | ||||||
|  |  | ||||||
|         self._cleanup(self.path) |  | ||||||
|  |  | ||||||
|     def _cleanup(self, start_dir: Path) -> None: |  | ||||||
|         if not start_dir.exists(): |  | ||||||
|             return |  | ||||||
|         paths: List[Path] = list(start_dir.iterdir()) |  | ||||||
|  |  | ||||||
|         # Recursively clean paths |  | ||||||
|         for path in paths: |  | ||||||
|             if path.is_dir(): |  | ||||||
|                 self._cleanup(path) |  | ||||||
|             else: |  | ||||||
|                 if path.resolve() not in self._known_files: |  | ||||||
|                     self._delete_file_if_confirmed(path) |  | ||||||
|  |  | ||||||
|         # Delete dir if it was empty and untracked |  | ||||||
|         dir_empty = len(list(start_dir.iterdir())) == 0 |  | ||||||
|         if start_dir.resolve() not in self._known_files and dir_empty: |  | ||||||
|             start_dir.rmdir() |  | ||||||
|  |  | ||||||
|     def _delete_file_if_confirmed(self, path: Path) -> None: |  | ||||||
|         prompt = f"Do you want to delete {path}" |  | ||||||
|  |  | ||||||
|         if self._resolve_conflict(prompt, path, ConflictType.FILE_DELETED, default=False): |  | ||||||
|             self.download_summary.add_deleted_file(path) |  | ||||||
|             path.unlink() |  | ||||||
|         else: |  | ||||||
|             PRETTY.ignored_file(path, "user conflict resolution") |  | ||||||
|  |  | ||||||
|     def _resolve_conflict( |  | ||||||
|             self, prompt: str, path: Path, conflict: ConflictType, default: bool |  | ||||||
|     ) -> bool: |  | ||||||
|         if not self.conflict_resolver: |  | ||||||
|             return prompt_yes_no(prompt, default=default) |  | ||||||
|  |  | ||||||
|         result = self.conflict_resolver(path, conflict) |  | ||||||
|         if result == FileConflictResolution.DEFAULT: |  | ||||||
|             return default |  | ||||||
|         if result == FileConflictResolution.KEEP_EXISTING: |  | ||||||
|             return False |  | ||||||
|         if result == FileConflictResolution.DESTROY_EXISTING: |  | ||||||
|             return True |  | ||||||
|  |  | ||||||
|         return prompt_yes_no(prompt, default=default) |  | ||||||
| @@ -1,111 +0,0 @@ | |||||||
| """ |  | ||||||
| A small progress bar implementation. |  | ||||||
| """ |  | ||||||
| import sys |  | ||||||
| from dataclasses import dataclass |  | ||||||
| from types import TracebackType |  | ||||||
| from typing import Optional, Type |  | ||||||
|  |  | ||||||
| import requests |  | ||||||
| from rich.console import Console |  | ||||||
| from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID, |  | ||||||
|                            TextColumn, TimeRemainingColumn, |  | ||||||
|                            TransferSpeedColumn) |  | ||||||
|  |  | ||||||
| _progress: Progress = Progress( |  | ||||||
|     TextColumn("[bold blue]{task.fields[name]}", justify="right"), |  | ||||||
|     BarColumn(bar_width=None), |  | ||||||
|     "[progress.percentage]{task.percentage:>3.1f}%", |  | ||||||
|     "•", |  | ||||||
|     DownloadColumn(), |  | ||||||
|     "•", |  | ||||||
|     TransferSpeedColumn(), |  | ||||||
|     "•", |  | ||||||
|     TimeRemainingColumn(), |  | ||||||
|     console=Console(file=sys.stdout), |  | ||||||
|     transient=True |  | ||||||
| ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def size_from_headers(response: requests.Response) -> Optional[int]: |  | ||||||
|     """ |  | ||||||
|     Return the size of the download based on the response headers. |  | ||||||
|  |  | ||||||
|     Arguments: |  | ||||||
|         response {requests.Response} -- the response |  | ||||||
|  |  | ||||||
|     Returns: |  | ||||||
|         Optional[int] -- the size |  | ||||||
|     """ |  | ||||||
|     if "Content-Length" in response.headers: |  | ||||||
|         return int(response.headers["Content-Length"]) |  | ||||||
|     return None |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @dataclass |  | ||||||
| class ProgressSettings: |  | ||||||
|     """ |  | ||||||
|     Settings you can pass to customize the progress bar. |  | ||||||
|     """ |  | ||||||
|     name: str |  | ||||||
|     max_size: int |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def progress_for(settings: Optional[ProgressSettings]) -> 'ProgressContextManager': |  | ||||||
|     """ |  | ||||||
|     Returns a context manager that displays progress |  | ||||||
|  |  | ||||||
|     Returns: |  | ||||||
|         ProgressContextManager -- the progress manager |  | ||||||
|     """ |  | ||||||
|     return ProgressContextManager(settings) |  | ||||||
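|  |  | ||||||
|  |  | ||||||
| # Typical use (illustrative): |  | ||||||
| #   settings = ProgressSettings("file.pdf", size_from_headers(response) or 0) |  | ||||||
| #   with progress_for(settings) as progress: |  | ||||||
| #       for chunk in response.iter_content(chunk_size=1024): |  | ||||||
| #           progress.advance(len(chunk)) |  | ||||||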
|  |  | ||||||
|  |  | ||||||
| class ProgressContextManager: |  | ||||||
|     """ |  | ||||||
|     A context manager used for displaying progress. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, settings: Optional[ProgressSettings]): |  | ||||||
|         self._settings = settings |  | ||||||
|         self._task_id: Optional[TaskID] = None |  | ||||||
|  |  | ||||||
|     def __enter__(self) -> 'ProgressContextManager': |  | ||||||
|         """Context manager entry function.""" |  | ||||||
|         if not self._settings: |  | ||||||
|             return self |  | ||||||
|  |  | ||||||
|         _progress.start() |  | ||||||
|         self._task_id = _progress.add_task( |  | ||||||
|             self._settings.name, |  | ||||||
|             total=self._settings.max_size, |  | ||||||
|             name=self._settings.name |  | ||||||
|         ) |  | ||||||
|         return self |  | ||||||
|  |  | ||||||
|     # pylint: disable=useless-return |  | ||||||
|     def __exit__( |  | ||||||
|             self, |  | ||||||
|             exc_type: Optional[Type[BaseException]], |  | ||||||
|             exc_value: Optional[BaseException], |  | ||||||
|             traceback: Optional[TracebackType], |  | ||||||
|     ) -> Optional[bool]: |  | ||||||
|         """Context manager exit function. Removes the task.""" |  | ||||||
|         if self._task_id is None: |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|         _progress.remove_task(self._task_id) |  | ||||||
|  |  | ||||||
|         if len(_progress.task_ids) == 0: |  | ||||||
|             # We need to clean up after ourselves, as we were the last one |  | ||||||
|             _progress.stop() |  | ||||||
|             _progress.refresh() |  | ||||||
|  |  | ||||||
|         return None |  | ||||||
|  |  | ||||||
|     def advance(self, amount: float) -> None: |  | ||||||
|         """ |  | ||||||
|         Advances the progress bar. |  | ||||||
|         """ |  | ||||||
|         if self._task_id is not None: |  | ||||||
|             _progress.advance(self._task_id, amount) |  | ||||||
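

# A minimal usage sketch, not part of the original module: stream a download
# and feed each chunk's size to the progress bar. The URL and file name are
# made up; everything else uses only the helpers defined above.
def _example_download(url: str, target: str) -> None:
    response = requests.get(url, stream=True)
    # Fall back to 0 if the server sends no Content-Length header
    size = size_from_headers(response) or 0
    with progress_for(ProgressSettings(name=target, max_size=size)) as progress:
        with open(target, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
                progress.advance(len(chunk))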
@@ -1,79 +0,0 @@
| """Helper functions and classes for temporary folders.""" |  | ||||||
|  |  | ||||||
| import logging |  | ||||||
| import shutil |  | ||||||
| from pathlib import Path |  | ||||||
| from types import TracebackType |  | ||||||
| from typing import Optional, Type |  | ||||||
|  |  | ||||||
| from .location import Location |  | ||||||
|  |  | ||||||
| LOGGER = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TmpDir(Location): |  | ||||||
|     """A temporary folder that can create files or nested temp folders.""" |  | ||||||
|  |  | ||||||
|     def __init__(self, path: Path): |  | ||||||
|         """Create a new temporary folder for the given path.""" |  | ||||||
|         super().__init__(path) |  | ||||||
|         self._counter = 0 |  | ||||||
|         self.cleanup() |  | ||||||
|         self.path.mkdir(parents=True, exist_ok=True) |  | ||||||
|  |  | ||||||
|     def __str__(self) -> str: |  | ||||||
|         """Format the folder as a string.""" |  | ||||||
|         return f"Folder at {self.path}" |  | ||||||
|  |  | ||||||
|     def __enter__(self) -> 'TmpDir': |  | ||||||
|         """Context manager entry function.""" |  | ||||||
|         return self |  | ||||||
|  |  | ||||||
|     # pylint: disable=useless-return |  | ||||||
|     def __exit__( |  | ||||||
|             self, |  | ||||||
|             exc_type: Optional[Type[BaseException]], |  | ||||||
|             exc_value: Optional[BaseException], |  | ||||||
|             traceback: Optional[TracebackType], |  | ||||||
|     ) -> Optional[bool]: |  | ||||||
|         """Context manager exit function. Calls cleanup().""" |  | ||||||
|         self.cleanup() |  | ||||||
|         return None |  | ||||||
|  |  | ||||||
|     def new_path(self, prefix: Optional[str] = None) -> Path: |  | ||||||
|         """ |  | ||||||
|         Return a unique path inside the directory. Doesn't create a file or |  | ||||||
|         directory. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
        name = f"{prefix if prefix else 'tmp'}-{self._inc_and_get_counter():03}"

        LOGGER.debug("Reserving temp path %s", name)

        return self.resolve(Path(name))

    def new_subdir(self, prefix: Optional[str] = None) -> 'TmpDir':
        """
        Create a new nested temporary folder and return it.
        """

        name = f"{prefix if prefix else 'tmp'}-{self._inc_and_get_counter():03}"
        sub_path = self.resolve(Path(name))
        sub_path.mkdir(parents=True)

        LOGGER.debug("Created temp dir %s at %s", name, sub_path)

        return TmpDir(sub_path)

    def cleanup(self) -> None:
        """Delete this folder and all contained files."""
        LOGGER.debug("Deleting temp folder %s", self.path)

        if self.path.resolve().exists():
            shutil.rmtree(self.path.resolve())

    def _inc_and_get_counter(self) -> int:
        """Return the current counter value, then increment the counter."""
        counter = self._counter
        self._counter += 1
        return counter
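

# A short usage sketch, not part of the original module: TmpDir deletes
# itself and everything inside it when used as a context manager. The
# prefixes and file content are invented for illustration.
def _example_tmp_dir() -> None:
    with TmpDir(Path("tmp")) as tmp_dir:
        scratch_file = tmp_dir.new_path(prefix="download")  # e.g. tmp/download-000
        scratch_file.write_text("hello")
        nested = tmp_dir.new_subdir(prefix="unzip")         # e.g. tmp/unzip-001
        print(nested)
    # Leaving the with-block ran cleanup() and removed tmp/ entirely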
@@ -1,142 +0,0 @@
| """ |  | ||||||
| Transforms let the user define functions to decide where the downloaded files |  | ||||||
| should be placed locally. They let the user do more advanced things like moving |  | ||||||
| only files whose names match a regex, or renaming files from one numbering |  | ||||||
| scheme to another. |  | ||||||
| """ |  | ||||||
|  |  | ||||||
| import os |  | ||||||
| import re |  | ||||||
| from dataclasses import dataclass |  | ||||||
| from pathlib import PurePath |  | ||||||
| from typing import Callable, List, Optional, TypeVar |  | ||||||
|  |  | ||||||
| from .utils import PathLike, Regex, to_path, to_pattern |  | ||||||
|  |  | ||||||
| Transform = Callable[[PurePath], Optional[PurePath]] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @dataclass |  | ||||||
| class Transformable: |  | ||||||
|     """ |  | ||||||
|     An object that can be transformed by a Transform. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     path: PurePath |  | ||||||
|  |  | ||||||
|  |  | ||||||
| TF = TypeVar("TF", bound=Transformable) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def apply_transform( |  | ||||||
|         transform: Transform, |  | ||||||
|         transformables: List[TF], |  | ||||||
| ) -> List[TF]: |  | ||||||
|     """ |  | ||||||
|     Apply a Transform to multiple Transformables, discarding those that were |  | ||||||
|     not transformed by the Transform. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     result: List[TF] = [] |  | ||||||
|     for transformable in transformables: |  | ||||||
|         new_path = transform(transformable.path) |  | ||||||
|         if new_path: |  | ||||||
|             transformable.path = new_path |  | ||||||
|             result.append(transformable) |  | ||||||
|     return result |  | ||||||
|  |  | ||||||
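# A minimal usage sketch, not part of the original module: apply_transform()
# keeps only the items the transform accepts. The file names below are
# hypothetical; glob() is defined further down and resolved at call time.
def _example_apply_transform() -> List[Transformable]:
    items = [
        Transformable(PurePath("Lectures/01_intro.pdf")),
        Transformable(PurePath("Misc/notes.txt")),
    ]
    # glob("*.pdf") keeps matching paths unchanged and rejects the rest,
    # so only the first item survives.
    return apply_transform(glob("*.pdf"), items)
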
# Transform combinators

def keep(path: PurePath) -> Optional[PurePath]:
    """Accept every path unchanged."""
    return path

def attempt(*args: Transform) -> Transform:
    """Try the given transforms in order and return the first result."""
    def inner(path: PurePath) -> Optional[PurePath]:
        for transform in args:
            result = transform(path)
            if result:
                return result
        return None
    return inner

def optionally(transform: Transform) -> Transform:
    """Apply the transform, but keep the path unchanged if it rejects it."""
    return attempt(transform, keep)

def do(*args: Transform) -> Transform:
    """Chain the given transforms; fail if any one of them fails."""
    def inner(path: PurePath) -> Optional[PurePath]:
        current = path
        for transform in args:
            result = transform(current)
            if result:
                current = result
            else:
                return None
        return current
    return inner

def predicate(pred: Callable[[PurePath], bool]) -> Transform:
    """Keep a path unchanged if the predicate holds, reject it otherwise."""
    def inner(path: PurePath) -> Optional[PurePath]:
        if pred(path):
            return path
        return None
    return inner

def glob(pattern: str) -> Transform:
    """Keep paths that match the given glob pattern."""
    return predicate(lambda path: path.match(pattern))

def move_dir(source_dir: PathLike, target_dir: PathLike) -> Transform:
    """Move all paths inside source_dir to the same position inside target_dir."""
    source_path = to_path(source_dir)
    target_path = to_path(target_dir)
    def inner(path: PurePath) -> Optional[PurePath]:
        if source_path in path.parents:
            return target_path / path.relative_to(source_path)
        return None
    return inner

def move(source: PathLike, target: PathLike) -> Transform:
    """Move the exact path source to target."""
    source_path = to_path(source)
    target_path = to_path(target)
    def inner(path: PurePath) -> Optional[PurePath]:
        if path == source_path:
            return target_path
        return None
    return inner

def rename(source: str, target: str) -> Transform:
    """Rename files named source to target, regardless of their directory."""
    def inner(path: PurePath) -> Optional[PurePath]:
        if path.name == source:
            return path.with_name(target)
        return None
    return inner

def re_move(regex: Regex, target: str) -> Transform:
    """
    Rewrite the whole path if it fully matches the regex. The target is a
    format string: {0} is the entire match, {1} and up are the groups.
    """
    def inner(path: PurePath) -> Optional[PurePath]:
        match = to_pattern(regex).fullmatch(str(path))
        if match:
            groups = [match.group(0)]
            groups.extend(match.groups())
            return PurePath(target.format(*groups))
        return None
    return inner

def re_rename(regex: Regex, target: str) -> Transform:
    """
    Like re_move, but matches and rewrites only the file name instead of the
    whole path.
    """
    def inner(path: PurePath) -> Optional[PurePath]:
        match = to_pattern(regex).fullmatch(path.name)
        if match:
            groups = [match.group(0)]
            groups.extend(match.groups())
            return path.with_name(target.format(*groups))
        return None
    return inner


def sanitize_windows_path(path: PurePath) -> PurePath:
    """
    Escape characters that are forbidden in Windows path names. This function
    is a no-op on other operating systems.
    """
    # Replace characters that are illegal in Windows paths with underscores
    if os.name == 'nt':
        sanitized_parts = [re.sub(r'[<>:"/\\|?*]', "_", x) for x in path.parts]
        return PurePath(*sanitized_parts)
    return path
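

# A minimal end-to-end sketch, not part of the original module: it combines
# the combinators above into one Transform. The directory names and numbering
# scheme are invented for illustration.
def _example_transform() -> Transform:
    return attempt(
        # First try: move exercise sheets and renumber them, e.g.
        # "Ex/sheet_3.pdf" -> "Exercises/03.pdf"
        do(
            move_dir("Ex", "Exercises"),
            re_rename(r"sheet_(\d+)\.pdf", "{1:0>2}.pdf"),
        ),
        # Otherwise: drop junk files, keep everything else where it is
        predicate(lambda path: path.name != ".DS_Store"),
    )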