From 076b8c5a1f093f74a6b211bf0d7a81ef70b0687a Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 23 Apr 2020 18:29:20 +0200 Subject: [PATCH] Add download strategies to save bandwidth Only download files that are newer than the local version. --- PFERD/__init__.py | 1 + PFERD/download_strategies.py | 80 ++++++++++++++++++++++++++++++++++++ PFERD/ilias/downloader.py | 4 +- PFERD/pferd.py | 12 +++++- 4 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 PFERD/download_strategies.py diff --git a/PFERD/__init__.py b/PFERD/__init__.py index 87131ce..9759e22 100644 --- a/PFERD/__init__.py +++ b/PFERD/__init__.py @@ -7,6 +7,7 @@ more complex configuration, you need to import the other submodules manually. import logging +from .download_strategies import * from .pferd import Pferd STYLE = "{" diff --git a/PFERD/download_strategies.py b/PFERD/download_strategies.py new file mode 100644 index 0000000..974fa61 --- /dev/null +++ b/PFERD/download_strategies.py @@ -0,0 +1,80 @@ +""" +Contains a few default strategies for limiting the amount of downloaded files. +""" + +import datetime +import logging +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Optional + +from typing_extensions import Protocol, runtime_checkable + +from .organizer import Organizer +from .utils import PrettyLogger + +LOGGER = logging.getLogger(__name__) +PRETTY = PrettyLogger(LOGGER) + + +@runtime_checkable +class DownloadInfo(Protocol): + # pylint: disable=too-few-public-methods + """ + This class describes some minimal information about a file you can download. + """ + + @property + def path(self) -> Path: + """ + Returns the path. + """ + + @property + def modification_date(self) -> Optional[datetime.datetime]: + """ + Returns the modification date or None if not known. + """ + + +class DownloadStrategy(ABC): + # pylint: disable=too-few-public-methods + """ + A strategy deciding whether to download a given info. 
+ """ + + @abstractmethod + def should_download(self, organizer: Organizer, info: DownloadInfo) -> bool: + """ + Decides whether a given file should be downloaded. + """ + + +class DownloadEverythingStrategy(DownloadStrategy): + # pylint: disable=too-few-public-methods + """ + A strategy that redownloads everything. + """ + + def should_download(self, organizer: Organizer, info: DownloadInfo) -> bool: + return True + + +class DownloadNewOrModified(DownloadStrategy): + # pylint: disable=too-few-public-methods + """ + A strategy that only downloads changed or new files. + """ + + def should_download(self, organizer: Organizer, info: DownloadInfo) -> bool: + resolved_file = organizer.resolve(info.path) + if not resolved_file.exists() or info.modification_date is None: + return True + resolved_mod_time_seconds = resolved_file.stat().st_mtime + + # Download if the info is newer + if info.modification_date.timestamp() > resolved_mod_time_seconds: + return True + + PRETTY.filtered_path(info.path, "Local file had newer or equal modification time") + return False diff --git a/PFERD/ilias/downloader.py b/PFERD/ilias/downloader.py index 88e6792..9a6c785 100644 --- a/PFERD/ilias/downloader.py +++ b/PFERD/ilias/downloader.py @@ -3,7 +3,7 @@ import datetime from dataclasses import dataclass from pathlib import Path -from typing import List +from typing import List, Optional import bs4 import requests @@ -26,7 +26,7 @@ class IliasDownloadInfo(Transformable): """ url: str - modification_date: datetime.datetime + modification_date: Optional[datetime.datetime] # parameters: Dict[str, Any] = field(default_factory=dict) diff --git a/PFERD/pferd.py b/PFERD/pferd.py index ff2743f..de61da5 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import Optional from .cookie_jar import CookieJar +from .download_strategies import DownloadEverythingStrategy, DownloadStrategy from .ilias import (IliasAuthenticator, IliasCrawler, 
IliasDirectoryFilter, IliasDownloader, KitShibbolethAuthenticator) from .organizer import Organizer @@ -9,7 +10,6 @@ from .tmp_dir import TmpDir from .transform import Transform, apply_transform from .utils import Location - # TODO save known-good cookies as soon as possible # TODO print synchronizer name before beginning synchronization @@ -31,6 +31,7 @@ class Pferd(Location): cookies: Optional[Path], dir_filter: IliasDirectoryFilter, transform: Transform, + download_strategy: DownloadStrategy, ) -> None: cookie_jar = CookieJar(cookies) session = cookie_jar.create_session() @@ -43,7 +44,12 @@ class Pferd(Location): cookie_jar.load_cookies() info = crawler.crawl() cookie_jar.save_cookies() - downloader.download_all(apply_transform(transform, info)) + downloader.download_all( + [ + info for info in apply_transform(transform, info) + if download_strategy.should_download(organizer, info) + ] + ) cookie_jar.save_cookies() def ilias_kit( @@ -55,6 +61,7 @@ class Pferd(Location): cookies: Optional[Path] = None, username: Optional[str] = None, password: Optional[str] = None, + download_strategy: DownloadStrategy = DownloadEverythingStrategy(), ) -> None: # This authenticator only works with the KIT ilias instance. authenticator = KitShibbolethAuthenticator(username=username, password=password) @@ -66,4 +73,5 @@ class Pferd(Location): cookies=cookies, dir_filter=dir_filter, transform=transform, + download_strategy=download_strategy, )