Add download strategies to save bandwidth

Only download files that are newer than the local version.
This commit is contained in:
I-Al-Istannen 2020-04-23 18:29:20 +02:00
parent 13bc78c889
commit 076b8c5a1f
4 changed files with 93 additions and 4 deletions

View File

@ -7,6 +7,7 @@ more complex configuration, you need to import the other submodules manually.
import logging import logging
from .download_strategies import *
from .pferd import Pferd from .pferd import Pferd
STYLE = "{" STYLE = "{"

View File

@ -0,0 +1,80 @@
"""
Contains a few default strategies for limiting the amount of downloaded files.
"""
import datetime
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional
from typing_extensions import Protocol, runtime_checkable
from .organizer import Organizer
from .utils import PrettyLogger
LOGGER = logging.getLogger(__name__)
PRETTY = PrettyLogger(LOGGER)
@runtime_checkable
class DownloadInfo(Protocol):
    # pylint: disable=too-few-public-methods
    """
    Structural description of the minimal data known about a downloadable file.

    Any object exposing the two read-only attributes below satisfies this
    protocol. Because it is runtime checkable, it can also be used with
    isinstance().
    """

    @property
    def modification_date(self) -> Optional[datetime.datetime]:
        """
        The modification date of the file, or None if it is not known.
        """
        ...

    @property
    def path(self) -> Path:
        """
        The path of the file.
        """
        ...
class DownloadStrategy(ABC):
    # pylint: disable=too-few-public-methods
    """
    Abstract base class for policies that decide which files get downloaded.
    """

    @abstractmethod
    def should_download(self, organizer: Organizer, info: DownloadInfo) -> bool:
        """
        Decides whether the file described by the given info should be downloaded.

        Concrete strategies have to override this method; the class cannot be
        instantiated otherwise.
        """
class DownloadEverythingStrategy(DownloadStrategy):
    # pylint: disable=too-few-public-methods
    """
    A strategy that accepts every file, so everything is downloaded again.
    """

    def should_download(self, organizer: Organizer, info: DownloadInfo) -> bool:
        # Both arguments are deliberately ignored: nothing is ever filtered out.
        return True
class DownloadNewOrModified(DownloadStrategy):
    # pylint: disable=too-few-public-methods
    """
    A strategy that only downloads changed or new files.

    A file is downloaded when there is no local copy, when its modification
    date is unknown, or when that modification date is strictly newer than
    the modification time of the local copy.
    """

    def should_download(self, organizer: Organizer, info: DownloadInfo) -> bool:
        """
        Decides whether the given file should be downloaded.

        The local counterpart is looked up via organizer.resolve(info.path).
        Returns True if the file should be fetched. Otherwise the path is
        reported through PRETTY.filtered_path and False is returned.
        """
        resolved_file = organizer.resolve(info.path)

        if not resolved_file.exists() or info.modification_date is None:
            return True

        try:
            resolved_mod_time_seconds = resolved_file.stat().st_mtime
        except FileNotFoundError:
            # The file vanished between exists() and stat(). Treat it like
            # any other missing file instead of aborting the whole run.
            return True

        # Download if the info is newer.
        # NOTE(review): timestamp() interprets a naive datetime as local
        # time - confirm this matches how modification_date is produced.
        if info.modification_date.timestamp() > resolved_mod_time_seconds:
            return True

        PRETTY.filtered_path(info.path, "Local file had newer or equal modification time")
        return False

View File

@ -3,7 +3,7 @@
import datetime import datetime
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import List from typing import List, Optional
import bs4 import bs4
import requests import requests
@ -26,7 +26,7 @@ class IliasDownloadInfo(Transformable):
""" """
url: str url: str
modification_date: datetime.datetime modification_date: Optional[datetime.datetime]
# parameters: Dict[str, Any] = field(default_factory=dict) # parameters: Dict[str, Any] = field(default_factory=dict)

View File

@ -2,6 +2,7 @@ from pathlib import Path
from typing import Optional from typing import Optional
from .cookie_jar import CookieJar from .cookie_jar import CookieJar
from .download_strategies import DownloadEverythingStrategy, DownloadStrategy
from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter, from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter,
IliasDownloader, KitShibbolethAuthenticator) IliasDownloader, KitShibbolethAuthenticator)
from .organizer import Organizer from .organizer import Organizer
@ -9,7 +10,6 @@ from .tmp_dir import TmpDir
from .transform import Transform, apply_transform from .transform import Transform, apply_transform
from .utils import Location from .utils import Location
# TODO save known-good cookies as soon as possible # TODO save known-good cookies as soon as possible
# TODO print synchronizer name before beginning synchronization # TODO print synchronizer name before beginning synchronization
@ -31,6 +31,7 @@ class Pferd(Location):
cookies: Optional[Path], cookies: Optional[Path],
dir_filter: IliasDirectoryFilter, dir_filter: IliasDirectoryFilter,
transform: Transform, transform: Transform,
download_strategy: DownloadStrategy,
) -> None: ) -> None:
cookie_jar = CookieJar(cookies) cookie_jar = CookieJar(cookies)
session = cookie_jar.create_session() session = cookie_jar.create_session()
@ -43,7 +44,12 @@ class Pferd(Location):
cookie_jar.load_cookies() cookie_jar.load_cookies()
info = crawler.crawl() info = crawler.crawl()
cookie_jar.save_cookies() cookie_jar.save_cookies()
downloader.download_all(apply_transform(transform, info)) downloader.download_all(
[
info for info in apply_transform(transform, info)
if download_strategy.should_download(organizer, info)
]
)
cookie_jar.save_cookies() cookie_jar.save_cookies()
def ilias_kit( def ilias_kit(
@ -55,6 +61,7 @@ class Pferd(Location):
cookies: Optional[Path] = None, cookies: Optional[Path] = None,
username: Optional[str] = None, username: Optional[str] = None,
password: Optional[str] = None, password: Optional[str] = None,
download_strategy: DownloadStrategy = DownloadEverythingStrategy(),
) -> None: ) -> None:
# This authenticator only works with the KIT ilias instance. # This authenticator only works with the KIT ilias instance.
authenticator = KitShibbolethAuthenticator(username=username, password=password) authenticator = KitShibbolethAuthenticator(username=username, password=password)
@ -66,4 +73,5 @@ class Pferd(Location):
cookies=cookies, cookies=cookies,
dir_filter=dir_filter, dir_filter=dir_filter,
transform=transform, transform=transform,
download_strategy=download_strategy,
) )