pferd/PFERD/ilias/downloader.py

174 lines
5.3 KiB
Python
Raw Normal View History

2020-04-20 18:36:40 +02:00
"""Contains a downloader for ILIAS."""
import datetime
import logging
import math
import os
from pathlib import Path, PurePath
from typing import Callable, List, Optional, Union
2020-04-20 18:36:40 +02:00
import bs4
import requests
2021-04-13 11:32:55 +02:00
from ..errors import retry_on_io_exception
2020-04-25 19:59:58 +02:00
from ..logging import PrettyLogger
2020-04-20 18:50:23 +02:00
from ..organizer import Organizer
2020-04-20 18:36:40 +02:00
from ..tmp_dir import TmpDir
2020-04-22 20:25:09 +02:00
from ..transform import Transformable
2020-04-25 19:59:58 +02:00
from ..utils import soupify, stream_to_path
2020-04-20 18:36:40 +02:00
from .authenticators import IliasAuthenticator
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# PrettyLogger wrapped around LOGGER; used in this module for warnings, errors
# and "ignored file" notices.
PRETTY = PrettyLogger(LOGGER)
2020-04-20 18:36:40 +02:00
class ContentTypeException(Exception):
    """Raised when an ILIAS element has a content type that cannot be handled."""
2020-04-22 20:25:09 +02:00
class IliasDownloadInfo(Transformable):
    """
    Describes one single file that is to be downloaded.

    The URL is always stored as a zero-argument callable returning an optional
    string, regardless of whether a plain string or a callable was passed in.
    """

    def __init__(
        self,
        path: PurePath,
        url: Union[str, Callable[[], Optional[str]]],
        # The misspelled parameter name is part of the public signature and is
        # therefore kept as-is.
        modifcation_date: Optional[datetime.datetime]
    ):
        super().__init__(path)

        if not isinstance(url, str):
            # Already a callable: store it untouched.
            self.url: Callable[[], Optional[str]] = url
        else:
            # Wrap the plain string so that callers can always invoke url().
            constant_url = url
            self.url = lambda: constant_url

        self.modification_date = modifcation_date
2020-04-20 20:06:21 +02:00
# Signature of a download strategy: it is called with the organizer and the
# download info and returns a bool. IliasDownloader.download only marks the
# file (without downloading it) when the strategy returns False.
IliasDownloadStrategy = Callable[[Organizer, IliasDownloadInfo], bool]
def download_everything(organizer: Organizer, info: IliasDownloadInfo) -> bool:
    # pylint: disable=unused-argument
    """
    Download strategy that accepts every file, ignoring both arguments.
    """
    return True
def download_modified_or_new(organizer: Organizer, info: IliasDownloadInfo) -> bool:
    """
    Download strategy that accepts files which are new locally or whose
    modification date is more recent than the local file's mtime.

    Files without a known modification date are always accepted.
    """
    local_file = organizer.resolve(info.path)

    # Without a local copy or a remote date there is nothing to compare.
    if not (local_file.exists() and info.modification_date is not None):
        return True

    local_mtime_seconds = local_file.stat().st_mtime

    # Skip the file unless the remote version is strictly newer.
    if info.modification_date.timestamp() <= local_mtime_seconds:
        PRETTY.ignored_file(info.path, "local file has newer or equal modification time")
        return False

    return True
2020-04-20 20:06:21 +02:00
class IliasDownloader:
    # pylint: disable=too-many-arguments
    """A downloader for ILIAS."""

    def __init__(
        self,
        tmp_dir: TmpDir,
        organizer: Organizer,
        session: requests.Session,
        authenticator: IliasAuthenticator,
        strategy: IliasDownloadStrategy,
        timeout: int = 5
    ):
        """
        Create a new IliasDownloader.

        The timeout applies to the download request only, as bwcloud uses IPv6
        and requests has a problem with that: https://github.com/psf/requests/issues/5522
        """
        self._tmp_dir = tmp_dir
        self._organizer = organizer
        self._session = session
        self._authenticator = authenticator
        self._strategy = strategy
        self._timeout = timeout

    def download_all(self, infos: List[IliasDownloadInfo]) -> None:
        """
        Download multiple files one after the other.
        """
        for info in infos:
            self.download(info)

    def download(self, info: IliasDownloadInfo) -> None:
        """
        Download a file from ILIAS.

        The strategy is consulted first; if it rejects the file, the path is
        only marked in the organizer and nothing is downloaded.

        Otherwise the file is fetched into a temporary path. After every failed
        attempt the session is re-authenticated and a retry is scheduled via
        retry_on_io_exception(3, ...). If the decorated call ends up returning
        a falsy value, an error is logged and the file is skipped.

        On success the temporary file is handed to the organizer and, when a
        modification date is known, the mtime/atime of the accepted file are
        set to that date (rounded up to full seconds).
        """
        LOGGER.debug("Downloading %r", info)

        if not self._strategy(self._organizer, info):
            self._organizer.mark(info.path)
            return

        tmp_file = self._tmp_dir.new_path()

        @retry_on_io_exception(3, "downloading file")
        def download_impl() -> bool:
            if self._try_download(info, tmp_file):
                return True

            LOGGER.info("Re-Authenticating due to download failure: %r", info)
            self._authenticator.authenticate(self._session)
            # Raising an IOError is what triggers the retry in the decorator.
            raise IOError("Scheduled retry")

        if not download_impl():
            PRETTY.error(f"Download of file {info.path} failed too often! Skipping it...")
            return

        dst_path = self._organizer.accept_file(tmp_file, info.path)
        if dst_path and info.modification_date:
            # Round up so the local mtime is never older than the remote date.
            mtime_seconds = math.ceil(info.modification_date.timestamp())
            os.utime(dst_path, times=(mtime_seconds, mtime_seconds))

    def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool:
        """
        Try to download the file described by info into target once.

        Returns True if no re-authenticating retry is needed and False if an
        HTML page was received while not being logged in.

        Raises a ContentTypeException if an HTML page (without a
        content-disposition header) was received while being logged in.
        """
        url = info.url()
        if url is None:
            PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
            # NOTE(review): True is returned although nothing was written to
            # target, so the caller goes on to accept_file() with that path -
            # confirm that the organizer copes with a missing source file.
            return True

        with self._session.get(url, stream=True, timeout=self._timeout) as response:
            # A response is not guaranteed to carry a Content-Type header, so
            # treat a missing one as "not HTML" instead of raising a KeyError.
            content_type = response.headers.get("content-type", "")
            has_content_disposition = "content-disposition" in response.headers

            if content_type.startswith("text/html") and not has_content_disposition:
                if self._is_logged_in(soupify(response)):
                    raise ContentTypeException("Attempting to download a web page, not a file")

                return False

            # Yay, we got the file :)
            stream_to_path(response, target, info.path.name)
            return True

    @staticmethod
    def _is_logged_in(soup: bs4.BeautifulSoup) -> bool:
        """
        Return whether the page contains an li element with the id "userlog".
        """
        userlog = soup.find("li", {"id": "userlog"})
        return userlog is not None