pferd/PFERD/downloaders.py

73 lines
2.0 KiB
Python
Raw Permalink Normal View History

2020-04-20 17:08:51 +00:00
"""
General downloaders useful in many situations
"""
2020-04-20 17:43:41 +02:00
2020-04-20 18:06:21 +00:00
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
2020-04-20 17:43:41 +02:00
import requests
import requests.auth
2020-04-20 18:50:23 +02:00
from .organizer import Organizer
2020-04-20 17:43:41 +02:00
from .tmp_dir import TmpDir
2020-04-22 18:25:09 +00:00
from .transform import Transformable
2020-04-20 17:43:41 +02:00
from .utils import stream_to_path
2020-04-20 18:06:21 +00:00
@dataclass
class HttpDownloadInfo(Transformable):
    """
    Describes one file that is to be fetched over HTTP.
    """

    # Address the file is requested from.
    url: str
    # Passed as the ``params`` of the GET request; every instance gets its
    # own empty dict unless a value is supplied.
    parameters: Dict[str, Any] = field(default_factory=dict)
class HttpDownloader:
    """A HTTP downloader that can handle HTTP basic auth."""

    def __init__(
            self,
            tmp_dir: TmpDir,
            organizer: Organizer,
            username: Optional[str],
            password: Optional[str],
    ):
        """
        Create a new http downloader.

        Arguments:
            tmp_dir: source of the temporary paths downloads are written to.
            organizer: is handed every downloaded file via ``accept_file``.
            username: user name for HTTP basic auth, or None.
            password: password for HTTP basic auth, or None.
        """
        self._organizer = organizer
        self._tmp_dir = tmp_dir
        self._username = username
        self._password = password
        # A single session is built once and reused for every download.
        self._session = self._build_session()

    def _build_session(self) -> requests.Session:
        """
        Build the requests session used for all downloads.

        Basic auth is only attached when both the user name and the password
        are truthy; otherwise the session sends unauthenticated requests.
        """
        session = requests.Session()
        if self._username and self._password:
            session.auth = requests.auth.HTTPBasicAuth(
                self._username, self._password
            )
        return session

    def download_all(self, infos: List[HttpDownloadInfo]) -> None:
        """
        Download multiple files one after the other.

        An exception raised by one download is not caught here, so the
        remaining entries are not attempted.
        """
        for info in infos:
            self.download(info)

    def download(self, info: HttpDownloadInfo) -> None:
        """
        Download a single file.

        The response is streamed into a new temporary path, which is then
        passed to the organizer together with ``info.path``.

        Raises:
            requests.HTTPError: if the server answers with any status code
                other than 200. The response is attached to the exception.
        """
        with self._session.get(info.url, params=info.parameters, stream=True) as response:
            if response.status_code != 200:
                # HTTPError is still an Exception subclass, so callers that
                # catch Exception keep working, but they can now also catch
                # the failure specifically and inspect ``exc.response``.
                raise requests.HTTPError(
                    f"Could not download file, got response {response.status_code}",
                    response=response,
                )

            tmp_file = self._tmp_dir.new_path()
            # NOTE(review): the third argument looks like a display name for
            # progress output - confirm against utils.stream_to_path.
            stream_to_path(response, tmp_file, info.path.name)
            self._organizer.accept_file(tmp_file, info.path)