transition from requests to httpx

be7a
2021-04-23 18:02:57 +02:00
parent c1ab7485e2
commit 44aeb6c2eb
14 changed files with 80 additions and 89 deletions

View File

@@ -23,7 +23,7 @@ jobs:
python-version: '3.x'
- name: "Install dependencies"
run: "pip install setuptools keyring pyinstaller rich requests beautifulsoup4 -f --upgrade"
run: "pip install setuptools keyring pyinstaller rich httpx beautifulsoup4 -f --upgrade"
- name: "Install sync_url.py"
run: "pyinstaller sync_url.py -F"

View File

@@ -1,11 +1,11 @@
"""A helper for requests cookies."""
"""A helper for httpx cookies."""
import logging
from http.cookiejar import LoadError, LWPCookieJar
from pathlib import Path
from typing import Optional
import requests
import httpx
LOGGER = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ class CookieJar:
@property
def cookies(self) -> LWPCookieJar:
"""Return the requests cookie jar."""
"""Return the httpx cookie jar."""
return self._cookies
def load_cookies(self) -> None:
@@ -57,13 +57,11 @@ class CookieJar:
# TODO possibly catch a few more exceptions
self._cookies.save(ignore_discard=True)
def create_session(self) -> requests.Session:
"""Create a new session using the cookie jar."""
sess = requests.Session()
def create_client(self) -> httpx.Client:
"""Create a new client using the cookie jar."""
# TODO: timeout=None was the default behaviour of requests. An appropriate value should probably be set
client = httpx.Client(timeout=None)
# From the requests docs: "All requests code should work out of the box
# with externally provided instances of CookieJar, e.g. LWPCookieJar
# and FileCookieJar."
sess.cookies = self.cookies # type: ignore
client.cookies = self.cookies # type: ignore
return sess
return client
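
The TODO above deserves a concrete answer: requests had no default timeout, so timeout=None merely preserves the old behaviour, while httpx would otherwise default to 5 seconds. A minimal sketch of an explicit configuration, assuming the CookieJar wrapper from this file (the numeric values are illustrative, not project decisions):

    import httpx
    from http.cookiejar import LWPCookieJar

    def create_client(cookies: LWPCookieJar) -> httpx.Client:
        # httpx.Timeout can bound connect/read/write/pool phases separately;
        # these figures are placeholders, not values chosen by this commit.
        client = httpx.Client(timeout=httpx.Timeout(30.0, connect=5.0))
        # httpx, like requests, accepts any stdlib http.cookiejar.CookieJar.
        client.cookies = cookies  # type: ignore
        return client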

View File

@@ -7,7 +7,7 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, List, Optional
import requests
import httpx
from .errors import FatalException
from .logging import PrettyLogger
@@ -69,7 +69,7 @@ class DivaPlaylistCrawler:
)
base_name = match.group(1)
response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json")
response = httpx.get(cls._PLAYLIST_BASE_URL + base_name + ".json")
if response.status_code != 200:
raise FatalException(
@@ -88,7 +88,7 @@ class DivaPlaylistCrawler:
"""
Crawls the playlist given in the constructor.
"""
response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id})
response = httpx.get(self._COLLECTION_BASE_URL, params={"collection": self._id})
if response.status_code != 200:
raise FatalException(f"Server returned status {response.status_code}.")
@@ -143,7 +143,7 @@ class DivaDownloader:
self._tmp_dir = tmp_dir
self._organizer = organizer
self._strategy = strategy
self._session = requests.session()
self._client = httpx.Client()
def download_all(self, infos: List[DivaDownloadInfo]) -> None:
"""
@@ -160,7 +160,7 @@ class DivaDownloader:
self._organizer.mark(info.path)
return
with self._session.get(info.url, stream=True) as response:
with self._client.stream("GET", info.url) as response:
if response.status_code == 200:
tmp_file = self._tmp_dir.new_path()
stream_to_path(response, tmp_file, info.path.name)
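
The stream handling is the most invasive API change in this commit: requests streamed via a stream=True flag on get(), while httpx uses a dedicated stream() context manager that takes the HTTP method explicitly and only fetches the body while it is iterated. A hedged before/after sketch (the URL is a placeholder):

    import httpx

    url = "https://example.com/video.mp4"  # placeholder

    # Old requests idiom:
    #   with session.get(url, stream=True) as response:
    #       for chunk in response.iter_content(chunk_size=1024 ** 2):
    #           ...

    # New httpx idiom:
    with httpx.Client() as client:
        with client.stream("GET", url) as response:
            if response.status_code == 200:
                for chunk in response.iter_bytes():
                    pass  # write the chunk to a temporary file here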

View File

@@ -5,8 +5,8 @@ General downloaders useful in many situations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import requests
import requests.auth
import httpx
from .organizer import Organizer
from .tmp_dir import TmpDir
@@ -39,15 +39,15 @@ class HttpDownloader:
self._tmp_dir = tmp_dir
self._username = username
self._password = password
self._session = self._build_session()
self._client = self._build_client()
def _build_session(self) -> requests.Session:
session = requests.Session()
def _build_client(self) -> httpx.Client:
client = httpx.Client()
if self._username and self._password:
session.auth = requests.auth.HTTPBasicAuth(
client.auth = httpx.BasicAuth(
self._username, self._password
)
return session
return client
def download_all(self, infos: List[HttpDownloadInfo]) -> None:
"""
@@ -62,7 +62,7 @@ class HttpDownloader:
Download a single file.
"""
with self._session.get(info.url, params=info.parameters, stream=True) as response:
with self._client.stream("GET", info.url, params=info.parameters) as response:
if response.status_code == 200:
tmp_file = self._tmp_dir.new_path()
stream_to_path(response, tmp_file, info.path.name)
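
Note that httpx exports its auth helpers at the top level, so the basic-auth wiring needs no submodule import. A small sketch of the equivalent setup (credentials are placeholders):

    import httpx

    client = httpx.Client()
    # httpx.BasicAuth is the counterpart of requests.auth.HTTPBasicAuth;
    # assigning it to client.auth applies it to every request the client sends.
    client.auth = httpx.BasicAuth("user", "secret")  # placeholder credentials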

View File

@@ -7,7 +7,7 @@ import logging
from typing import Optional
import bs4
import requests
import httpx
from ..authenticators import TfaAuthenticator, UserPassAuthenticator
from ..utils import soupify
@@ -19,14 +19,14 @@ class IliasAuthenticator(abc.ABC):
# pylint: disable=too-few-public-methods
"""
An authenticator that logs an existing requests session into an ILIAS
An authenticator that logs an existing httpx client into an ILIAS
account.
"""
@abc.abstractmethod
def authenticate(self, sess: requests.Session) -> None:
def authenticate(self, client: httpx.Client) -> None:
"""
Log a requests session into this authenticator's ILIAS account.
Log an httpx client into this authenticator's ILIAS account.
"""
@@ -45,7 +45,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth")
def authenticate(self, sess: requests.Session) -> None:
def authenticate(self, sess: httpx.Client) -> None:
"""
Performs the ILIAS Shibboleth authentication dance and saves the login
cookies it receives.
@@ -109,7 +109,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
def _authenticate_tfa(
self,
session: requests.Session,
client: httpx.Client,
soup: bs4.BeautifulSoup
) -> bs4.BeautifulSoup:
# Searching the form here so that this fails before asking for
@@ -125,7 +125,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
"_eventId_proceed": "",
"j_tokenNumber": self._tfa_auth.get_token()
}
return soupify(session.post(url, data=data))
return soupify(client.post(url, data=data))
@staticmethod
def _login_successful(soup: bs4.BeautifulSoup) -> bool:

View File

@@ -13,7 +13,7 @@ from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
urlunsplit)
import bs4
import requests
import httpx
from ..errors import FatalException, retry_on_io_exception
from ..logging import PrettyLogger
@@ -96,7 +96,7 @@ class IliasCrawler:
def __init__(
self,
base_url: str,
session: requests.Session,
client: httpx.Client,
authenticator: IliasAuthenticator,
dir_filter: IliasDirectoryFilter
):
@@ -105,7 +105,7 @@ class IliasCrawler:
"""
self._base_url = base_url
self._session = session
self._client = client
self._authenticator = authenticator
self.dir_filter = dir_filter
@@ -157,9 +157,9 @@ class IliasCrawler:
return self._iterate_entries_to_download_infos(entries)
def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
response: requests.Response = self._session.get(root_url)
response: httpx.Response = self._client.get(root_url)
# We were redirected ==> Non-existent ID
if course_id not in response.url:
if course_id not in str(response.url):
return False
link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link")
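
The str() conversion above is required because httpx models response.url as an httpx.URL object rather than the plain string requests returned, so substring containment needs an explicit cast. In isolation:

    import httpx

    # httpx responses expose httpx.URL objects instead of plain strings,
    # so `course_id in response.url` would no longer work directly:
    url = httpx.URL("https://example.com/goto.php?target=crs_1234")  # placeholder
    assert "crs_1234" in str(url)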
@@ -564,7 +564,7 @@ class IliasCrawler:
# on the page, but defined in a JS object inside a script tag, passed to the player
# library.
# We do the impossible and RegEx the stream JSON object out of the page's HTML source
video_page_soup = soupify(self._session.get(play_url))
video_page_soup = soupify(self._client.get(play_url))
regex: re.Pattern = re.compile(
r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE
)
@@ -639,7 +639,7 @@ class IliasCrawler:
LOGGER.debug("Fetching %r", url)
response = self._session.get(url, params=params)
response = self._client.get(url, params=params)
content_type = response.headers["content-type"]
if not content_type.startswith("text/html"):
@@ -655,7 +655,7 @@ class IliasCrawler:
LOGGER.info("Not authenticated, changing that...")
self._authenticator.authenticate(self._session)
self._authenticator.authenticate(self._client)
return self._get_page(url, params, retry_count + 1)

View File

@@ -8,7 +8,7 @@ from pathlib import Path, PurePath
from typing import Callable, List, Optional, Union
import bs4
import requests
import httpx
from ..errors import retry_on_io_exception
from ..logging import PrettyLogger
@@ -82,21 +82,18 @@ class IliasDownloader:
self,
tmp_dir: TmpDir,
organizer: Organizer,
session: requests.Session,
client: httpx.Client,
authenticator: IliasAuthenticator,
strategy: IliasDownloadStrategy,
timeout: int = 5
):
"""
Create a new IliasDownloader.
The timeout applies to the download request only, as bwcloud uses IPv6
and requests has a problem with that: https://github.com/psf/requests/issues/5522
"""
self._tmp_dir = tmp_dir
self._organizer = organizer
self._session = session
self._client = client
self._authenticator = authenticator
self._strategy = strategy
self._timeout = timeout
@@ -128,7 +125,7 @@ class IliasDownloader:
def download_impl() -> bool:
if not self._try_download(info, tmp_file):
LOGGER.info("Re-Authenticating due to download failure: %r", info)
self._authenticator.authenticate(self._session)
self._authenticator.authenticate(self._client)
raise IOError("Scheduled retry")
else:
return True
@@ -153,7 +150,7 @@ class IliasDownloader:
PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
return True
with self._session.get(url, stream=True, timeout=self._timeout) as response:
with self._client.stream("GET", url, timeout=self._timeout) as response:
content_type = response.headers["content-type"]
has_content_disposition = "content-disposition" in response.headers
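
With the requests IPv6 issue (psf/requests#5522) out of the picture, the per-download timeout becomes a plain httpx feature rather than a workaround: a float passed per request overrides the client default, and httpx.Timeout allows per-phase bounds if that is ever needed. A sketch with illustrative values:

    import httpx

    client = httpx.Client()  # httpx's own default timeout is 5 seconds
    url = "https://example.com/download"  # placeholder

    # Per-request float override, mirroring the requests-era behaviour:
    with client.stream("GET", url, timeout=5.0) as response:
        pass

    # Or, hypothetically, bound each phase separately:
    granular = httpx.Timeout(connect=5.0, read=60.0, write=5.0, pool=5.0)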

View File

@@ -11,7 +11,7 @@ from typing import Callable, List, Optional
from urllib.parse import urljoin
import bs4
import requests
import httpx
from PFERD.errors import FatalException
from PFERD.utils import soupify
@@ -78,7 +78,7 @@ class IpdCrawler:
"""
Crawls the playlist given in the constructor.
"""
page = soupify(requests.get(self._base_url))
page = soupify(httpx.get(self._base_url))
items: List[IpdDownloadInfo] = []
@@ -116,7 +116,7 @@ class IpdDownloader:
self._tmp_dir = tmp_dir
self._organizer = organizer
self._strategy = strategy
self._session = requests.session()
self._client = httpx.Client()
def download_all(self, infos: List[IpdDownloadInfo]) -> None:
"""
@@ -133,7 +133,7 @@ class IpdDownloader:
self._organizer.mark(info.path)
return
with self._session.get(info.url, stream=True) as response:
with self._client.stream("GET", info.url) as response:
if response.status_code == 200:
tmp_file = self._tmp_dir.new_path()
stream_to_path(response, tmp_file, info.path.name)

View File

@@ -88,12 +88,12 @@ class Pferd(Location):
) -> Organizer:
# pylint: disable=too-many-locals
cookie_jar = CookieJar(to_path(cookies) if cookies else None)
session = cookie_jar.create_session()
client = cookie_jar.create_client()
tmp_dir = self._tmp_dir.new_subdir()
organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver)
crawler = IliasCrawler(base_url, session, authenticator, dir_filter)
downloader = IliasDownloader(tmp_dir, organizer, session,
crawler = IliasCrawler(base_url, client, authenticator, dir_filter)
downloader = IliasDownloader(tmp_dir, organizer, client,
authenticator, download_strategy, timeout)
cookie_jar.load_cookies()
@@ -149,11 +149,11 @@ class Pferd(Location):
password {Optional[str]} -- The SCC password. If none is given, it will prompt
the user. (default: {None})
download_strategy {DownloadStrategy} -- A function to determine which files need to
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
timeout {int} -- The download timeout for opencast videos.
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
with overwriting or deleting files. The default always asks the user.
"""
@@ -222,8 +222,7 @@ class Pferd(Location):
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
timeout {int} -- The download timeout for opencast videos.
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
with overwriting or deleting files. The default always asks the user.
"""
@@ -284,11 +283,11 @@ class Pferd(Location):
password {Optional[str]} -- The SCC password. If none is given, it will prompt
the user. (default: {None})
download_strategy {DownloadStrategy} -- A function to determine which files need to
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
timeout {int} -- The download timeout for opencast videos.
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
with overwriting or deleting files. The default always asks the user.
"""
@@ -338,7 +337,7 @@ class Pferd(Location):
transform {Transform} -- A transformation function for the output paths. Return None
to ignore a file. (default: {lambdax:x})
download_strategy {DivaDownloadStrategy} -- A function to determine which files need to
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {diva_download_new})
clean {bool} -- Whether to clean up when the method finishes.
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
@@ -396,7 +395,7 @@ class Pferd(Location):
transform {Transform} -- A transformation function for the output paths. Return None
to ignore a file. (default: {lambdax:x})
download_strategy {DivaDownloadStrategy} -- A function to determine which files need to
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {diva_download_new})
clean {bool} -- Whether to clean up when the method finishes.
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal

View File

@@ -6,7 +6,7 @@ from dataclasses import dataclass
from types import TracebackType
from typing import Optional, Type
import requests
import httpx
from rich.console import Console
from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID,
TextColumn, TimeRemainingColumn,
@@ -27,12 +27,12 @@ _progress: Progress = Progress(
)
def size_from_headers(response: requests.Response) -> Optional[int]:
def size_from_headers(response: httpx.Response) -> Optional[int]:
"""
Return the size of the download based on the response headers.
Arguments:
response {requests.Response} -- the response
response {httpx.Response} -- the response
Returns:
Optional[int] -- the size

View File

@@ -7,7 +7,7 @@ from pathlib import Path, PurePath
from typing import Optional, Tuple, Union
import bs4
import requests
import httpx
from .progress import ProgressSettings, progress_for, size_from_headers
@@ -35,41 +35,38 @@ def to_pattern(regex: Regex) -> re.Pattern:
return re.compile(regex)
def soupify(response: requests.Response) -> bs4.BeautifulSoup:
def soupify(response: httpx.Response) -> bs4.BeautifulSoup:
"""
Wrap a requests response in a bs4 object.
Wrap an httpx response in a bs4 object.
"""
return bs4.BeautifulSoup(response.text, "html.parser")
def stream_to_path(
response: requests.Response,
response: httpx.Response,
target: Path,
progress_name: Optional[str] = None,
chunk_size: int = 1024 ** 2
) -> None:
"""
Download a requests response content to a file by streaming it. This
function avoids excessive memory usage when downloading large files. The
chunk_size is in bytes.
Download an httpx response's content to a file by streaming it. This
function avoids excessive memory usage when downloading large files.
If progress_name is None, no progress bar will be shown. Otherwise a progress
bar will appear, if the download is bigger than an internal threshold.
"""
with response:
length = size_from_headers(response)
if progress_name and length and int(length) > 1024 * 1024 * 10: # 10 MiB
settings: Optional[ProgressSettings] = ProgressSettings(progress_name, length)
else:
settings = None
length = size_from_headers(response)
if progress_name and length and int(length) > 1024 * 1024 * 10: # 10 MiB
settings: Optional[ProgressSettings] = ProgressSettings(progress_name, length)
else:
settings = None
with open(target, 'wb') as file_descriptor:
with progress_for(settings) as progress:
for chunk in response.iter_content(chunk_size=chunk_size):
file_descriptor.write(chunk)
progress.advance(len(chunk))
with open(target, 'wb') as file_descriptor:
with progress_for(settings) as progress:
for chunk in response.iter_bytes():
file_descriptor.write(chunk)
progress.advance(len(chunk))
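
One subtlety in the rewrite above: at the httpx version pinned by this commit (0.17.1), iter_bytes() takes no arguments, which is why the chunk_size docstring sentence was dropped; the function's chunk_size parameter is left unused and could be removed as a follow-up. The new loop in isolation (URL and output path are placeholders):

    import httpx

    with httpx.Client() as client:
        with client.stream("GET", "https://example.com/big-file") as response:  # placeholder
            with open("/tmp/out.bin", "wb") as file_descriptor:  # placeholder path
                # iter_bytes() yields decoded chunks of library-chosen size;
                # len(chunk) still feeds the progress bar correctly.
                for chunk in response.iter_bytes():
                    file_descriptor.write(chunk)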
def prompt_yes_no(question: str, default: Optional[bool] = None) -> bool:

View File

@@ -1,4 +1,4 @@
requests>=2.21.0
httpx>=0.17.1
beautifulsoup4>=4.7.1
rich>=2.1.0
keyring>=21.5.0

View File

@@ -5,7 +5,7 @@ setup(
version="2.6.1",
packages=find_packages(),
install_requires=[
"requests>=2.21.0",
"httpx>=0.17.1",
"beautifulsoup4>=4.7.1",
"rich>=2.1.0",
"keyring>=21.5.0"

View File

@@ -86,7 +86,7 @@ def main() -> None:
args = parser.parse_args()
cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None)
session = cookie_jar.create_session()
client = cookie_jar.create_client()
if args.keyring:
if not args.username:
@@ -103,7 +103,7 @@ def main() -> None:
url = urlparse(args.url)
crawler = IliasCrawler(url.scheme + '://' + url.netloc, session,
crawler = IliasCrawler(url.scheme + '://' + url.netloc, client,
authenticator, lambda x, y: True)
cookie_jar.load_cookies()