From 44aeb6c2eb47f3601f83a42d2a7b6fe46e7aa664 Mon Sep 17 00:00:00 2001
From: be7a
Date: Fri, 23 Apr 2021 18:02:57 +0200
Subject: [PATCH] transition from requests to httpx

---
 .github/workflows/package.yml |  2 +-
 PFERD/cookie_jar.py           | 20 +++++++++-----------
 PFERD/diva.py                 | 10 +++++-----
 PFERD/downloaders.py          | 16 ++++++++--------
 PFERD/ilias/authenticators.py | 14 +++++++-------
 PFERD/ilias/crawler.py        | 16 ++++++++--------
 PFERD/ilias/downloader.py     | 13 +++++--------
 PFERD/ipd.py                  |  8 ++++----
 PFERD/pferd.py                | 21 ++++++++++-----------
 PFERD/progress.py             |  6 +++---
 PFERD/utils.py                | 35 ++++++++++++++++-------------------
 requirements.txt              |  2 +-
 setup.py                      |  2 +-
 sync_url.py                   |  4 ++--
 14 files changed, 80 insertions(+), 89 deletions(-)

diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml
index 615917b..6564894 100644
--- a/.github/workflows/package.yml
+++ b/.github/workflows/package.yml
@@ -23,7 +23,7 @@ jobs:
           python-version: '3.x'
 
       - name: "Install dependencies"
-        run: "pip install setuptools keyring pyinstaller rich requests beautifulsoup4 -f --upgrade"
+        run: "pip install setuptools keyring pyinstaller rich httpx beautifulsoup4 -f --upgrade"
 
       - name: "Install sync_url.py"
         run: "pyinstaller sync_url.py -F"
diff --git a/PFERD/cookie_jar.py b/PFERD/cookie_jar.py
index e5b568f..754979c 100644
--- a/PFERD/cookie_jar.py
+++ b/PFERD/cookie_jar.py
@@ -1,11 +1,11 @@
-"""A helper for requests cookies."""
+"""A helper for httpx cookies."""
 
 import logging
 from http.cookiejar import LoadError, LWPCookieJar
 from pathlib import Path
 from typing import Optional
 
-import requests
+import httpx
 
 LOGGER = logging.getLogger(__name__)
 
@@ -26,7 +26,7 @@ class CookieJar:
 
     @property
     def cookies(self) -> LWPCookieJar:
-        """Return the requests cookie jar."""
+        """Return the httpx cookie jar."""
         return self._cookies
 
     def load_cookies(self) -> None:
@@ -57,13 +57,11 @@ class CookieJar:
         # TODO possibly catch a few more exceptions
         self._cookies.save(ignore_discard=True)
 
-    def create_session(self) -> requests.Session:
-        """Create a new session using the cookie jar."""
-        sess = requests.Session()
+    def create_client(self) -> httpx.Client:
+        """Create a new client using the cookie jar."""
+        # TODO: timeout=None was the default behaviour of requests. An appropriate value should probably be set
+        client = httpx.Client(timeout=None)
 
-        # From the request docs: "All requests code should work out of the box
-        # with externally provided instances of CookieJar, e.g. LWPCookieJar
-        # and FileCookieJar."
-        sess.cookies = self.cookies  # type: ignore
+        client.cookies = self.cookies  # type: ignore
 
-        return sess
+        return client
diff --git a/PFERD/diva.py b/PFERD/diva.py
index 148fa56..a6bdba0 100644
--- a/PFERD/diva.py
+++ b/PFERD/diva.py
@@ -7,7 +7,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Callable, List, Optional
 
-import requests
+import httpx
 
 from .errors import FatalException
 from .logging import PrettyLogger
@@ -69,7 +69,7 @@ class DivaPlaylistCrawler:
             )
         base_name = match.group(1)
 
-        response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json")
+        response = httpx.get(cls._PLAYLIST_BASE_URL + base_name + ".json")
 
         if response.status_code != 200:
             raise FatalException(
@@ -88,7 +88,7 @@ class DivaPlaylistCrawler:
         """
         Crawls the playlist given in the constructor.
""" - response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id}) + response = httpx.get(self._COLLECTION_BASE_URL, params={"collection": self._id}) if response.status_code != 200: raise FatalException(f"Server returned status {response.status_code}.") @@ -143,7 +143,7 @@ class DivaDownloader: self._tmp_dir = tmp_dir self._organizer = organizer self._strategy = strategy - self._session = requests.session() + self._client = httpx.Client() def download_all(self, infos: List[DivaDownloadInfo]) -> None: """ @@ -160,7 +160,7 @@ class DivaDownloader: self._organizer.mark(info.path) return - with self._session.get(info.url, stream=True) as response: + with self._client.stream("GET", info.url) as response: if response.status_code == 200: tmp_file = self._tmp_dir.new_path() stream_to_path(response, tmp_file, info.path.name) diff --git a/PFERD/downloaders.py b/PFERD/downloaders.py index 94b8b9f..f004450 100644 --- a/PFERD/downloaders.py +++ b/PFERD/downloaders.py @@ -5,8 +5,8 @@ General downloaders useful in many situations from dataclasses import dataclass, field from typing import Any, Dict, List, Optional -import requests -import requests.auth +import httpx +import httpx.auth from .organizer import Organizer from .tmp_dir import TmpDir @@ -39,15 +39,15 @@ class HttpDownloader: self._tmp_dir = tmp_dir self._username = username self._password = password - self._session = self._build_session() + self._client = self._build_client() - def _build_session(self) -> requests.Session: - session = requests.Session() + def _build_client(self) -> httpx.Client: + client = httpx.Client() if self._username and self._password: - session.auth = requests.auth.HTTPBasicAuth( + client.auth = httpx.auth.HTTPBasicAuth( self._username, self._password ) - return session + return client def download_all(self, infos: List[HttpDownloadInfo]) -> None: """ @@ -62,7 +62,7 @@ class HttpDownloader: Download a single file. """ - with self._session.get(info.url, params=info.parameters, stream=True) as response: + with self._client.stream("GET", info.url, params=info.parameters) as response: if response.status_code == 200: tmp_file = self._tmp_dir.new_path() stream_to_path(response, tmp_file, info.path.name) diff --git a/PFERD/ilias/authenticators.py b/PFERD/ilias/authenticators.py index 4b99dd8..c1f4087 100644 --- a/PFERD/ilias/authenticators.py +++ b/PFERD/ilias/authenticators.py @@ -7,7 +7,7 @@ import logging from typing import Optional import bs4 -import requests +import httpx from ..authenticators import TfaAuthenticator, UserPassAuthenticator from ..utils import soupify @@ -19,14 +19,14 @@ class IliasAuthenticator(abc.ABC): # pylint: disable=too-few-public-methods """ - An authenticator that logs an existing requests session into an ILIAS + An authenticator that logs an existing httpx client into an ILIAS account. """ @abc.abstractmethod - def authenticate(self, sess: requests.Session) -> None: + def authenticate(self, client: httpx.Client) -> None: """ - Log a requests session into this authenticator's ILIAS account. + Log a httpx client into this authenticator's ILIAS account. """ @@ -45,7 +45,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator): self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth") - def authenticate(self, sess: requests.Session) -> None: + def authenticate(self, sess: httpx.Client) -> None: """ Performs the ILIAS Shibboleth authentication dance and saves the login cookies it receieves. 
@@ -109,7 +109,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
 
     def _authenticate_tfa(
             self,
-            session: requests.Session,
+            client: httpx.Client,
             soup: bs4.BeautifulSoup
     ) -> bs4.BeautifulSoup:
         # Searching the form here so that this fails before asking for
@@ -125,7 +125,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
             "_eventId_proceed": "",
             "j_tokenNumber": self._tfa_auth.get_token()
         }
-        return soupify(session.post(url, data=data))
+        return soupify(client.post(url, data=data))
 
     @staticmethod
     def _login_successful(soup: bs4.BeautifulSoup) -> bool:
diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py
index edab284..9726a4f 100644
--- a/PFERD/ilias/crawler.py
+++ b/PFERD/ilias/crawler.py
@@ -13,7 +13,7 @@ from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
                           urlunsplit)
 
 import bs4
-import requests
+import httpx
 
 from ..errors import FatalException, retry_on_io_exception
 from ..logging import PrettyLogger
@@ -96,7 +96,7 @@ class IliasCrawler:
     def __init__(
             self,
             base_url: str,
-            session: requests.Session,
+            client: httpx.Client,
             authenticator: IliasAuthenticator,
             dir_filter: IliasDirectoryFilter
     ):
@@ -105,7 +105,7 @@ class IliasCrawler:
         """
 
         self._base_url = base_url
-        self._session = session
+        self._client = client
         self._authenticator = authenticator
         self.dir_filter = dir_filter
 
@@ -157,9 +157,9 @@ class IliasCrawler:
         return self._iterate_entries_to_download_infos(entries)
 
     def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
-        response: requests.Response = self._session.get(root_url)
+        response: httpx.Response = self._client.get(root_url)
         # We were redirected ==> Non-existent ID
-        if course_id not in response.url:
+        if course_id not in str(response.url):
             return False
 
         link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link")
@@ -564,7 +564,7 @@ class IliasCrawler:
         # on the page, but defined in a JS object inside a script tag, passed to the player
         # library.
         # We do the impossible and RegEx the stream JSON object out of the page's HTML source
-        video_page_soup = soupify(self._session.get(play_url))
+        video_page_soup = soupify(self._client.get(play_url))
         regex: re.Pattern = re.compile(
             r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE
         )
@@ -639,7 +639,7 @@ class IliasCrawler:
 
         LOGGER.debug("Fetching %r", url)
 
-        response = self._session.get(url, params=params)
+        response = self._client.get(url, params=params)
         content_type = response.headers["content-type"]
 
         if not content_type.startswith("text/html"):
@@ -655,7 +655,7 @@ class IliasCrawler:
 
         LOGGER.info("Not authenticated, changing that...")
 
-        self._authenticator.authenticate(self._session)
+        self._authenticator.authenticate(self._client)
 
         return self._get_page(url, params, retry_count + 1)
diff --git a/PFERD/ilias/downloader.py b/PFERD/ilias/downloader.py
index f6132bf..005dfec 100644
--- a/PFERD/ilias/downloader.py
+++ b/PFERD/ilias/downloader.py
@@ -8,7 +8,7 @@ from pathlib import Path, PurePath
 from typing import Callable, List, Optional, Union
 
 import bs4
-import requests
+import httpx
 
 from ..errors import retry_on_io_exception
 from ..logging import PrettyLogger
@@ -82,21 +82,18 @@ class IliasDownloader:
             self,
             tmp_dir: TmpDir,
             organizer: Organizer,
-            session: requests.Session,
+            client: httpx.Client,
             authenticator: IliasAuthenticator,
             strategy: IliasDownloadStrategy,
             timeout: int = 5
     ):
         """
         Create a new IliasDownloader.
-
-        The timeout applies to the download request only, as bwcloud uses IPv6
-        and requests has a problem with that: https://github.com/psf/requests/issues/5522
         """
 
         self._tmp_dir = tmp_dir
         self._organizer = organizer
-        self._session = session
+        self._client = client
         self._authenticator = authenticator
         self._strategy = strategy
         self._timeout = timeout
@@ -128,7 +125,7 @@ class IliasDownloader:
         def download_impl() -> bool:
             if not self._try_download(info, tmp_file):
                 LOGGER.info("Re-Authenticating due to download failure: %r", info)
-                self._authenticator.authenticate(self._session)
+                self._authenticator.authenticate(self._client)
                 raise IOError("Scheduled retry")
             else:
                 return True
@@ -153,7 +150,7 @@ class IliasDownloader:
             PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
             return True
 
-        with self._session.get(url, stream=True, timeout=self._timeout) as response:
+        with self._client.stream("GET", url, timeout=self._timeout) as response:
             content_type = response.headers["content-type"]
             has_content_disposition = "content-disposition" in response.headers
diff --git a/PFERD/ipd.py b/PFERD/ipd.py
index ece6a97..336b21c 100644
--- a/PFERD/ipd.py
+++ b/PFERD/ipd.py
@@ -11,7 +11,7 @@ from typing import Callable, List, Optional
 from urllib.parse import urljoin
 
 import bs4
-import requests
+import httpx
 
 from PFERD.errors import FatalException
 from PFERD.utils import soupify
@@ -78,7 +78,7 @@ class IpdCrawler:
         """
         Crawls the playlist given in the constructor.
         """
-        page = soupify(requests.get(self._base_url))
+        page = soupify(httpx.get(self._base_url))
 
         items: List[IpdDownloadInfo] = []
 
@@ -116,7 +116,7 @@ class IpdDownloader:
         self._tmp_dir = tmp_dir
         self._organizer = organizer
         self._strategy = strategy
-        self._session = requests.session()
+        self._client = httpx.Client()
 
     def download_all(self, infos: List[IpdDownloadInfo]) -> None:
         """
@@ -133,7 +133,7 @@ class IpdDownloader:
             self._organizer.mark(info.path)
             return
 
-        with self._session.get(info.url, stream=True) as response:
+        with self._client.stream("GET", info.url) as response:
             if response.status_code == 200:
                 tmp_file = self._tmp_dir.new_path()
                 stream_to_path(response, tmp_file, info.path.name)
diff --git a/PFERD/pferd.py b/PFERD/pferd.py
index 1bb6f78..3efe8f2 100644
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -88,12 +88,12 @@ class Pferd(Location):
     ) -> Organizer:
         # pylint: disable=too-many-locals
         cookie_jar = CookieJar(to_path(cookies) if cookies else None)
-        session = cookie_jar.create_session()
+        client = cookie_jar.create_client()
         tmp_dir = self._tmp_dir.new_subdir()
         organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver)
 
-        crawler = IliasCrawler(base_url, session, authenticator, dir_filter)
-        downloader = IliasDownloader(tmp_dir, organizer, session,
+        crawler = IliasCrawler(base_url, client, authenticator, dir_filter)
+        downloader = IliasDownloader(tmp_dir, organizer, client,
                                      authenticator, download_strategy, timeout)
 
         cookie_jar.load_cookies()
@@ -149,11 +149,10 @@ class Pferd(Location):
             password {Optional[str]} -- The SCC password. If none is given, it will prompt
                 the user. (default: {None})
             download_strategy {DownloadStrategy} -- A function to determine which files need to
-                be downloaded. Can save bandwidth and reduce the number of requests.
+                be downloaded. Can save bandwidth and reduce the number of HTTP requests.
                 (default: {download_modified_or_new})
             clean {bool} -- Whether to clean up when the method finishes.
-            timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
-                         requests bug.
+            timeout {int} -- The download timeout for opencast videos.
             file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
                 with overwriting or deleting files. The default always asks the user.
         """
@@ -222,8 +222,7 @@ class Pferd(Location):
                 be downloaded. Can save bandwidth and reduce the number of requests.
                 (default: {download_modified_or_new})
             clean {bool} -- Whether to clean up when the method finishes.
-            timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
-                         requests bug.
+            timeout {int} -- The download timeout for opencast videos.
             file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
                 with overwriting or deleting files. The default always asks the user.
         """
@@ -284,11 +283,10 @@ class Pferd(Location):
             password {Optional[str]} -- The SCC password. If none is given, it will prompt
                 the user. (default: {None})
             download_strategy {DownloadStrategy} -- A function to determine which files need to
-                be downloaded. Can save bandwidth and reduce the number of requests.
+                be downloaded. Can save bandwidth and reduce the number of HTTP requests.
                 (default: {download_modified_or_new})
             clean {bool} -- Whether to clean up when the method finishes.
-            timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
-                         requests bug.
+            timeout {int} -- The download timeout for opencast videos.
             file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
                 with overwriting or deleting files. The default always asks the user.
         """
@@ -338,7 +337,7 @@ class Pferd(Location):
             transform {Transform} -- A transformation function for the output paths. Return None
                 to ignore a file. (default: {lambdax:x})
             download_strategy {DivaDownloadStrategy} -- A function to determine which files need to
-                be downloaded. Can save bandwidth and reduce the number of requests.
+                be downloaded. Can save bandwidth and reduce the number of HTTP requests.
                 (default: {diva_download_new})
             clean {bool} -- Whether to clean up when the method finishes.
             file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
@@ -396,7 +395,7 @@ class Pferd(Location):
             transform {Transform} -- A transformation function for the output paths. Return None
                 to ignore a file. (default: {lambdax:x})
             download_strategy {DivaDownloadStrategy} -- A function to determine which files need to
-                be downloaded. Can save bandwidth and reduce the number of requests.
+                be downloaded. Can save bandwidth and reduce the number of HTTP requests.
                 (default: {diva_download_new})
             clean {bool} -- Whether to clean up when the method finishes.
             file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
diff --git a/PFERD/progress.py b/PFERD/progress.py
index 6ad098f..06cc378 100644
--- a/PFERD/progress.py
+++ b/PFERD/progress.py
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from types import TracebackType
 from typing import Optional, Type
 
-import requests
+import httpx
 from rich.console import Console
 from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID,
                            TextColumn, TimeRemainingColumn,
@@ -27,12 +27,12 @@ _progress: Progress = Progress(
 )
 
 
-def size_from_headers(response: requests.Response) -> Optional[int]:
+def size_from_headers(response: httpx.Response) -> Optional[int]:
     """
     Return the size of the download based on the response headers.
 
     Arguments:
-        response {requests.Response} -- the response
+        response {httpx.Response} -- the response
 
     Returns:
         Optional[int] -- the size
diff --git a/PFERD/utils.py b/PFERD/utils.py
index 56c101a..9b841d0 100644
--- a/PFERD/utils.py
+++ b/PFERD/utils.py
@@ -7,7 +7,7 @@ from pathlib import Path, PurePath
 from typing import Optional, Tuple, Union
 
 import bs4
-import requests
+import httpx
 
 from .progress import ProgressSettings, progress_for, size_from_headers
 
@@ -35,41 +35,38 @@ def to_pattern(regex: Regex) -> re.Pattern:
     return re.compile(regex)
 
 
-def soupify(response: requests.Response) -> bs4.BeautifulSoup:
+def soupify(response: httpx.Response) -> bs4.BeautifulSoup:
     """
-    Wrap a requests response in a bs4 object.
+    Wrap an httpx response in a bs4 object.
     """
 
     return bs4.BeautifulSoup(response.text, "html.parser")
 
 
 def stream_to_path(
-        response: requests.Response,
+        response: httpx.Response,
         target: Path,
         progress_name: Optional[str] = None,
-        chunk_size: int = 1024 ** 2
 ) -> None:
     """
-    Download a requests response content to a file by streaming it. This
-    function avoids excessive memory usage when downloading large files. The
-    chunk_size is in bytes.
+    Download an httpx response's content to a file by streaming it. This
+    function avoids excessive memory usage when downloading large files.
 
     If progress_name is None, no progress bar will be shown. Otherwise a
     progress bar will appear, if the download is bigger than an internal
     threshold.
     """
 
-    with response:
-        length = size_from_headers(response)
-        if progress_name and length and int(length) > 1024 * 1024 * 10:  # 10 MiB
-            settings: Optional[ProgressSettings] = ProgressSettings(progress_name, length)
-        else:
-            settings = None
+    length = size_from_headers(response)
+    if progress_name and length and int(length) > 1024 * 1024 * 10:  # 10 MiB
+        settings: Optional[ProgressSettings] = ProgressSettings(progress_name, length)
+    else:
+        settings = None
 
-        with open(target, 'wb') as file_descriptor:
-            with progress_for(settings) as progress:
-                for chunk in response.iter_content(chunk_size=chunk_size):
-                    file_descriptor.write(chunk)
-                    progress.advance(len(chunk))
+    with open(target, 'wb') as file_descriptor:
+        with progress_for(settings) as progress:
+            for chunk in response.iter_bytes():
+                file_descriptor.write(chunk)
+                progress.advance(len(chunk))
 
 
 def prompt_yes_no(question: str, default: Optional[bool] = None) -> bool:
diff --git a/requirements.txt b/requirements.txt
index 2d852e1..0b805f8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-requests>=2.21.0
+httpx>=0.17.1
 beautifulsoup4>=4.7.1
 rich>=2.1.0
 keyring>=21.5.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index a4dfab3..322f2a9 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ setup(
     version="2.6.1",
     packages=find_packages(),
     install_requires=[
-        "requests>=2.21.0",
+        "httpx>=0.17.1",
         "beautifulsoup4>=4.7.1",
         "rich>=2.1.0",
         "keyring>=21.5.0"
diff --git a/sync_url.py b/sync_url.py
index ca78de0..2ccbc95 100755
--- a/sync_url.py
+++ b/sync_url.py
@@ -86,7 +86,7 @@ def main() -> None:
     args = parser.parse_args()
 
     cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None)
-    session = cookie_jar.create_session()
+    client = cookie_jar.create_client()
 
     if args.keyring:
         if not args.username:
@@ -103,7 +103,7 @@ def main() -> None:
 
     url = urlparse(args.url)
 
-    crawler = IliasCrawler(url.scheme + '://' + url.netloc, session,
+    crawler = IliasCrawler(url.scheme + '://' + url.netloc, client,
                            authenticator, lambda x, y: True)
 
     cookie_jar.load_cookies()
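
Notes on the requests -> httpx mapping (reviewer addendum; the snippets below
are illustrative sketches, not code from this patch, and use placeholder URLs
and credentials):

The pattern applied throughout is that requests' session.get(url, stream=True)
becomes client.stream("GET", url), iter_content() becomes iter_bytes(), and
requests.auth.HTTPBasicAuth becomes httpx.BasicAuth. A minimal sketch of the
new download pattern:

    import httpx

    client = httpx.Client(
        auth=httpx.BasicAuth("user", "secret"),  # replaces requests.auth.HTTPBasicAuth
        timeout=None,  # mirrors requests' behaviour of never timing out
    )

    # client.stream() returns a context manager; the body is only read on demand.
    with client.stream("GET", "https://example.com/file.bin") as response:
        if response.status_code == 200:
            with open("file.bin", "wb") as file_descriptor:
                # iter_bytes() lets httpx pick the chunk size, which is why
                # stream_to_path() no longer takes a chunk_size parameter.
                for chunk in response.iter_bytes():
                    file_descriptor.write(chunk)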
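
Regarding the TODO in cookie_jar.py: httpx applies a 5-second default timeout
where requests had none, hence the explicit timeout=None. If a finite value is
picked later, httpx.Timeout can configure the connect/read/write/pool phases
separately; the values below are placeholders, not a recommendation:

    import httpx

    # 5 seconds overall, but allow 10 seconds for establishing a connection.
    timeout = httpx.Timeout(5.0, connect=10.0)
    client = httpx.Client(timeout=timeout)

    # A single call can still override the client-wide default.
    response = client.get("https://example.com", timeout=30.0)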
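
The bare assignment client.cookies = self.cookies in cookie_jar.py assumes
that httpx, like requests, keeps a reference to an externally provided stdlib
CookieJar rather than copying it (this appears to hold as of httpx 0.17), so
cookies received while crawling can later be saved from the same jar:

    from http.cookiejar import LWPCookieJar

    import httpx

    jar = LWPCookieJar("cookies.txt")
    client = httpx.Client()
    client.cookies = jar  # type: ignore

    client.get("https://example.com")  # any Set-Cookie headers land in jar
    jar.save(ignore_discard=True)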