transition from requests to httpx

be7a
2021-04-23 18:02:57 +02:00
parent c1ab7485e2
commit 44aeb6c2eb
14 changed files with 80 additions and 89 deletions

View File

@@ -23,7 +23,7 @@ jobs:
python-version: '3.x'
- name: "Install dependencies"
run: "pip install setuptools keyring pyinstaller rich requests beautifulsoup4 -f --upgrade"
run: "pip install setuptools keyring pyinstaller rich httpx beautifulsoup4 -f --upgrade"
- name: "Install sync_url.py"
run: "pyinstaller sync_url.py -F"

View File

@@ -1,11 +1,11 @@
"""A helper for requests cookies."""
"""A helper for httpx cookies."""
import logging
from http.cookiejar import LoadError, LWPCookieJar
from pathlib import Path
from typing import Optional
import requests
import httpx
LOGGER = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ class CookieJar:
@property
def cookies(self) -> LWPCookieJar:
"""Return the requests cookie jar."""
"""Return the httpx cookie jar."""
return self._cookies
def load_cookies(self) -> None:
@@ -57,13 +57,11 @@ class CookieJar:
# TODO possibly catch a few more exceptions
self._cookies.save(ignore_discard=True)
def create_session(self) -> requests.Session:
"""Create a new session using the cookie jar."""
sess = requests.Session()
def create_client(self) -> httpx.Client:
"""Create a new client using the cookie jar."""
# TODO: timeout=None was the default behaviour of requests. An appropriate value should probably be set
client = httpx.Client(timeout=None)
# From the requests docs: "All requests code should work out of the box
# with externally provided instances of CookieJar, e.g. LWPCookieJar
# and FileCookieJar."
sess.cookies = self.cookies # type: ignore
client.cookies = self.cookies # type: ignore
return sess
return client
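
The TODO above deserves a concrete answer: requests had no default timeout, so timeout=None merely preserves the old behaviour, while httpx would otherwise default to 5 seconds. A minimal sketch of an explicit configuration, assuming the CookieJar wrapper from this file (the numeric values are illustrative, not project decisions):

    import httpx
    from http.cookiejar import LWPCookieJar

    def create_client(cookies: LWPCookieJar) -> httpx.Client:
        # httpx.Timeout can bound connect/read/write/pool phases separately;
        # these figures are placeholders, not values chosen by this commit.
        client = httpx.Client(timeout=httpx.Timeout(30.0, connect=5.0))
        # httpx, like requests, accepts any stdlib http.cookiejar.CookieJar.
        client.cookies = cookies  # type: ignore
        return client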

View File

@@ -7,7 +7,7 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, List, Optional
import requests
import httpx
from .errors import FatalException
from .logging import PrettyLogger
@@ -69,7 +69,7 @@ class DivaPlaylistCrawler:
)
base_name = match.group(1)
response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json")
response = httpx.get(cls._PLAYLIST_BASE_URL + base_name + ".json")
if response.status_code != 200:
raise FatalException(
@@ -88,7 +88,7 @@ class DivaPlaylistCrawler:
"""
Crawls the playlist given in the constructor.
"""
response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id})
response = httpx.get(self._COLLECTION_BASE_URL, params={"collection": self._id})
if response.status_code != 200:
raise FatalException(f"Server returned status {response.status_code}.")
@@ -143,7 +143,7 @@ class DivaDownloader:
self._tmp_dir = tmp_dir
self._organizer = organizer
self._strategy = strategy
self._session = requests.session()
self._client = httpx.Client()
def download_all(self, infos: List[DivaDownloadInfo]) -> None:
"""
@@ -160,7 +160,7 @@ class DivaDownloader:
self._organizer.mark(info.path)
return
with self._session.get(info.url, stream=True) as response:
with self._client.stream("GET", info.url) as response:
if response.status_code == 200:
tmp_file = self._tmp_dir.new_path()
stream_to_path(response, tmp_file, info.path.name)
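
The stream handling is the most invasive API change in this commit: requests streamed via a stream=True flag on get(), while httpx uses a dedicated stream() context manager that takes the HTTP method explicitly and only fetches the body while it is iterated. A hedged before/after sketch (the URL is a placeholder):

    import httpx

    url = "https://example.com/video.mp4"  # placeholder

    # Old requests idiom:
    #   with session.get(url, stream=True) as response:
    #       for chunk in response.iter_content(chunk_size=1024 ** 2):
    #           ...

    # New httpx idiom:
    with httpx.Client() as client:
        with client.stream("GET", url) as response:
            if response.status_code == 200:
                for chunk in response.iter_bytes():
                    pass  # write the chunk to a temporary file here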

View File

@@ -5,8 +5,8 @@ General downloaders useful in many situations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import requests
import requests.auth
import httpx
from .organizer import Organizer
from .tmp_dir import TmpDir
@@ -39,15 +39,15 @@ class HttpDownloader:
self._tmp_dir = tmp_dir
self._username = username
self._password = password
self._session = self._build_session()
self._client = self._build_client()
def _build_session(self) -> requests.Session:
session = requests.Session()
def _build_client(self) -> httpx.Client:
client = httpx.Client()
if self._username and self._password:
session.auth = requests.auth.HTTPBasicAuth(
client.auth = httpx.BasicAuth(
self._username, self._password
)
return session
return client
def download_all(self, infos: List[HttpDownloadInfo]) -> None:
"""
@@ -62,7 +62,7 @@ class HttpDownloader:
Download a single file.
"""
with self._session.get(info.url, params=info.parameters, stream=True) as response:
with self._client.stream("GET", info.url, params=info.parameters) as response:
if response.status_code == 200:
tmp_file = self._tmp_dir.new_path()
stream_to_path(response, tmp_file, info.path.name)
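
Note that httpx exports its auth helpers at the top level, so the basic-auth wiring needs no submodule import. A small sketch of the equivalent setup (credentials are placeholders):

    import httpx

    client = httpx.Client()
    # httpx.BasicAuth is the counterpart of requests.auth.HTTPBasicAuth;
    # assigning it to client.auth applies it to every request the client sends.
    client.auth = httpx.BasicAuth("user", "secret")  # placeholder credentials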

View File

@@ -7,7 +7,7 @@ import logging
from typing import Optional
import bs4
import requests
import httpx
from ..authenticators import TfaAuthenticator, UserPassAuthenticator
from ..utils import soupify
@@ -19,14 +19,14 @@ class IliasAuthenticator(abc.ABC):
# pylint: disable=too-few-public-methods
"""
An authenticator that logs an existing requests session into an ILIAS
An authenticator that logs an existing httpx client into an ILIAS
account.
"""
@abc.abstractmethod
def authenticate(self, sess: requests.Session) -> None:
def authenticate(self, client: httpx.Client) -> None:
"""
Log a requests session into this authenticator's ILIAS account.
Log an httpx client into this authenticator's ILIAS account.
"""
@@ -45,7 +45,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth")
def authenticate(self, sess: requests.Session) -> None:
def authenticate(self, sess: httpx.Client) -> None:
"""
Performs the ILIAS Shibboleth authentication dance and saves the login
cookies it receives.
@@ -109,7 +109,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
def _authenticate_tfa(
self,
session: requests.Session,
client: httpx.Client,
soup: bs4.BeautifulSoup
) -> bs4.BeautifulSoup:
# Searching the form here so that this fails before asking for
@@ -125,7 +125,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
"_eventId_proceed": "",
"j_tokenNumber": self._tfa_auth.get_token()
}
return soupify(session.post(url, data=data))
return soupify(client.post(url, data=data))
@staticmethod
def _login_successful(soup: bs4.BeautifulSoup) -> bool:

View File

@@ -13,7 +13,7 @@ from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
urlunsplit)
import bs4
import requests
import httpx
from ..errors import FatalException, retry_on_io_exception
from ..logging import PrettyLogger
@@ -96,7 +96,7 @@ class IliasCrawler:
def __init__(
self,
base_url: str,
session: requests.Session,
client: httpx.Client,
authenticator: IliasAuthenticator,
dir_filter: IliasDirectoryFilter
):
@@ -105,7 +105,7 @@ class IliasCrawler:
"""
self._base_url = base_url
self._session = session
self._client = client
self._authenticator = authenticator
self.dir_filter = dir_filter
@@ -157,9 +157,9 @@ class IliasCrawler:
return self._iterate_entries_to_download_infos(entries)
def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
response: requests.Response = self._session.get(root_url)
response: httpx.Response = self._client.get(root_url)
# We were redirected ==> Non-existent ID
if course_id not in response.url:
if course_id not in str(response.url):
return False
link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link")
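
The str() conversion above is required because httpx models response.url as an httpx.URL object rather than the plain string requests returned, so substring containment needs an explicit cast. In isolation:

    import httpx

    # httpx responses expose httpx.URL objects instead of plain strings,
    # so `course_id in response.url` would no longer work directly:
    url = httpx.URL("https://example.com/goto.php?target=crs_1234")  # placeholder
    assert "crs_1234" in str(url)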
@@ -564,7 +564,7 @@ class IliasCrawler:
# on the page, but defined in a JS object inside a script tag, passed to the player
# library.
# We do the impossible and RegEx the stream JSON object out of the page's HTML source
video_page_soup = soupify(self._session.get(play_url))
video_page_soup = soupify(self._client.get(play_url))
regex: re.Pattern = re.compile(
r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE
)
@@ -639,7 +639,7 @@ class IliasCrawler:
LOGGER.debug("Fetching %r", url)
response = self._session.get(url, params=params)
response = self._client.get(url, params=params)
content_type = response.headers["content-type"]
if not content_type.startswith("text/html"):
@@ -655,7 +655,7 @@ class IliasCrawler:
LOGGER.info("Not authenticated, changing that...")
self._authenticator.authenticate(self._session)
self._authenticator.authenticate(self._client)
return self._get_page(url, params, retry_count + 1)

View File

@@ -8,7 +8,7 @@ from pathlib import Path, PurePath
from typing import Callable, List, Optional, Union
import bs4
import requests
import httpx
from ..errors import retry_on_io_exception
from ..logging import PrettyLogger
@@ -82,21 +82,18 @@ class IliasDownloader:
self,
tmp_dir: TmpDir,
organizer: Organizer,
session: requests.Session,
client: httpx.Client,
authenticator: IliasAuthenticator,
strategy: IliasDownloadStrategy,
timeout: int = 5
):
"""
Create a new IliasDownloader.
The timeout applies to the download request only, as bwcloud uses IPv6
and requests has a problem with that: https://github.com/psf/requests/issues/5522
"""
self._tmp_dir = tmp_dir
self._organizer = organizer
self._session = session
self._client = client
self._authenticator = authenticator
self._strategy = strategy
self._timeout = timeout
@@ -128,7 +125,7 @@ class IliasDownloader:
def download_impl() -> bool:
if not self._try_download(info, tmp_file):
LOGGER.info("Re-Authenticating due to download failure: %r", info)
self._authenticator.authenticate(self._session)
self._authenticator.authenticate(self._client)
raise IOError("Scheduled retry")
else:
return True
@@ -153,7 +150,7 @@ class IliasDownloader:
PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
return True
with self._session.get(url, stream=True, timeout=self._timeout) as response:
with self._client.stream("GET", url, timeout=self._timeout) as response:
content_type = response.headers["content-type"]
has_content_disposition = "content-disposition" in response.headers
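
With the requests IPv6 issue (psf/requests#5522) out of the picture, the per-download timeout becomes a plain httpx feature rather than a workaround: a float passed per request overrides the client default, and httpx.Timeout allows per-phase bounds if that is ever needed. A sketch with illustrative values:

    import httpx

    client = httpx.Client()  # httpx's own default timeout is 5 seconds
    url = "https://example.com/download"  # placeholder

    # Per-request float override, mirroring the requests-era behaviour:
    with client.stream("GET", url, timeout=5.0) as response:
        pass

    # Or, hypothetically, bound each phase separately:
    granular = httpx.Timeout(connect=5.0, read=60.0, write=5.0, pool=5.0)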

View File

@@ -11,7 +11,7 @@ from typing import Callable, List, Optional
from urllib.parse import urljoin
import bs4
import requests
import httpx
from PFERD.errors import FatalException
from PFERD.utils import soupify
@@ -78,7 +78,7 @@ class IpdCrawler:
"""
Crawls the playlist given in the constructor.
"""
page = soupify(requests.get(self._base_url))
page = soupify(httpx.get(self._base_url))
items: List[IpdDownloadInfo] = []
@@ -116,7 +116,7 @@ class IpdDownloader:
self._tmp_dir = tmp_dir
self._organizer = organizer
self._strategy = strategy
self._session = requests.session()
self._client = httpx.Client()
def download_all(self, infos: List[IpdDownloadInfo]) -> None:
"""
@@ -133,7 +133,7 @@ class IpdDownloader:
self._organizer.mark(info.path)
return
with self._session.get(info.url, stream=True) as response:
with self._client.stream("GET", info.url) as response:
if response.status_code == 200:
tmp_file = self._tmp_dir.new_path()
stream_to_path(response, tmp_file, info.path.name)

View File

@@ -88,12 +88,12 @@ class Pferd(Location):
) -> Organizer:
# pylint: disable=too-many-locals
cookie_jar = CookieJar(to_path(cookies) if cookies else None)
session = cookie_jar.create_session()
client = cookie_jar.create_client()
tmp_dir = self._tmp_dir.new_subdir()
organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver)
crawler = IliasCrawler(base_url, session, authenticator, dir_filter)
downloader = IliasDownloader(tmp_dir, organizer, session,
crawler = IliasCrawler(base_url, client, authenticator, dir_filter)
downloader = IliasDownloader(tmp_dir, organizer, client,
authenticator, download_strategy, timeout)
cookie_jar.load_cookies()
@@ -149,11 +149,11 @@ class Pferd(Location):
password {Optional[str]} -- The SCC password. If none is given, it will prompt
the user. (default: {None})
download_strategy {DownloadStrategy} -- A function to determine which files need to
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
timeout {int} -- The download timeout for opencast videos.
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
with overwriting or deleting files. The default always asks the user.
"""
@@ -222,8 +222,7 @@ class Pferd(Location):
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
timeout {int} -- The download timeout for opencast videos.
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
with overwriting or deleting files. The default always asks the user.
"""
@@ -284,11 +283,11 @@ class Pferd(Location):
password {Optional[str]} -- The SCC password. If none is given, it will prompt
the user. (default: {None})
download_strategy {DownloadStrategy} -- A function to determine which files need to
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {download_modified_or_new})
clean {bool} -- Whether to clean up when the method finishes.
timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
requests bug.
timeout {int} -- The download timeout for opencast videos.
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
with overwriting or deleting files. The default always asks the user.
"""
@@ -338,7 +337,7 @@ class Pferd(Location):
transform {Transform} -- A transformation function for the output paths. Return None
to ignore a file. (default: {lambdax:x})
download_strategy {DivaDownloadStrategy} -- A function to determine which files need to
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {diva_download_new})
clean {bool} -- Whether to clean up when the method finishes.
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
@@ -396,7 +395,7 @@ class Pferd(Location):
transform {Transform} -- A transformation function for the output paths. Return None
to ignore a file. (default: {lambdax:x})
download_strategy {DivaDownloadStrategy} -- A function to determine which files need to
be downloaded. Can save bandwidth and reduce the number of requests.
(default: {diva_download_new})
clean {bool} -- Whether to clean up when the method finishes.
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal

View File

@@ -6,7 +6,7 @@ from dataclasses import dataclass
from types import TracebackType
from typing import Optional, Type
import requests
import httpx
from rich.console import Console
from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID,
TextColumn, TimeRemainingColumn,
@@ -27,12 +27,12 @@ _progress: Progress = Progress(
)
def size_from_headers(response: requests.Response) -> Optional[int]:
def size_from_headers(response: httpx.Response) -> Optional[int]:
"""
Return the size of the download based on the response headers.
Arguments:
response {requests.Response} -- the response
response {httpx.Response} -- the response
Returns:
Optional[int] -- the size

View File

@@ -7,7 +7,7 @@ from pathlib import Path, PurePath
from typing import Optional, Tuple, Union
import bs4
import requests
import httpx
from .progress import ProgressSettings, progress_for, size_from_headers
@@ -35,41 +35,38 @@ def to_pattern(regex: Regex) -> re.Pattern:
return re.compile(regex)
def soupify(response: requests.Response) -> bs4.BeautifulSoup:
def soupify(response: httpx.Response) -> bs4.BeautifulSoup:
"""
Wrap a requests response in a bs4 object.
Wrap an httpx response in a bs4 object.
"""
return bs4.BeautifulSoup(response.text, "html.parser")
def stream_to_path(
response: requests.Response,
response: httpx.Response,
target: Path,
progress_name: Optional[str] = None,
chunk_size: int = 1024 ** 2
) -> None:
"""
Download a requests response content to a file by streaming it. This
function avoids excessive memory usage when downloading large files. The
chunk_size is in bytes.
Download an httpx response's content to a file by streaming it. This
function avoids excessive memory usage when downloading large files.
If progress_name is None, no progress bar will be shown. Otherwise a progress
bar will appear, if the download is bigger than an internal threshold.
"""
with response:
length = size_from_headers(response)
if progress_name and length and int(length) > 1024 * 1024 * 10: # 10 MiB
settings: Optional[ProgressSettings] = ProgressSettings(progress_name, length)
else:
settings = None
length = size_from_headers(response)
if progress_name and length and int(length) > 1024 * 1024 * 10: # 10 MiB
settings: Optional[ProgressSettings] = ProgressSettings(progress_name, length)
else:
settings = None
with open(target, 'wb') as file_descriptor:
with progress_for(settings) as progress:
for chunk in response.iter_content(chunk_size=chunk_size):
file_descriptor.write(chunk)
progress.advance(len(chunk))
with open(target, 'wb') as file_descriptor:
with progress_for(settings) as progress:
for chunk in response.iter_bytes():
file_descriptor.write(chunk)
progress.advance(len(chunk))
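
One subtlety in the rewrite above: at the httpx version pinned by this commit (0.17.1), iter_bytes() takes no arguments, which is why the chunk_size docstring sentence was dropped; the function's chunk_size parameter is left unused and could be removed as a follow-up. The new loop in isolation (URL and output path are placeholders):

    import httpx

    with httpx.Client() as client:
        with client.stream("GET", "https://example.com/big-file") as response:  # placeholder
            with open("/tmp/out.bin", "wb") as file_descriptor:  # placeholder path
                # iter_bytes() yields decoded chunks of library-chosen size;
                # len(chunk) still feeds the progress bar correctly.
                for chunk in response.iter_bytes():
                    file_descriptor.write(chunk)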
def prompt_yes_no(question: str, default: Optional[bool] = None) -> bool:

View File

@@ -1,4 +1,4 @@
requests>=2.21.0
httpx>=0.17.1
beautifulsoup4>=4.7.1
rich>=2.1.0
keyring>=21.5.0

View File

@@ -5,7 +5,7 @@ setup(
version="2.6.1",
packages=find_packages(),
install_requires=[
"requests>=2.21.0",
"httpx>=0.17.1",
"beautifulsoup4>=4.7.1",
"rich>=2.1.0",
"keyring>=21.5.0"

View File

@@ -86,7 +86,7 @@ def main() -> None:
args = parser.parse_args()
cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None)
session = cookie_jar.create_session()
client = cookie_jar.create_client()
if args.keyring:
if not args.username:
@@ -103,7 +103,7 @@ def main() -> None:
url = urlparse(args.url)
crawler = IliasCrawler(url.scheme + '://' + url.netloc, session,
crawler = IliasCrawler(url.scheme + '://' + url.netloc, client,
authenticator, lambda x, y: True)
cookie_jar.load_cookies()