Switch http_crawler to requests

Joscha 2022-11-24 11:24:05 +01:00
parent 5c3942a13d
commit 5adfdfbd2b
2 changed files with 22 additions and 52 deletions

http_crawler.py

@@ -1,12 +1,9 @@
 import asyncio
-import http.cookies
-import ssl
+from http.cookiejar import LWPCookieJar
 from pathlib import Path, PurePath
-from typing import Any, Dict, List, Optional
+from typing import Dict, List, Optional
 
-import aiohttp
-import certifi
-from aiohttp.client import ClientTimeout
+import requests
 
 from ..auth import Authenticator
 from ..config import Config
@@ -35,9 +32,9 @@ class HttpCrawler(Crawler):
 
         self._authentication_id = 0
         self._authentication_lock = asyncio.Lock()
-        self._request_count = 0
-        self._http_timeout = section.http_timeout()
+        self._http_timeout = section.http_timeout()  # TODO Use or remove
 
+        self._cookie_jar = LWPCookieJar()
         self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
         self._shared_cookie_jar_paths: Optional[List[Path]] = None
         self._shared_auth = shared_auth
@@ -57,7 +54,6 @@ class HttpCrawler(Crawler):
         # This should reduce the amount of requests we make: If an authentication is in progress
         # all future requests wait for authentication to complete.
         async with self._authentication_lock:
-            self._request_count += 1
             return self._authentication_id
 
     async def authenticate(self, caller_auth_id: int) -> None:
@@ -106,32 +102,13 @@ class HttpCrawler(Crawler):
         self._shared_cookie_jar_paths.append(self._cookie_jar_path)
 
-    def _load_cookies_from_file(self, path: Path) -> None:
-        jar: Any = http.cookies.SimpleCookie()
-        with open(path, encoding="utf-8") as f:
-            for i, line in enumerate(f):
-                # Names of headers are case insensitive
-                if line[:11].lower() == "set-cookie:":
-                    jar.load(line[11:])
-                else:
-                    log.explain(f"Line {i} doesn't start with 'Set-Cookie:', ignoring it")
-        self._cookie_jar.update_cookies(jar)
-
-    def _save_cookies_to_file(self, path: Path) -> None:
-        jar: Any = http.cookies.SimpleCookie()
-        for morsel in self._cookie_jar:
-            jar[morsel.key] = morsel
-        with open(path, "w", encoding="utf-8") as f:
-            f.write(jar.output(sep="\n"))
-            f.write("\n")  # A trailing newline is just common courtesy
-
     def _load_cookies(self) -> None:
         log.explain_topic("Loading cookies")
         cookie_jar_path: Optional[Path] = None
 
         if self._shared_cookie_jar_paths is None:
-            log.explain("Not sharing any cookies")
+            log.explain("Not sharing cookies")
             cookie_jar_path = self._cookie_jar_path
         else:
             log.explain("Sharing cookies")
@@ -154,46 +131,38 @@ class HttpCrawler(Crawler):
         log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
         try:
-            self._load_cookies_from_file(cookie_jar_path)
+            self._cookie_jar.load(filename=str(cookie_jar_path))
         except Exception as e:
-            log.explain("Failed to load cookies")
-            log.explain(str(e))
+            log.explain(f"Failed to load cookies: {e}")
+            log.explain("Proceeding without cookies")
 
     def _save_cookies(self) -> None:
         log.explain_topic("Saving cookies")
         try:
             log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
-            self._save_cookies_to_file(self._cookie_jar_path)
+            self._cookie_jar.save(filename=str(self._cookie_jar_path))
         except Exception as e:
-            log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
-            log.warn(str(e))
+            log.warn(f"Failed to save cookies: {e}")
 
     async def run(self) -> None:
         self._request_count = 0
-        self._cookie_jar = aiohttp.CookieJar()
         self._load_cookies()
 
-        async with aiohttp.ClientSession(
-            headers={"User-Agent": f"{NAME}/{VERSION}"},
-            cookie_jar=self._cookie_jar,
-            connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())),
-            timeout=ClientTimeout(
-                # 30 minutes. No download in the history of downloads was longer than 30 minutes.
-                # This is enough to transfer a 600 MB file over a 3 Mib/s connection.
-                # Allowing an arbitrary value could be annoying for overnight batch jobs
-                total=15 * 60,
-                connect=self._http_timeout,
-                sock_connect=self._http_timeout,
-                sock_read=self._http_timeout,
-            )
-        ) as session:
-            self.session = session
+        self.session = requests.Session()
+        self.session.headers["User-Agent"] = f"{NAME}/{VERSION}"
+
+        # From the requests docs: "All requests code should work out of the box
+        # with externally provided instances of CookieJar, e.g. LWPCookieJar and
+        # FileCookieJar."
+        # https://requests.readthedocs.io/en/latest/api/#requests.cookies.RequestsCookieJar
+        self.session.cookies = self._cookie_jar  # type: ignore
+
+        with self.session:
             try:
                 await super().run()
             finally:
                 del self.session
-                log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
 
         # They are saved in authenticate, but a final save won't hurt
         self._save_cookies()
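
Aside on the cookie handling above: requests accepts any http.cookiejar.CookieJar at runtime, which is what lets this commit delete the hand-rolled Set-Cookie parsing in favor of the stdlib LWP file format. A minimal sketch of the load/save round-trip the new code relies on; the file name and URL are illustrative, not taken from this commit:

from http.cookiejar import LWPCookieJar

import requests

jar = LWPCookieJar()
try:
    # A missing or malformed file raises OSError or http.cookiejar.LoadError;
    # the crawler logs the error and proceeds without cookies.
    jar.load(filename="cookies.txt")
except Exception:
    pass

session = requests.Session()
# Any stdlib CookieJar works here at runtime; the type stubs expect a
# RequestsCookieJar, hence the "type: ignore" in the commit.
session.cookies = jar  # type: ignore

session.get("https://example.com")  # cookies set by the server land in jar
jar.save(filename="cookies.txt")  # persisted in LWP format

One caveat worth knowing: LWPCookieJar.save() skips session-only cookies unless ignore_discard=True is passed, so logins backed by session cookies may not survive a restart.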

setup.cfg

@@ -11,6 +11,7 @@ install_requires =
     rich>=11.0.0
     keyring>=23.5.0
     certifi>=2021.10.8
+    requests>=2.28.1
 
 [options.entry_points]
 console_scripts =
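
Dependency-wise the commit only adds a requests pin here. Back in http_crawler.py, the "# TODO Use or remove" next to self._http_timeout points at a real API gap: aiohttp applied a session-wide ClientTimeout, while requests only takes a per-request timeout argument. A sketch of how the kept value could still be applied, using a hypothetical helper subclass that is not part of this commit:

import requests

class TimeoutSession(requests.Session):
    """A Session that applies a default timeout to every request."""

    def __init__(self, timeout: float) -> None:
        super().__init__()
        self._timeout = timeout

    def request(self, method, url, **kwargs):  # type: ignore[override]
        # requests treats the (connect, read) timeouts per socket operation;
        # there is no equivalent of aiohttp's "total" download budget.
        kwargs.setdefault("timeout", (self._timeout, self._timeout))
        return super().request(method, url, **kwargs)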