Document authentication in HTTP crawler and rename prepare_request

This commit is contained in:
I-Al-Istannen 2021-05-23 11:55:34 +02:00
parent 25e2abdb03
commit 33a81a5f5c
2 changed files with 20 additions and 5 deletions

View File

@ -333,7 +333,7 @@ class KitIliasWebCrawler(HttpCrawler):
sink.done()
return True
auth_id = await self.prepare_request()
auth_id = await self._current_auth_id()
if await try_stream():
return
@ -343,7 +343,7 @@ class KitIliasWebCrawler(HttpCrawler):
raise CrawlError("File streaming failed after authenticate()")
async def _get_page(self, url: str) -> BeautifulSoup:
auth_id = await self.prepare_request()
auth_id = await self._current_auth_id()
async with self.session.get(url) as request:
soup = soupify(await request.read())
if self._is_logged_in(soup):

View File

@ -28,20 +28,35 @@ class HttpCrawler(Crawler):
self._authentication_lock = asyncio.Lock()
self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
async def prepare_request(self) -> int:
async def _current_auth_id(self) -> int:
"""
Returns the id for the current authentication, i.e. an identifier for the last
successful call to [authenticate].
This method must be called before making any request that might require authentication,
so the HttpCrawler can properly track when [authenticate] can return early and when
actual authentication is necessary.
"""
# We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
# This should reduce the number of requests we make: if an authentication is in progress,
# all future requests wait for it to complete.
async with self._authentication_lock:
return self._authentication_id
async def authenticate(self, current_id: int) -> None:
async def authenticate(self, caller_auth_id: int) -> None:
"""
Starts the authentication process. The main work is offloaded to _authenticate, which
you should override in a subclass if needed. This method should *NOT* be overridden.
The [caller_auth_id] should be the result of a [_current_auth_id] call made *before*
the request was made. This ensures that authentication is not performed needlessly.
"""
async with self._authentication_lock:
# Another task successfully called authenticate in the meantime.
# We do not want to perform auth again, so we return here. We can
# assume the other task succeeded, as authenticate raises an error
# if it fails, which aborts the crawl process.
if current_id != self._authentication_id:
if caller_auth_id != self._authentication_id:
return
await self._authenticate()
self._authentication_id += 1