mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Document authentication in HTTP crawler and rename prepare_request
This commit is contained in:
parent
25e2abdb03
commit
33a81a5f5c
@ -333,7 +333,7 @@ class KitIliasWebCrawler(HttpCrawler):
|
|||||||
sink.done()
|
sink.done()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
auth_id = await self.prepare_request()
|
auth_id = await self._current_auth_id()
|
||||||
if await try_stream():
|
if await try_stream():
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -343,7 +343,7 @@ class KitIliasWebCrawler(HttpCrawler):
|
|||||||
raise CrawlError("File streaming failed after authenticate()")
|
raise CrawlError("File streaming failed after authenticate()")
|
||||||
|
|
||||||
async def _get_page(self, url: str) -> BeautifulSoup:
|
async def _get_page(self, url: str) -> BeautifulSoup:
|
||||||
auth_id = await self.prepare_request()
|
auth_id = await self._current_auth_id()
|
||||||
async with self.session.get(url) as request:
|
async with self.session.get(url) as request:
|
||||||
soup = soupify(await request.read())
|
soup = soupify(await request.read())
|
||||||
if self._is_logged_in(soup):
|
if self._is_logged_in(soup):
|
||||||
|
@ -28,20 +28,35 @@ class HttpCrawler(Crawler):
|
|||||||
self._authentication_lock = asyncio.Lock()
|
self._authentication_lock = asyncio.Lock()
|
||||||
self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
|
self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
|
||||||
|
|
||||||
async def prepare_request(self) -> int:
|
async def _current_auth_id(self) -> int:
|
||||||
|
"""
|
||||||
|
Returns the id for the current authentication, i.e. an identifier for the last
|
||||||
|
successful call to [authenticate].
|
||||||
|
|
||||||
|
This method must be called before any request that might authenticate is made, so the
|
||||||
|
HttpCrawler can properly track when [authenticate] can return early and when actual
|
||||||
|
authentication is necessary.
|
||||||
|
"""
|
||||||
# We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
|
# We acquire the lock here to ensure we wait for any concurrent authenticate to finish.
|
||||||
# This should reduce the amount of requests we make: If an authentication is in progress
|
# This should reduce the amount of requests we make: If an authentication is in progress
|
||||||
# all future requests wait for authentication to complete.
|
# all future requests wait for authentication to complete.
|
||||||
async with self._authentication_lock:
|
async with self._authentication_lock:
|
||||||
return self._authentication_id
|
return self._authentication_id
|
||||||
|
|
||||||
async def authenticate(self, current_id: int) -> None:
|
async def authenticate(self, caller_auth_id: int) -> None:
|
||||||
|
"""
|
||||||
|
Starts the authentication process. The main work is offloaded to _authenticate, which
|
||||||
|
you should override in a subclass if needed. This method should *NOT* be overridden.
|
||||||
|
|
||||||
|
The [caller_auth_id] should be the result of a [_current_auth_id] call made *before*
|
||||||
|
the request was made. This ensures that authentication is not performed needlessly.
|
||||||
|
"""
|
||||||
async with self._authentication_lock:
|
async with self._authentication_lock:
|
||||||
# Another thread successfully called authenticate in-between
|
# Another thread successfully called authenticate in-between
|
||||||
# We do not want to perform auth again, so we return here. We can
|
# We do not want to perform auth again, so we return here. We can
|
||||||
# assume the other thread succeeded as authenticate will throw an error
|
# assume the other thread succeeded as authenticate will throw an error
|
||||||
# if it failed and aborts the crawl process.
|
# if it failed and aborts the crawl process.
|
||||||
if current_id != self._authentication_id:
|
if caller_auth_id != self._authentication_id:
|
||||||
return
|
return
|
||||||
await self._authenticate()
|
await self._authenticate()
|
||||||
self._authentication_id += 1
|
self._authentication_id += 1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user