mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-10-31 12:52:43 +01:00 
			
		
		
		
	Document authentication in HTTP crawler and rename prepare_request
This commit is contained in:
		| @@ -333,7 +333,7 @@ class KitIliasWebCrawler(HttpCrawler): | ||||
|                 sink.done() | ||||
|             return True | ||||
|  | ||||
|         auth_id = await self.prepare_request() | ||||
|         auth_id = await self._current_auth_id() | ||||
|         if await try_stream(): | ||||
|             return | ||||
|  | ||||
| @@ -343,7 +343,7 @@ class KitIliasWebCrawler(HttpCrawler): | ||||
|             raise CrawlError("File streaming failed after authenticate()") | ||||
|  | ||||
|     async def _get_page(self, url: str) -> BeautifulSoup: | ||||
|         auth_id = await self.prepare_request() | ||||
|         auth_id = await self._current_auth_id() | ||||
|         async with self.session.get(url) as request: | ||||
|             soup = soupify(await request.read()) | ||||
|             if self._is_logged_in(soup): | ||||
|   | ||||
| @@ -28,20 +28,35 @@ class HttpCrawler(Crawler): | ||||
|         self._authentication_lock = asyncio.Lock() | ||||
|         self._current_cookie_jar: Optional[aiohttp.CookieJar] = None | ||||
|  | ||||
|     async def prepare_request(self) -> int: | ||||
|     async def _current_auth_id(self) -> int: | ||||
|         """ | ||||
|         Returns the id for the current authentication, i.e. an identifier for the last | ||||
|         successful call to [authenticate]. | ||||
|  | ||||
|         This method must be called before any request that might authenticate is made, so the | ||||
|         HttpCrawler can properly track when [authenticate] can return early and when actual | ||||
|         authentication is necessary. | ||||
|         """ | ||||
|         # We acquire the lock here to ensure we wait for any concurrent authenticate to finish. | ||||
|         # This should reduce the amount of requests we make: If an authentication is in progress | ||||
|         # all future requests wait for authentication to complete. | ||||
|         async with self._authentication_lock: | ||||
|             return self._authentication_id | ||||
|  | ||||
|     async def authenticate(self, current_id: int) -> None: | ||||
|     async def authenticate(self, caller_auth_id: int) -> None: | ||||
|         """ | ||||
|         Starts the authentication process. The main work is offloaded to _authenticate, which | ||||
|         you should overwrite in a subclass if needed. This method should *NOT* be overwritten. | ||||
|  | ||||
|         The [caller_auth_id] should be the result of a [_current_auth_id] call made *before* | ||||
|         the request was made. This ensures that authentication is not performed needlessly. | ||||
|         """ | ||||
|         async with self._authentication_lock: | ||||
|             # Another thread successfully called authenticate in-between | ||||
|             # We do not want to perform auth again, so we return here. We can | ||||
|             # assume the other thread suceeded as authenticate will throw an error | ||||
|             # if it failed and aborts the crawl process. | ||||
|             if current_id != self._authentication_id: | ||||
|             if caller_auth_id != self._authentication_id: | ||||
|                 return | ||||
|             await self._authenticate() | ||||
|             self._authentication_id += 1 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 I-Al-Istannen
					I-Al-Istannen