Only retry on aiohttp errors in ILIAS crawler

This patch removes quite a few retries and now only retries the ILIAS
element handling method (_handle_ilias_element). Every other
HTTP-interacting method (except for the root requests) is called from
there and should be covered.

In the future we also want to retry the root requests a few times, but
that will be done after the download sink API is adjusted.
I-Al-Istannen 2021-05-19 22:01:09 +02:00
parent 8cfa818f04
commit e4f9560655

@@ -2,14 +2,14 @@ import asyncio
 import re
 from pathlib import PurePath
 # TODO In Python 3.9 and above, AsyncContextManager is deprecated
-from typing import Any, Dict, Optional, Set, Union
+from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
 
 import aiohttp
 from bs4 import BeautifulSoup, Tag
 
 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
-from PFERD.crawler import CrawlerSection, HttpCrawler, anoncritical, arepeat
+from PFERD.crawler import CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
 from PFERD.output_dir import Redownload
 from PFERD.utils import soupify, url_set_query_param
@@ -63,6 +63,29 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([
     IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
 ])
 
+AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])
+
+
+def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
+    def decorator(f: AWrapped) -> AWrapped:
+        async def wrapper(self: "HttpCrawler", *args: Any, **kwargs: Any) -> None:
+            for _ in range(attempts - 1):
+                try:
+                    await f(self, *args, **kwargs)
+                    return
+                except aiohttp.ContentTypeError:  # invalid content type
+                    raise CrawlWarning("ILIAS returned an invalid content type")
+                except aiohttp.TooManyRedirects:
+                    raise CrawlWarning("Got stuck in a redirect loop")
+                except aiohttp.ClientPayloadError:  # encoding or not enough bytes
+                    pass
+                except aiohttp.ClientConnectionError:  # e.g. timeout, disconnect, resolve failed, etc.
+                    pass
+            await f(self, *args, **kwargs)
+        return wrapper  # type: ignore
+    return decorator
+
+
 class KitIliasWebCrawler(HttpCrawler):
     def __init__(
@@ -106,7 +129,6 @@ class KitIliasWebCrawler(HttpCrawler):
     async def _crawl_desktop(self) -> None:
         await self._crawl_url(self._base_url)
 
-    @arepeat(3)
     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
         tasks = []
@@ -127,8 +149,6 @@ class KitIliasWebCrawler(HttpCrawler):
         await asyncio.gather(*tasks)
 
-    @arepeat(3)
-    @anoncritical
     async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
         # We might not want to crawl this directory-ish page.
         # This is not in #handle_element, as the download methods check it themselves and therefore
@@ -147,12 +167,10 @@ class KitIliasWebCrawler(HttpCrawler):
         await asyncio.gather(*tasks)
 
     @anoncritical
+    @_iorepeat(3)
     async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
         element_path = PurePath(parent_path, element.name)
 
-        if not self.should_crawl(element_path):
-            return
-
         if element.type == IliasElementType.FILE:
             await self._download_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
@@ -170,7 +188,6 @@ class KitIliasWebCrawler(HttpCrawler):
             # TODO: Proper exception
             raise RuntimeError(f"Unknown type: {element.type!r}")
 
-    @arepeat(3)
     async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
         if not dl:
@@ -191,7 +208,6 @@ class KitIliasWebCrawler(HttpCrawler):
             sink.file.write(content.encode("utf-8"))
             sink.done()
 
-    @arepeat(3)
     async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None:
         # Videos will NOT be redownloaded - their content doesn't really change and they are chunky
         dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER)
@@ -212,7 +228,6 @@ class KitIliasWebCrawler(HttpCrawler):
             sink.done()
 
-    @arepeat(3)
     async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
         if not dl:
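
Below is a minimal, self-contained sketch of the retry-wrapper pattern this
patch introduces, assuming simplified signatures. The names retry_on_io_errors,
FakeCrawler and flaky_fetch are made up for illustration only; the actual
decorator is _iorepeat above, which additionally converts content-type and
redirect errors into CrawlWarning instead of retrying them.

# Hypothetical, simplified sketch of the retry decorator pattern used above.
import asyncio
from typing import Any, Awaitable, Callable, TypeVar

import aiohttp

AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])


def retry_on_io_errors(attempts: int) -> Callable[[AWrapped], AWrapped]:
    """Retry a coroutine method on transient aiohttp connection errors."""
    def decorator(f: AWrapped) -> AWrapped:
        async def wrapper(self: Any, *args: Any, **kwargs: Any) -> None:
            # Every attempt except the last swallows the connection error.
            for _ in range(attempts - 1):
                try:
                    await f(self, *args, **kwargs)
                    return
                except aiohttp.ClientConnectionError:
                    pass  # e.g. timeout or disconnect; try again
            # Final attempt: let any exception propagate to the caller.
            await f(self, *args, **kwargs)
        return wrapper  # type: ignore
    return decorator


class FakeCrawler:
    """Stand-in for the real crawler, only used to demonstrate the decorator."""

    def __init__(self) -> None:
        self.calls = 0

    @retry_on_io_errors(3)
    async def flaky_fetch(self) -> None:
        self.calls += 1
        if self.calls < 3:
            raise aiohttp.ClientConnectionError(f"simulated disconnect #{self.calls}")
        print(f"succeeded on attempt {self.calls}")


asyncio.run(FakeCrawler().flaky_fetch())  # prints: succeeded on attempt 3

Retrying at this single level means a failed element is re-processed as a
whole, which is why the individual download methods no longer need their own
@arepeat wrappers.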