Add explain messages to the ILIAS crawler and use crawler exceptions

I-Al-Istannen 2021-05-20 14:58:54 +02:00
parent e4f9560655
commit 83d12fcf2d


@@ -6,10 +6,12 @@ from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
 
 import aiohttp
 from bs4 import BeautifulSoup, Tag
+from rich.markup import escape
 
 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
-from PFERD.crawler import CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
+from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
+from PFERD.logging import log
 from PFERD.output_dir import Redownload
 from PFERD.utils import soupify, url_set_query_param
@@ -66,10 +68,11 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([
 AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])
 
 
-def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
+def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
     def decorator(f: AWrapped) -> AWrapped:
         async def wrapper(self: "HttpCrawler", *args: Any, **kwargs: Any) -> None:
-            for _ in range(attempts - 1):
+            last_exception: Optional[BaseException] = None
+            for round in range(attempts):
                 try:
                     await f(self, *args, **kwargs)
                     return
@@ -77,12 +80,17 @@ def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
                     raise CrawlWarning("ILIAS returned an invalid content type")
                 except aiohttp.TooManyRedirects:
                     raise CrawlWarning("Got stuck in a redirect loop")
-                except aiohttp.ClientPayloadError:  # encoding or not enough bytes
-                    pass
-                except aiohttp.ClientConnectionError:  # e.g. timeout, disconnect, resolve failed, etc.
-                    pass
-
-            await f(self, *args, **kwargs)
+                except aiohttp.ClientPayloadError as e:  # encoding or not enough bytes
+                    last_exception = e
+                except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc.
+                    last_exception = e
+                log.explain_topic(f"Retrying operation {escape(name)}. Retries left: {attempts - 1 - round}")
+
+            if last_exception:
+                message = f"Error in I/O Operation: {escape(str(last_exception))}"
+                raise CrawlWarning(message) from last_exception
+            raise CrawlError("Impossible return in ilias _iorepeat")
 
         return wrapper  # type: ignore
 
     return decorator
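
Editor's note: the reworked _iorepeat boils down to running the wrapped coroutine up to attempts times, remembering the last transient aiohttp error, logging an explain message before each retry, and surfacing the final failure as a CrawlWarning (with a CrawlError guarding the theoretically unreachable fall-through). Below is a minimal, self-contained sketch of that retry-and-escalate pattern, not PFERD code: it substitutes ConnectionError/TimeoutError, RuntimeError, and print for aiohttp's exceptions, CrawlWarning, and log.explain_topic, and the names retry_io and flaky_fetch are invented for illustration.

# Sketch only: retry a coroutine on transient I/O errors, then escalate.
import asyncio
from typing import Any, Awaitable, Callable, Optional, TypeVar

AnyAsync = TypeVar("AnyAsync", bound=Callable[..., Awaitable[Any]])


def retry_io(attempts: int, name: str) -> Callable[[AnyAsync], AnyAsync]:
    def decorator(f: AnyAsync) -> AnyAsync:
        async def wrapper(*args: Any, **kwargs: Any) -> Any:
            last_exception: Optional[BaseException] = None
            for round_ in range(attempts):
                try:
                    return await f(*args, **kwargs)
                except (ConnectionError, TimeoutError) as e:  # transient, worth retrying
                    last_exception = e
                    print(f"Retrying operation {name}. Retries left: {attempts - 1 - round_}")
            # Every attempt failed: wrap the last transient error in a domain-level error
            raise RuntimeError(f"Error in I/O operation: {last_exception}") from last_exception

        return wrapper  # type: ignore

    return decorator


@retry_io(3, "flaky fetch")
async def flaky_fetch() -> str:
    raise ConnectionError("simulated network hiccup")


if __name__ == "__main__":
    try:
        asyncio.run(flaky_fetch())
    except RuntimeError as e:
        print(f"Gave up: {e}")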
@@ -109,14 +117,19 @@ class KitIliasWebCrawler(HttpCrawler):
 
     async def crawl(self) -> None:
         if isinstance(self._target, int):
+            log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
             await self._crawl_course(self._target)
         elif self._target == "desktop":
+            log.explain_topic("Inferred crawl target: Personal desktop")
             await self._crawl_desktop()
         else:
+            log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}")
             await self._crawl_url(self._target)
 
         if self.error_free:
             await self.cleanup()
+        else:
+            log.explain_topic("Skipping file cleanup as errors occurred earlier")
 
     async def _crawl_course(self, course_id: int) -> None:
         # Start crawling at the given course
@@ -132,15 +145,16 @@ class KitIliasWebCrawler(HttpCrawler):
     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
         tasks = []
 
+        # TODO: Retry this when the crawl and download bar are reworked
         async with self.crawl_bar(PurePath("Root element")):
             soup = await self._get_page(url)
 
             if expected_id is not None:
                 perma_link_element: Tag = soup.find(id="current_perma_link")
                 if not perma_link_element or "crs_" not in perma_link_element.get("value"):
-                    # TODO: Properly handle error
-                    raise RuntimeError(
-                        "Invalid course id? I didn't find anything looking like a course!")
+                    raise CrawlError(
+                        "Invalid course id? I didn't find anything looking like a course"
+                    )
 
             # Duplicated code, but the root page is special - we want to void fetching it twice!
             page = IliasPage(soup, url, None)
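
Editor's note: the check above is the only validation _crawl_url performs when an expected course id is given: the fetched page must contain a current_perma_link element whose value looks like a course permalink (it contains "crs_"), and failing that the crawler now raises CrawlError instead of a bare RuntimeError. A standalone sketch of that check follows; the sample HTML and URL are invented, only the element id and the "crs_" marker come from the diff.

# Sketch only: validate that a fetched page looks like an ILIAS course page.
from bs4 import BeautifulSoup

# Invented example markup; a real page embeds this input with its permalink.
html = '<input id="current_perma_link" value="https://ilias.example/goto.php?target=crs_12345">'
soup = BeautifulSoup(html, "html.parser")

perma_link_element = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value", ""):
    raise RuntimeError("Invalid course id? I didn't find anything looking like a course")
print("Permalink contains 'crs_', treating the page as a course")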
@@ -167,15 +181,14 @@ class KitIliasWebCrawler(HttpCrawler):
         await asyncio.gather(*tasks)
 
     @anoncritical
-    @_iorepeat(3)
+    @_iorepeat(3, "ILIAS element crawling")
     async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
         element_path = PurePath(parent_path, element.name)
 
         if element.type == IliasElementType.FILE:
             await self._download_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
-            # TODO: Delete
-            print(f"Skipping forum [green]{element_path}[/]")
+            log.explain_topic(f"Skipping forum at {escape(str(element_path))}")
         elif element.type == IliasElementType.LINK:
             await self._download_link(element, element_path)
         elif element.type == IliasElementType.VIDEO:
@@ -185,8 +198,9 @@ class KitIliasWebCrawler(HttpCrawler):
         elif element.type in _DIRECTORY_PAGES:
             await self._handle_ilias_page(element.url, element, element_path)
         else:
-            # TODO: Proper exception
-            raise RuntimeError(f"Unknown type: {element.type!r}")
+            # This will retry it a few times, failing everytime. It doesn't make any network
+            # requests, so that's fine.
+            raise CrawlWarning(f"Unknown element type: {element.type!r}")
 
     async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
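
Editor's note: the last hunk replaces the hard RuntimeError for unknown element types with a CrawlWarning, so the problem is reported through the crawler's own exception types rather than crashing the run with a generic error. A rough standalone illustration of that dispatch-with-recoverable-fallback shape follows; it uses plain Python stand-ins with invented names, not PFERD's actual classes.

# Sketch only: dispatch on an element type and treat unknown types as a
# recoverable, per-element failure rather than a fatal error.
from enum import Enum, auto


class ElementType(Enum):
    FILE = auto()
    FORUM = auto()
    SURPRISE_WIDGET = auto()  # something the handler does not know about


class CrawlWarningStub(Exception):
    """Stand-in for a crawler-level, non-fatal error type."""


def handle(element_type: ElementType) -> str:
    if element_type == ElementType.FILE:
        return "download file"
    elif element_type == ElementType.FORUM:
        return "skip forum"
    else:
        raise CrawlWarningStub(f"Unknown element type: {element_type!r}")


if __name__ == "__main__":
    for element in ElementType:
        try:
            print(element.name, "->", handle(element))
        except CrawlWarningStub as warning:
            print(element.name, "-> reported, crawl continues:", warning)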