Mirror of https://github.com/Garmelon/PFERD.git
Add some explanations to the ilias crawler and use crawler exceptions
This commit is contained in:
parent: e4f9560655
commit: 83d12fcf2d
@@ -6,10 +6,12 @@ from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
 
 import aiohttp
 from bs4 import BeautifulSoup, Tag
+from rich.markup import escape
 
 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
-from PFERD.crawler import CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
+from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical
 from PFERD.logging import log
 from PFERD.output_dir import Redownload
 from PFERD.utils import soupify, url_set_query_param
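
Note: the new import swaps the bare RuntimeErrors used further down for PFERD's crawler exceptions. A minimal, self-contained sketch of the split as the hunks below use it; these stand-in classes are illustrative, not PFERD's actual definitions:

# Stand-in declarations, for illustration only (not PFERD's real classes).
class CrawlWarning(Exception):
    """A single element failed; the crawl as a whole may continue."""

class CrawlError(Exception):
    """The crawl cannot continue and should abort."""

def handle(kind: str) -> None:
    if kind == "broken_element":
        # Recoverable: logged, crawl marked not error-free, then continue.
        raise CrawlWarning(f"Could not process element of kind {kind!r}")
    if kind == "invalid_target":
        # Fatal: the configured crawl target itself is wrong.
        raise CrawlError("Invalid course id?")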
@@ -66,10 +68,11 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([
 AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])
 
 
-def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
+def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
     def decorator(f: AWrapped) -> AWrapped:
         async def wrapper(self: "HttpCrawler", *args: Any, **kwargs: Any) -> None:
-            for _ in range(attempts - 1):
+            last_exception: Optional[BaseException] = None
+            for round in range(attempts):
                 try:
                     await f(self, *args, **kwargs)
                     return
@@ -77,12 +80,17 @@ def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]:
                     raise CrawlWarning("ILIAS returned an invalid content type")
                 except aiohttp.TooManyRedirects:
                     raise CrawlWarning("Got stuck in a redirect loop")
-                except aiohttp.ClientPayloadError:  # encoding or not enough bytes
-                    pass
-                except aiohttp.ClientConnectionError:  # e.g. timeout, disconnect, resolve failed, etc.
-                    pass
+                except aiohttp.ClientPayloadError as e:  # encoding or not enough bytes
+                    last_exception = e
+                except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc.
+                    last_exception = e
+                log.explain_topic(f"Retrying operation {escape(name)}. Retries left: {attempts - 1 - round}")
 
+            if last_exception:
+                message = f"Error in I/O Operation: {escape(str(last_exception))}"
+                raise CrawlWarning(message) from last_exception
+            raise CrawlError("Impossible return in ilias _iorepeat")
 
-            await f(self, *args, **kwargs)
         return wrapper  # type: ignore
     return decorator
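
The reworked _iorepeat now makes all `attempts` tries uniformly (the old version looped `attempts - 1` times and then ran one final, uncaught call), remembers the last transient aiohttp error, and surfaces it once as a CrawlWarning. A stripped-down, runnable variant of the same pattern, with OSError standing in for aiohttp's transient errors:

import asyncio
from typing import Any, Awaitable, Callable, Optional, TypeVar

AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])

class CrawlWarning(Exception):  # stand-in for PFERD's CrawlWarning
    pass

def iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
    def decorator(f: AWrapped) -> AWrapped:
        async def wrapper(*args: Any, **kwargs: Any) -> None:
            last_exception: Optional[BaseException] = None
            for attempt in range(attempts):
                try:
                    await f(*args, **kwargs)
                    return  # success: stop retrying
                except OSError as e:  # stand-in for the transient aiohttp errors
                    last_exception = e
                    print(f"Retrying operation {name}. Retries left: {attempts - 1 - attempt}")
            # Every attempt failed: report the last underlying error exactly once.
            raise CrawlWarning(f"Error in I/O operation: {last_exception}") from last_exception
        return wrapper  # type: ignore
    return decorator

@iorepeat(3, "demo download")
async def flaky_download() -> None:
    raise OSError("connection reset")

# asyncio.run(flaky_download())  # prints three retry lines, then raises CrawlWarning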
@@ -109,14 +117,19 @@ class KitIliasWebCrawler(HttpCrawler):
 
     async def crawl(self) -> None:
         if isinstance(self._target, int):
+            log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
             await self._crawl_course(self._target)
         elif self._target == "desktop":
+            log.explain_topic("Inferred crawl target: Personal desktop")
             await self._crawl_desktop()
         else:
+            log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}")
             await self._crawl_url(self._target)
 
         if self.error_free:
             await self.cleanup()
+        else:
+            log.explain_topic("Skipping file cleanup as errors occurred earlier")
 
     async def _crawl_course(self, course_id: int) -> None:
         # Start crawling at the given course
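
The new log.explain_topic calls make the target inference visible to the user. For context, a hypothetical sketch of how a raw target string could end up as the int / "desktop" / URL shapes that crawl() dispatches on; PFERD's actual config parsing may well differ:

from typing import Union

def parse_target(raw: str) -> Union[int, str]:
    # Hypothetical normalization, for illustration only.
    if raw == "desktop":
        return "desktop"   # crawl the user's personal desktop
    if raw.isdigit():
        return int(raw)    # a bare number is treated as a course id
    return raw             # anything else is assumed to be a full ILIAS URL

assert parse_target("desktop") == "desktop"
assert parse_target("1234567") == 1234567
assert isinstance(parse_target("https://ilias.example.edu/goto.php?target=crs_1234567"), str)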
@@ -132,15 +145,16 @@ class KitIliasWebCrawler(HttpCrawler):
     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
         tasks = []
 
         # TODO: Retry this when the crawl and download bar are reworked
         async with self.crawl_bar(PurePath("Root element")):
             soup = await self._get_page(url)
 
             if expected_id is not None:
                 perma_link_element: Tag = soup.find(id="current_perma_link")
                 if not perma_link_element or "crs_" not in perma_link_element.get("value"):
-                    # TODO: Properly handle error
-                    raise RuntimeError(
-                        "Invalid course id? I didn't find anything looking like a course!")
+                    raise CrawlError(
+                        "Invalid course id? I didn't find anything looking like a course"
+                    )
 
             # Duplicated code, but the root page is special - we want to avoid fetching it twice!
             page = IliasPage(soup, url, None)
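
The permalink check above is easy to exercise in isolation. A minimal reproduction with hand-written example markup (the HTML snippets and URLs are illustrative, not captured from a real ILIAS page):

from bs4 import BeautifulSoup

def looks_like_course(html: str) -> bool:
    # Same test as the diff: the permalink element must exist and point
    # at a "crs_" (course) target.
    soup = BeautifulSoup(html, "html.parser")
    element = soup.find(id="current_perma_link")
    return bool(element) and "crs_" in (element.get("value") or "")

course = '<input id="current_perma_link" value="https://ilias.example.edu/goto.php?target=crs_1234567">'
folder = '<input id="current_perma_link" value="https://ilias.example.edu/goto.php?target=fold_89">'

assert looks_like_course(course)
assert not looks_like_course(folder)  # this case now raises CrawlError in the crawler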
@@ -167,15 +181,14 @@ class KitIliasWebCrawler(HttpCrawler):
         await asyncio.gather(*tasks)
 
     @anoncritical
-    @_iorepeat(3)
+    @_iorepeat(3, "ILIAS element crawling")
     async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
         element_path = PurePath(parent_path, element.name)
 
         if element.type == IliasElementType.FILE:
             await self._download_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
-            # TODO: Delete
-            print(f"Skipping forum [green]{element_path}[/]")
+            log.explain_topic(f"Skipping forum at {escape(str(element_path))}")
         elif element.type == IliasElementType.LINK:
             await self._download_link(element, element_path)
         elif element.type == IliasElementType.VIDEO:
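
Decorator order matters here: Python applies the decorator closest to the function first, so _iorepeat(3, ...) wraps _handle_ilias_element directly and anoncritical wraps the retrying wrapper. Retries happen on the inside; whatever still escapes is contained per element on the outside. A loosely modeled sketch of that outer layer, not PFERD's actual anoncritical implementation:

import functools
from typing import Any, Awaitable, Callable

def anoncritical(f: Callable[..., Awaitable[None]]) -> Callable[..., Awaitable[None]]:
    # Assumption: catch whatever the retrying inner wrapper still raises, log
    # it, and remember that the crawl was not error-free so crawl() can skip
    # cleanup later (see the error_free check in the crawl() hunk above).
    @functools.wraps(f)
    async def wrapper(self: Any, *args: Any, **kwargs: Any) -> None:
        try:
            await f(self, *args, **kwargs)
        except Exception as e:
            self.error_free = False
            print(f"[error] {e}")
    return wrapper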
@@ -185,8 +198,9 @@ class KitIliasWebCrawler(HttpCrawler):
         elif element.type in _DIRECTORY_PAGES:
             await self._handle_ilias_page(element.url, element, element_path)
         else:
-            # TODO: Proper exception
-            raise RuntimeError(f"Unknown type: {element.type!r}")
+            # This will retry it a few times, failing every time. It doesn't make any network
+            # requests, so that's fine.
+            raise CrawlWarning(f"Unknown element type: {element.type!r}")
 
     async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
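
Finally, the unknown-type fallback: a hypothetical miniature of the dispatch with an explicit else branch. IliasElementType and CrawlWarning here are stand-ins, not PFERD's real definitions:

from enum import Enum, auto

class CrawlWarning(Exception):  # stand-in, as above
    pass

class IliasElementType(Enum):  # stand-in for PFERD's element type enum
    FILE = auto()
    FORUM = auto()
    LINK = auto()
    VIDEO = auto()
    SURVEY = auto()  # imagine a type without a handler yet

def plan_action(element_type: IliasElementType) -> str:
    if element_type is IliasElementType.FILE:
        return "download file"
    if element_type is IliasElementType.FORUM:
        return "skip forum"
    if element_type is IliasElementType.LINK:
        return "download link"
    # Mirrors the diff: an unhandled type becomes a CrawlWarning (one failed
    # element) instead of an unhandled RuntimeError (a crashed crawl).
    raise CrawlWarning(f"Unknown element type: {element_type!r}")

assert plan_action(IliasElementType.FILE) == "download file"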