Only enclose get_page in iorepeat in ILIAS crawler

Previously the `_iorepeat`-wrapped function also awaited the gathered child
tasks, which could lead to surprising behavior when the method was retried.
This commit is contained in:
I-Al-Istannen 2021-05-23 17:24:05 +02:00
parent 29d5a40c57
commit 05ad06fbc1

View File

@ -1,6 +1,6 @@
import re import re
from pathlib import PurePath from pathlib import PurePath
from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union
import aiohttp import aiohttp
from aiohttp import hdrs from aiohttp import hdrs
@ -192,10 +192,11 @@ class KitIliasWebCrawler(HttpCrawler):
return return
cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
@_iorepeat(3, "crawling url") elements: List[IliasPageElement] = []
async def impl() -> None:
tasks = []
@_iorepeat(3, "crawling url")
async def gather_elements() -> None:
elements.clear()
async with cl: async with cl:
soup = await self._get_page(url) soup = await self._get_page(url)
@ -204,48 +205,44 @@ class KitIliasWebCrawler(HttpCrawler):
if not perma_link_element or "crs_" not in perma_link_element.get("value"): if not perma_link_element or "crs_" not in perma_link_element.get("value"):
raise CrawlError("Invalid course id? Didn't find anything looking like a course") raise CrawlError("Invalid course id? Didn't find anything looking like a course")
# Duplicated code, but the root page is special - we want to void fetching it twice! # Duplicated code, but the root page is special - we want to avoid fetching it twice!
page = IliasPage(soup, url, None) page = IliasPage(soup, url, None)
for child in page.get_child_elements(): elements.extend(page.get_child_elements())
tasks.append(self._handle_ilias_element(PurePath("."), child))
# The only point an I/O exception can be thrown is in `get_page`. # Fill up our task list with the found elements
# If that happens, no task was spawned yet. Therefore, we only retry await gather_elements()
# this method without having spawned a single task. Due to this we do tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
# not need to cancel anything or worry about this gather call or the forks
# further up. # And execute them
await self.gather(tasks) await self.gather(tasks)
await impl()
async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None: async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
maybe_cl = await self.crawl(path) maybe_cl = await self.crawl(path)
if not maybe_cl: if not maybe_cl:
return return
cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
elements: List[IliasPageElement] = []
@_iorepeat(3, "crawling folder") @_iorepeat(3, "crawling folder")
async def impl() -> None: async def gather_elements() -> None:
tasks = [] elements.clear()
async with cl: async with cl:
soup = await self._get_page(url) soup = await self._get_page(url)
page = IliasPage(soup, url, parent) page = IliasPage(soup, url, parent)
for child in page.get_child_elements(): elements.extend(page.get_child_elements())
tasks.append(self._handle_ilias_element(path, child))
# The only point an I/O exception can be thrown is in `get_page`. # Fill up our task list with the found elements
# If that happens, no task was spawned yet. Therefore, we only retry await gather_elements()
# this method without having spawned a single task. Due to this we do tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
# not need to cancel anything or worry about this gather call or the forks
# further up. # And execute them
await self.gather(tasks) await self.gather(tasks)
await impl()
@anoncritical @anoncritical
# Shouldn't happen but this method must never raise an I/O error as that might interfere with # Shouldn't happen but we also really don't want to let I/O errors bubble up to anoncritical.
# handle_ilias_page or crawl_url # If that happens we will be terminated as anoncritical doesn't treat them as non-critical.
@_wrap_io_in_warning("handling ilias element") @_wrap_io_in_warning("handling ilias element")
async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
element_path = PurePath(parent_path, element.name) element_path = PurePath(parent_path, element.name)