From 3ab3581f849ae5ee223c434752dfeffdf30884a9 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Sun, 23 May 2021 23:40:28 +0200
Subject: [PATCH] Add timeout for HTTP connection

---
 CONFIG.md                                  |  4 +++-
 PFERD/cli/command_kit_ilias_web.py         |  8 ++++++++
 PFERD/crawl/http_crawler.py                | 10 +++++++++-
 PFERD/crawl/ilias/kit_ilias_web_crawler.py |  9 ++++++---
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/CONFIG.md b/CONFIG.md
index b976b7d..dcc7421 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -140,7 +140,9 @@ This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor S
 
 - `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional HTML page instead.
-- `no-videos`: If this is set to true, PFERD will not crawl or download any videos.
+- `videos`: If this is set to false, PFERD will not crawl or download any videos.
+- `http_timeout`: The timeout for all HTTP requests, in seconds. Defaults to 20 if not specified.
+
 
 ## Authenticator types
 
 ### The `simple` authenticator
diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py
index e47bc77..89da390 100644
--- a/PFERD/cli/command_kit_ilias_web.py
+++ b/PFERD/cli/command_kit_ilias_web.py
@@ -52,6 +52,12 @@ GROUP.add_argument(
     action=BooleanOptionalAction,
     help="use plain text files for external links"
 )
+GROUP.add_argument(
+    "--http-timeout",
+    type=float,
+    metavar="SECONDS",
+    help="the timeout to use for HTTP requests"
+)
 
 
 def load(
@@ -72,6 +78,8 @@ def load(
         section["link_file_plaintext"] = str(args.link_file_plaintext)
     if args.videos is not None:
         section["videos"] = str(False)
+    if args.http_timeout is not None:
+        section["http_timeout"] = str(args.http_timeout)
 
     parser["auth:kit-ilias-web"] = {}
     auth_section = parser["auth:kit-ilias-web"]
diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py
index e82dfed..8cd6afe 100644
--- a/PFERD/crawl/http_crawler.py
+++ b/PFERD/crawl/http_crawler.py
@@ -3,6 +3,7 @@ from pathlib import PurePath
 from typing import Optional
 
 import aiohttp
+from aiohttp.client import ClientTimeout
 
 from ..config import Config
 from ..logging import log
@@ -11,13 +12,18 @@ from ..version import NAME, VERSION
 from .crawler import Crawler, CrawlerSection
 
 
+class HttpCrawlerSection(CrawlerSection):
+    def http_timeout(self) -> float:
+        return self.s.getfloat("http_timeout", fallback=20)
+
+
 class HttpCrawler(Crawler):
     COOKIE_FILE = PurePath(".cookies")
 
     def __init__(
             self,
             name: str,
-            section: CrawlerSection,
+            section: HttpCrawlerSection,
             config: Config,
     ) -> None:
         super().__init__(name, section, config)
@@ -28,6 +34,7 @@ class HttpCrawler(Crawler):
         self._authentication_lock = asyncio.Lock()
         self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
         self._request_count = 0
+        self._http_timeout = section.http_timeout()
 
     async def _current_auth_id(self) -> int:
         """
@@ -97,6 +104,7 @@ class HttpCrawler(Crawler):
         async with aiohttp.ClientSession(
                 headers={"User-Agent": f"{NAME}/{VERSION}"},
                 cookie_jar=self._current_cookie_jar,
+                timeout=ClientTimeout(total=self._http_timeout)
         ) as session:
             self.session = session
             try:
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index 33356ed..445997f 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -1,3 +1,4 @@
+import asyncio
 import re
 from pathlib import PurePath
 from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union
@@ -11,15 +12,15 @@ from ...config import Config
 from ...logging import ProgressBar, log
 from ...output_dir import FileSink, Redownload
 from ...utils import fmt_path, soupify, url_set_query_param
-from ..crawler import CrawlError, CrawlerSection, CrawlWarning, anoncritical
-from ..http_crawler import HttpCrawler
+from ..crawler import CrawlError, CrawlWarning, anoncritical
+from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import link_template_plain, link_template_rich
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
 
 TargetType = Union[str, int]
 
 
-class KitIliasWebCrawlerSection(CrawlerSection):
+class KitIliasWebCrawlerSection(HttpCrawlerSection):
     def target(self) -> TargetType:
         target = self.s.get("target")
 
@@ -92,6 +93,8 @@ def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
                     last_exception = e
                 except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc.
                     last_exception = e
+                except asyncio.exceptions.TimeoutError as e:  # explicit http timeouts in HttpCrawler
+                    last_exception = e
                 log.explain_topic(f"Retrying operation {name}. Retries left: {attempts - 1 - round}")
 
             if last_exception:
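
Usage note (not part of the patch): with this change applied, the timeout can be set
per crawler in the config file, or passed on the command line as `--http-timeout 60`.
A minimal sketch of a config section, assuming the usual `[crawl:...]` section layout
documented in CONFIG.md; the section name "ilias" and the target value are placeholders:

    [crawl:ilias]
    type = kit-ilias-web
    target = <COURSE_ID>
    http_timeout = 60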
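Why `_iorepeat` gains the extra except clause: aiohttp signals an expired total
`ClientTimeout` by raising `asyncio.TimeoutError`, which is not a subclass of
`aiohttp.ClientConnectionError`, so without the new clause a timed-out request would
abort the crawl instead of being retried. A self-contained sketch of that behaviour
(the URL and the deliberately tiny timeout are made up for illustration):

    import asyncio

    import aiohttp
    from aiohttp.client import ClientTimeout


    async def main() -> None:
        # A 1 ms total timeout, so virtually any real request will exceed it.
        async with aiohttp.ClientSession(timeout=ClientTimeout(total=0.001)) as session:
            try:
                async with session.get("https://example.com") as response:
                    await response.read()
            except asyncio.TimeoutError:
                # The same exception type the patched _iorepeat catches and retries.
                print("request timed out")


    asyncio.run(main())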