Add timeout for HTTP connection

I-Al-Istannen 2021-05-23 23:40:28 +02:00
parent 8dd0689420
commit 3ab3581f84
4 changed files with 26 additions and 5 deletions

View File

@@ -140,7 +140,9 @@ This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor S
 - `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link
   target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional
   HTML page instead.
-- `no-videos`: If this is set to true, PFERD will not crawl or download any videos.
+- `videos`: If this is set to false, PFERD will not crawl or download any videos.
+- `http_timeout`: The timeout for http requests

 ## Authenticator types

 ### The `simple` authenticator
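For reference, a minimal sketch (not part of this commit) of how the documented option behaves: the value lands in the crawler's config section as text and is parsed back as a float, falling back to 20 seconds when it is absent. The section name `crawl:ilias` and the values are made up for the example.

# Illustrative only; section name and values are assumptions.
from configparser import ConfigParser

parser = ConfigParser()
parser.read_string("""
[crawl:ilias]
type = kit-ilias-web
http_timeout = 60
""")

section = parser["crawl:ilias"]
# Mirrors the new http_timeout() accessor: parse as float, default to 20 seconds
print(section.getfloat("http_timeout", fallback=20))  # 60.0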

View File

@@ -52,6 +52,12 @@ GROUP.add_argument(
     action=BooleanOptionalAction,
     help="use plain text files for external links"
 )
+GROUP.add_argument(
+    "--http-timeout",
+    type=float,
+    metavar="SECONDS",
+    help="the timeout to use for HTTP requests"
+)


 def load(
@@ -72,6 +78,8 @@ def load(
         section["link_file_plaintext"] = str(args.link_file_plaintext)
     if args.videos is not None:
         section["videos"] = str(False)
+    if args.http_timeout is not None:
+        section["http_timeout"] = str(args.http_timeout)

     parser["auth:kit-ilias-web"] = {}
     auth_section = parser["auth:kit-ilias-web"]
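A rough sketch of the round trip the CLI value takes (simplified, with an assumed section name): argparse parses `--http-timeout` as a float, `load()` stores it as a string in the generated config section, and the crawler later reads it back with `getfloat`.

# Illustrative sketch, not the actual PFERD code paths.
import argparse
from configparser import ConfigParser

cli = argparse.ArgumentParser()
cli.add_argument("--http-timeout", type=float, metavar="SECONDS")
args = cli.parse_args(["--http-timeout", "60"])

parser = ConfigParser()
parser["crawl:kit-ilias-web"] = {}  # section name assumed for the example
section = parser["crawl:kit-ilias-web"]
if args.http_timeout is not None:
    section["http_timeout"] = str(args.http_timeout)  # stored as "60.0"

print(section.getfloat("http_timeout", fallback=20))  # parsed back as 60.0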

View File

@@ -3,6 +3,7 @@ from pathlib import PurePath
 from typing import Optional

 import aiohttp
+from aiohttp.client import ClientTimeout

 from ..config import Config
 from ..logging import log
@@ -11,13 +12,18 @@ from ..version import NAME, VERSION
 from .crawler import Crawler, CrawlerSection


+class HttpCrawlerSection(CrawlerSection):
+    def http_timeout(self) -> float:
+        return self.s.getfloat("http_timeout", fallback=20)
+
+
 class HttpCrawler(Crawler):
     COOKIE_FILE = PurePath(".cookies")

     def __init__(
         self,
         name: str,
-        section: CrawlerSection,
+        section: HttpCrawlerSection,
         config: Config,
     ) -> None:
         super().__init__(name, section, config)
@@ -28,6 +34,7 @@ class HttpCrawler(Crawler):
         self._authentication_lock = asyncio.Lock()
         self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
         self._request_count = 0
+        self._http_timeout = section.http_timeout()

     async def _current_auth_id(self) -> int:
         """
@@ -97,6 +104,7 @@ class HttpCrawler(Crawler):
         async with aiohttp.ClientSession(
                 headers={"User-Agent": f"{NAME}/{VERSION}"},
                 cookie_jar=self._current_cookie_jar,
+                timeout=ClientTimeout(total=self._http_timeout)
         ) as session:
             self.session = session
             try:
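A minimal, self-contained sketch of the behaviour this relies on (URL and timeout value are placeholders): once `ClientTimeout.total` is exceeded, aiohttp raises `asyncio.TimeoutError`, which is what the retry logic in the next file starts to catch.

# Standalone illustration, not PFERD code. URL and timeout are placeholders.
import asyncio
import aiohttp
from aiohttp.client import ClientTimeout

async def fetch_with_timeout() -> None:
    async with aiohttp.ClientSession(timeout=ClientTimeout(total=20)) as session:
        try:
            async with session.get("https://example.com/slow-endpoint") as resp:
                await resp.read()
        except asyncio.TimeoutError:
            print("request exceeded the configured total timeout")

asyncio.run(fetch_with_timeout())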

View File

@@ -1,3 +1,4 @@
+import asyncio
 import re
 from pathlib import PurePath
 from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union
@@ -11,15 +12,15 @@ from ...config import Config
 from ...logging import ProgressBar, log
 from ...output_dir import FileSink, Redownload
 from ...utils import fmt_path, soupify, url_set_query_param
-from ..crawler import CrawlError, CrawlerSection, CrawlWarning, anoncritical
-from ..http_crawler import HttpCrawler
+from ..crawler import CrawlError, CrawlWarning, anoncritical
+from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import link_template_plain, link_template_rich
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement

 TargetType = Union[str, int]


-class KitIliasWebCrawlerSection(CrawlerSection):
+class KitIliasWebCrawlerSection(HttpCrawlerSection):
     def target(self) -> TargetType:
         target = self.s.get("target")
@@ -92,6 +93,8 @@ def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
                 last_exception = e
             except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc.
                 last_exception = e
+            except asyncio.exceptions.TimeoutError as e:  # explicit http timeouts in HttpCrawler
+                last_exception = e
             log.explain_topic(f"Retrying operation {name}. Retries left: {attempts - 1 - round}")

     if last_exception:
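To round this off, a rough sketch of the retry pattern `_iorepeat` implements (heavily simplified, not the real decorator): connection errors and the newly caught timeout are remembered, the operation is retried a fixed number of times, and the last error is re-raised if every attempt fails.

# Simplified stand-in for _iorepeat; names and error handling are illustrative.
import asyncio
from typing import Optional

import aiohttp


async def fetch_with_retries(session: aiohttp.ClientSession, url: str, attempts: int = 3) -> bytes:
    last_exception: Optional[BaseException] = None
    for _ in range(attempts):
        try:
            async with session.get(url) as resp:
                return await resp.read()
        except aiohttp.ClientConnectionError as e:  # disconnects, DNS failures, ...
            last_exception = e
        except asyncio.TimeoutError as e:  # raised when the session's ClientTimeout is hit
            last_exception = e
    raise RuntimeError(f"All {attempts} attempts for {url} failed") from last_exception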