Implement cookie sharing

Joscha 2021-05-24 13:10:19 +02:00
parent fca62541ca
commit c687d4a51a
6 changed files with 95 additions and 21 deletions


@@ -25,6 +25,11 @@ default values for the other sections.
   `yes`)
 - `report`: Whether PFERD should print a report of added, changed and deleted
   local files for all crawlers before exiting. (Default: `yes`)
+- `share_cookies`: Whether crawlers should share cookies where applicable. By
+  default, crawlers are isolated and don't interact with each other. This
+  includes their cookies. However, in situations where multiple crawlers crawl
+  the same website using the same account, sharing cookies between crawlers can
+  make sense. (Default: `yes`)
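As an illustration (not part of the diff), a config that benefits from this option might look as follows. The `crawl:*` section names are made up; `share_cookies` itself goes in the defaults section as described above:

```ini
[DEFAULT]
; Both crawlers below log into the same site with the same account,
; so letting them share cookies avoids a second authentication.
share_cookies = yes

[crawl:lectures]
; ... crawler-specific options ...

[crawl:exercises]
; ... crawler-specific options ...
```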
## The `crawl:*` sections


@@ -169,6 +169,11 @@ PARSER.add_argument(
     action=BooleanOptionalAction,
     help="print a report of all local changes before exiting"
 )
+PARSER.add_argument(
+    "--share-cookies",
+    action=BooleanOptionalAction,
+    help="whether crawlers should share cookies where applicable"
+)
 
 
 def load_default_section(
@@ -180,7 +185,9 @@ def load_default_section(
     if args.working_dir is not None:
         section["working_dir"] = str(args.working_dir)
     if args.explain is not None:
-        section["explain"] = "true" if args.explain else "false"
+        section["explain"] = "yes" if args.explain else "no"
+    if args.share_cookies is not None:
+        section["share_cookies"] = "yes" if args.share_cookies else "no"
 
 
 SUBPARSERS = PARSER.add_subparsers(title="crawlers")
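A side note on the flag's behavior: `BooleanOptionalAction` is in the standard library since Python 3.9 (the diff doesn't show where PFERD imports it from, so the stdlib one is used in this standalone sketch). It yields a tri-state value, which is why `load_default_section` only writes the option when it is not `None`:

```python
import argparse

parser = argparse.ArgumentParser()
# Registers both --share-cookies and --no-share-cookies. When neither
# flag is given, the value stays None and the config file default wins.
parser.add_argument("--share-cookies", action=argparse.BooleanOptionalAction)

print(parser.parse_args([]).share_cookies)                      # None
print(parser.parse_args(["--share-cookies"]).share_cookies)     # True
print(parser.parse_args(["--no-share-cookies"]).share_cookies)  # False
```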


@@ -81,6 +81,9 @@ class DefaultSection(Section):
     def report(self) -> bool:
         return self.s.getboolean("report", fallback=True)
 
+    def share_cookies(self) -> bool:
+        return self.s.getboolean("share_cookies", fallback=True)
+
 
 class Config:
     @staticmethod
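The `"yes"`/`"no"` strings written by `load_default_section` line up with `configparser`'s boolean parsing, which `getboolean` relies on here, and `fallback=True` makes cookie sharing opt-out. A standalone sketch of that stdlib behavior (the real `DefaultSection` wraps the section object, but the semantics are the same):

```python
import configparser

cp = configparser.ConfigParser()
cp.read_string("[DEFAULT]\nshare_cookies = no\n")
# configparser accepts yes/no, true/false, on/off and 1/0 as booleans.
print(cp["DEFAULT"].getboolean("share_cookies", fallback=True))  # False

cp = configparser.ConfigParser()
cp.read_string("[DEFAULT]\n")
# With the option absent, the fallback applies: sharing defaults to on.
print(cp["DEFAULT"].getboolean("share_cookies", fallback=True))  # True
```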


@@ -1,10 +1,11 @@
 import asyncio
-from pathlib import PurePath
-from typing import Optional
+from pathlib import Path, PurePath
+from typing import Dict, List, Optional
 
 import aiohttp
 from aiohttp.client import ClientTimeout
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import log
 from ..utils import fmt_real_path
@@ -25,17 +26,22 @@ class HttpCrawler(Crawler):
         name: str,
         section: HttpCrawlerSection,
         config: Config,
+        shared_auth: Optional[Authenticator] = None,
     ) -> None:
         super().__init__(name, section, config)
 
-        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
-        self._output_dir.register_reserved(self.COOKIE_FILE)
         self._authentication_id = 0
         self._authentication_lock = asyncio.Lock()
-        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
         self._request_count = 0
         self._http_timeout = section.http_timeout()
 
+        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
+        self._shared_cookie_jar_paths: Optional[List[Path]] = None
+        self._shared_auth = shared_auth
+
+        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
+
+        self._output_dir.register_reserved(self.COOKIE_FILE)
 
     async def _current_auth_id(self) -> int:
         """
         Returns the id for the current authentication, i.e. an identifier for the last
@@ -71,7 +77,7 @@ class HttpCrawler(Crawler):
             self._authentication_id += 1
             # Saving the cookies after the first auth ensures we won't need to re-authenticate
             # on the next run, should this one be aborted or crash
-            await self._save_cookies()
+            self._save_cookies()
 
     async def _authenticate(self) -> None:
         """
@@ -80,26 +86,68 @@ class HttpCrawler(Crawler):
         """
         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
 
-    async def _save_cookies(self) -> None:
+    def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None:
+        if not self._shared_auth:
+            return
+
+        if self._shared_auth in shared:
+            self._shared_cookie_jar_paths = shared[self._shared_auth]
+        else:
+            self._shared_cookie_jar_paths = []
+            shared[self._shared_auth] = self._shared_cookie_jar_paths
+
+        self._shared_cookie_jar_paths.append(self._cookie_jar_path)
+
+    def _load_cookies(self) -> None:
+        log.explain_topic("Loading cookies")
+
+        cookie_jar_path: Optional[Path] = None
+
+        if self._shared_cookie_jar_paths is None:
+            log.explain("Not sharing any cookies")
+            cookie_jar_path = self._cookie_jar_path
+        else:
+            log.explain("Sharing cookies")
+            max_mtime: Optional[float] = None
+            for path in self._shared_cookie_jar_paths:
+                if not path.is_file():
+                    log.explain(f"{fmt_real_path(path)} is not a file")
+                    continue
+                mtime = path.stat().st_mtime
+                if max_mtime is None or mtime > max_mtime:
+                    log.explain(f"{fmt_real_path(path)} has newest mtime so far")
+                    max_mtime = mtime
+                    cookie_jar_path = path
+                else:
+                    log.explain(f"{fmt_real_path(path)} has older mtime")
+
+        if cookie_jar_path is None:
+            log.explain("Couldn't find a suitable cookie file")
+            return
+
+        log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
+        try:
+            self._current_cookie_jar = aiohttp.CookieJar()
+            self._current_cookie_jar.load(cookie_jar_path)
+        except Exception as e:
+            log.explain("Failed to load cookies")
+            log.explain(str(e))
+
+    def _save_cookies(self) -> None:
         log.explain_topic("Saving cookies")
         if not self._current_cookie_jar:
             log.explain("No cookie jar, save aborted")
             return
 
         try:
+            log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
             self._current_cookie_jar.save(self._cookie_jar_path)
-            log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}")
-        except Exception:
+        except Exception as e:
             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
+            log.warn(str(e))
 
     async def run(self) -> None:
-        self._current_cookie_jar = aiohttp.CookieJar()
         self._request_count = 0
-
-        try:
-            self._current_cookie_jar.load(self._cookie_jar_path)
-        except Exception:
-            pass
+        self._load_cookies()
 
         async with aiohttp.ClientSession(
                 headers={"User-Agent": f"{NAME}/{VERSION}"},
@@ -114,4 +162,4 @@ class HttpCrawler(Crawler):
 
         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
         # They are saved in authenticate, but a final save won't hurt
-        await self._save_cookies()
+        self._save_cookies()
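To make the mechanism above concrete, here is a minimal standalone sketch (names and the string key are illustrative; the real code keys the dict by `Authenticator` objects). Crawlers registered under the same authenticator all hold a reference to one shared list of cookie-jar paths, and `_load_cookies` then picks whichever of those jars was modified most recently, on the assumption that the newest file holds the freshest session:

```python
from pathlib import Path
from typing import Dict, List, Optional

def register_jar(shared: Dict[str, List[Path]], auth: str, jar: Path) -> List[Path]:
    # Mirrors share_cookies(): the first crawler for an authenticator
    # creates the list, later ones append to the very same list object.
    paths = shared.setdefault(auth, [])
    paths.append(jar)
    return paths

def pick_newest(paths: List[Path]) -> Optional[Path]:
    # Mirrors the mtime scan in _load_cookies(): skip missing files and
    # keep the most recently modified jar.
    existing = [p for p in paths if p.is_file()]
    return max(existing, key=lambda p: p.stat().st_mtime, default=None)

shared: Dict[str, List[Path]] = {}
a = register_jar(shared, "ilias-account", Path("crawl-a/.cookies"))
b = register_jar(shared, "ilias-account", Path("crawl-b/.cookies"))
assert a is b            # both crawlers see each other's jar paths
assert pick_newest(a) is None  # no jar has been saved yet
```

Saving, by contrast, always goes to the crawler's own jar path; sharing only affects which file is read at startup, so crawlers never write to the same file.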


@@ -152,12 +152,15 @@ class KitIliasWebCrawler(HttpCrawler):
         config: Config,
         authenticators: Dict[str, Authenticator]
     ):
-        super().__init__(name, section, config)
+        # Setting a main authenticator for cookie sharing
+        auth = section.auth(authenticators)
+        super().__init__(name, section, config, shared_auth=auth)
 
         self._shibboleth_login = KitShibbolethLogin(
-            section.auth(authenticators),
-            section.tfa_auth(authenticators)
+            auth,
+            section.tfa_auth(authenticators),
         )
 
         self._base_url = "https://ilias.studium.kit.edu"
         self._target = section.target()


@@ -1,10 +1,11 @@
+from pathlib import Path
 from typing import Dict, List, Optional
 
 from rich.markup import escape
 
 from .auth import AUTHENTICATORS, Authenticator
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError
+from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
@@ -42,6 +43,9 @@ class Pferd:
     def _load_crawlers(self) -> List[str]:
         names = []
 
+        # Cookie sharing
+        kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {}
+
         for name, section in self._config.crawler_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
             names.append(name)
@@ -54,6 +58,10 @@ class Pferd:
             crawler = crawler_constructor(name, section, self._config, self._authenticators)
             self._crawlers[name] = crawler
 
+            if self._config.default_section.share_cookies():
+                if isinstance(crawler, KitIliasWebCrawler):
+                    crawler.share_cookies(kit_ilias_web_paths)
+
         return names
 
     def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]:
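Taken together, the intended end-to-end behavior for two ILIAS crawlers on one account can be simulated roughly like this (a sketch with made-up paths; `time.sleep` just guarantees distinct mtimes):

```python
import tempfile
import time
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    jar_a = Path(tmp) / "a.cookies"
    jar_b = Path(tmp) / "b.cookies"
    shared = [jar_a, jar_b]  # the list both crawlers registered into

    # Crawler A runs first: it authenticates and saves its session.
    jar_a.write_text("session=abc")
    time.sleep(0.01)

    # Crawler B starts afterwards: the newest existing jar is A's, so B
    # loads A's cookies and skips a second login.
    newest = max((p for p in shared if p.is_file()),
                 key=lambda p: p.stat().st_mtime)
    assert newest == jar_a
```

Setting `share_cookies = no` in the defaults section skips the `crawler.share_cookies(...)` call above, leaving every crawler with only its own jar.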