Implement cookie sharing

Joscha 2021-05-24 13:10:19 +02:00
parent fca62541ca
commit c687d4a51a
6 changed files with 95 additions and 21 deletions

View File

@@ -25,6 +25,11 @@ default values for the other sections.
   `yes`)
 - `report`: Whether PFERD should print a report of added, changed and deleted
   local files for all crawlers before exiting. (Default: `yes`)
+- `share_cookies`: Whether crawlers should share cookies where applicable. By
+  default, crawlers are isolated and don't interact with each other. This
+  includes their cookies. However, in situations where multiple crawlers crawl
+  the same website using the same account, sharing cookies between crawlers can
+  make sense. (Default: `yes`)
 
 ## The `crawl:*` sections
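
To make the new option concrete, here is a hedged sketch of a config where two crawlers log into the same ILIAS account and would therefore share cookies by default; setting `share_cookies = no` in the default section would isolate them again. The section names, crawler names and the other keys below are illustrative only, not part of this commit:

    [DEFAULT]
    share_cookies = yes

    [crawl:lectures]
    type = kit-ilias-web
    auth = auth:ilias

    [crawl:tutorials]
    type = kit-ilias-web
    auth = auth:ilias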

View File

@@ -169,6 +169,11 @@ PARSER.add_argument(
     action=BooleanOptionalAction,
     help="print a report of all local changes before exiting"
 )
+PARSER.add_argument(
+    "--share-cookies",
+    action=BooleanOptionalAction,
+    help="whether crawlers should share cookies where applicable"
+)
 
 
 def load_default_section(
@@ -180,7 +185,9 @@ def load_default_section(
     if args.working_dir is not None:
         section["working_dir"] = str(args.working_dir)
     if args.explain is not None:
-        section["explain"] = "true" if args.explain else "false"
+        section["explain"] = "yes" if args.explain else "no"
+    if args.share_cookies is not None:
+        section["share_cookies"] = "yes" if args.share_cookies else "no"
 
 
 SUBPARSERS = PARSER.add_subparsers(title="crawlers")
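
Because the flag is declared with `BooleanOptionalAction`, argparse also generates a negated `--no-share-cookies` form, so the config value can be overridden in either direction from the command line. For example (a hypothetical invocation, assuming the usual `pferd` entry point):

    pferd --no-share-cookies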

View File

@@ -81,6 +81,9 @@ class DefaultSection(Section):
     def report(self) -> bool:
         return self.s.getboolean("report", fallback=True)
 
+    def share_cookies(self) -> bool:
+        return self.s.getboolean("share_cookies", fallback=True)
+
 
 class Config:
     @staticmethod

View File

@@ -1,10 +1,11 @@
 import asyncio
-from pathlib import PurePath
-from typing import Optional
+from pathlib import Path, PurePath
+from typing import Dict, List, Optional
 
 import aiohttp
 from aiohttp.client import ClientTimeout
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import log
 from ..utils import fmt_real_path
@@ -25,17 +26,22 @@ class HttpCrawler(Crawler):
             name: str,
             section: HttpCrawlerSection,
             config: Config,
+            shared_auth: Optional[Authenticator] = None,
     ) -> None:
         super().__init__(name, section, config)
 
-        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
-        self._output_dir.register_reserved(self.COOKIE_FILE)
         self._authentication_id = 0
         self._authentication_lock = asyncio.Lock()
-        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
         self._request_count = 0
         self._http_timeout = section.http_timeout()
 
+        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
+        self._shared_cookie_jar_paths: Optional[List[Path]] = None
+        self._shared_auth = shared_auth
+        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
+
+        self._output_dir.register_reserved(self.COOKIE_FILE)
+
     async def _current_auth_id(self) -> int:
         """
         Returns the id for the current authentication, i.e. an identifier for the last
@@ -71,7 +77,7 @@ class HttpCrawler(Crawler):
             self._authentication_id += 1
             # Saving the cookies after the first auth ensures we won't need to re-authenticate
             # on the next run, should this one be aborted or crash
-            await self._save_cookies()
+            self._save_cookies()
 
     async def _authenticate(self) -> None:
         """
@@ -80,26 +86,68 @@ class HttpCrawler(Crawler):
         """
         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
 
-    async def _save_cookies(self) -> None:
+    def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None:
+        if not self._shared_auth:
+            return
+
+        if self._shared_auth in shared:
+            self._shared_cookie_jar_paths = shared[self._shared_auth]
+        else:
+            self._shared_cookie_jar_paths = []
+            shared[self._shared_auth] = self._shared_cookie_jar_paths
+
+        self._shared_cookie_jar_paths.append(self._cookie_jar_path)
+
+    def _load_cookies(self) -> None:
+        log.explain_topic("Loading cookies")
+
+        cookie_jar_path: Optional[Path] = None
+
+        if self._shared_cookie_jar_paths is None:
+            log.explain("Not sharing any cookies")
+            cookie_jar_path = self._cookie_jar_path
+        else:
+            log.explain("Sharing cookies")
+            max_mtime: Optional[float] = None
+            for path in self._shared_cookie_jar_paths:
+                if not path.is_file():
+                    log.explain(f"{fmt_real_path(path)} is not a file")
+                    continue
+                mtime = path.stat().st_mtime
+                if max_mtime is None or mtime > max_mtime:
+                    log.explain(f"{fmt_real_path(path)} has newest mtime so far")
+                    max_mtime = mtime
+                    cookie_jar_path = path
+                else:
+                    log.explain(f"{fmt_real_path(path)} has older mtime")
+
+        if cookie_jar_path is None:
+            log.explain("Couldn't find a suitable cookie file")
+            return
+
+        log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
+        try:
+            self._current_cookie_jar = aiohttp.CookieJar()
+            self._current_cookie_jar.load(cookie_jar_path)
+        except Exception as e:
+            log.explain("Failed to load cookies")
+            log.explain(str(e))
+
+    def _save_cookies(self) -> None:
         log.explain_topic("Saving cookies")
         if not self._current_cookie_jar:
             log.explain("No cookie jar, save aborted")
             return
 
         try:
+            log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
             self._current_cookie_jar.save(self._cookie_jar_path)
-            log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}")
-        except Exception:
+        except Exception as e:
             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
+            log.warn(str(e))
 
     async def run(self) -> None:
-        self._current_cookie_jar = aiohttp.CookieJar()
         self._request_count = 0
+        self._load_cookies()
 
-        try:
-            self._current_cookie_jar.load(self._cookie_jar_path)
-        except Exception:
-            pass
-
         async with aiohttp.ClientSession(
                 headers={"User-Agent": f"{NAME}/{VERSION}"},
@@ -114,4 +162,4 @@ class HttpCrawler(Crawler):
         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
 
         # They are saved in authenticate, but a final save won't hurt
-        await self._save_cookies()
+        self._save_cookies()
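
To illustrate the sharing mechanism in isolation: crawlers constructed with the same authenticator all append their cookie-jar path to one shared list, and because each keeps a reference to that same list, every one of them later loads whichever jar was written most recently. A minimal, self-contained sketch of that pattern (the names and paths are made up for illustration, not PFERD API):

    from pathlib import Path
    from typing import Dict, List

    def register(shared: Dict[str, List[Path]], auth: str, jar: Path) -> None:
        # Reuse the per-authenticator list if it exists, otherwise create it,
        # then record this crawler's own cookie file in it.
        paths = shared.setdefault(auth, [])
        paths.append(jar)

    shared: Dict[str, List[Path]] = {}
    register(shared, "ilias-account", Path("lectures/.cookies"))
    register(shared, "ilias-account", Path("tutorials/.cookies"))

    # Both "crawlers" now see the same list and would pick whichever of the
    # two cookie files has the newest mtime when they start running.
    print(shared["ilias-account"])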

View File

@@ -152,12 +152,15 @@ class KitIliasWebCrawler(HttpCrawler):
             config: Config,
             authenticators: Dict[str, Authenticator]
     ):
-        super().__init__(name, section, config)
+        # Setting a main authenticator for cookie sharing
+        auth = section.auth(authenticators)
+        super().__init__(name, section, config, shared_auth=auth)
 
         self._shibboleth_login = KitShibbolethLogin(
-            section.auth(authenticators),
-            section.tfa_auth(authenticators)
+            auth,
+            section.tfa_auth(authenticators),
         )
 
         self._base_url = "https://ilias.studium.kit.edu"
         self._target = section.target()

View File

@@ -1,10 +1,11 @@
+from pathlib import Path
 from typing import Dict, List, Optional
 
 from rich.markup import escape
 
 from .auth import AUTHENTICATORS, Authenticator
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError
+from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
@@ -42,6 +43,9 @@ class Pferd:
     def _load_crawlers(self) -> List[str]:
         names = []
 
+        # Cookie sharing
+        kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {}
+
         for name, section in self._config.crawler_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
             names.append(name)
@@ -54,6 +58,10 @@ class Pferd:
             crawler = crawler_constructor(name, section, self._config, self._authenticators)
             self._crawlers[name] = crawler
 
+            if self._config.default_section.share_cookies():
+                if isinstance(crawler, KitIliasWebCrawler):
+                    crawler.share_cookies(kit_ilias_web_paths)
+
         return names
 
     def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]: