Implement cookie sharing

Joscha 2021-05-24 13:10:19 +02:00
parent fca62541ca
commit c687d4a51a
6 changed files with 95 additions and 21 deletions

View File

@@ -25,6 +25,11 @@ default values for the other sections.
   `yes`)
 - `report`: Whether PFERD should print a report of added, changed and deleted
   local files for all crawlers before exiting. (Default: `yes`)
+- `share_cookies`: Whether crawlers should share cookies where applicable. By
+  default, crawlers are isolated and don't interact with each other. This
+  includes their cookies. However, in situations where multiple crawlers crawl
+  the same website using the same account, sharing cookies between crawlers can
+  make sense. (Default: `yes`)
 
 ## The `crawl:*` sections
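
To make the new option concrete, here is a hedged sketch of a config where two crawlers log into the same ILIAS account and would therefore share cookies by default; setting `share_cookies = no` in the default section would isolate them again. The section names, crawler names and the other keys below are illustrative only, not part of this commit:

    [DEFAULT]
    share_cookies = yes

    [crawl:lectures]
    type = kit-ilias-web
    auth = auth:ilias

    [crawl:tutorials]
    type = kit-ilias-web
    auth = auth:ilias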

View File

@@ -169,6 +169,11 @@ PARSER.add_argument(
     action=BooleanOptionalAction,
     help="print a report of all local changes before exiting"
 )
+PARSER.add_argument(
+    "--share-cookies",
+    action=BooleanOptionalAction,
+    help="whether crawlers should share cookies where applicable"
+)
 
 
 def load_default_section(
@@ -180,7 +185,9 @@ def load_default_section(
     if args.working_dir is not None:
         section["working_dir"] = str(args.working_dir)
     if args.explain is not None:
-        section["explain"] = "true" if args.explain else "false"
+        section["explain"] = "yes" if args.explain else "no"
+    if args.share_cookies is not None:
+        section["share_cookies"] = "yes" if args.share_cookies else "no"
 
 
 SUBPARSERS = PARSER.add_subparsers(title="crawlers")
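
Because the flag is declared with `BooleanOptionalAction`, argparse also generates a negated `--no-share-cookies` form, so the config value can be overridden in either direction from the command line. For example (a hypothetical invocation, assuming the usual `pferd` entry point):

    pferd --no-share-cookies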

View File

@@ -81,6 +81,9 @@ class DefaultSection(Section):
     def report(self) -> bool:
         return self.s.getboolean("report", fallback=True)
 
+    def share_cookies(self) -> bool:
+        return self.s.getboolean("share_cookies", fallback=True)
+
 
 class Config:
     @staticmethod

View File

@@ -1,10 +1,11 @@
 import asyncio
-from pathlib import PurePath
-from typing import Optional
+from pathlib import Path, PurePath
+from typing import Dict, List, Optional
 
 import aiohttp
 from aiohttp.client import ClientTimeout
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import log
 from ..utils import fmt_real_path
@@ -25,17 +26,22 @@ class HttpCrawler(Crawler):
             name: str,
             section: HttpCrawlerSection,
             config: Config,
+            shared_auth: Optional[Authenticator] = None,
     ) -> None:
         super().__init__(name, section, config)
 
-        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
-        self._output_dir.register_reserved(self.COOKIE_FILE)
         self._authentication_id = 0
         self._authentication_lock = asyncio.Lock()
-        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
         self._request_count = 0
         self._http_timeout = section.http_timeout()
 
+        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
+        self._shared_cookie_jar_paths: Optional[List[Path]] = None
+        self._shared_auth = shared_auth
+        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
+
+        self._output_dir.register_reserved(self.COOKIE_FILE)
+
     async def _current_auth_id(self) -> int:
         """
         Returns the id for the current authentication, i.e. an identifier for the last
@@ -71,7 +77,7 @@ class HttpCrawler(Crawler):
             self._authentication_id += 1
             # Saving the cookies after the first auth ensures we won't need to re-authenticate
             # on the next run, should this one be aborted or crash
-            await self._save_cookies()
+            self._save_cookies()
 
     async def _authenticate(self) -> None:
         """
@@ -80,26 +86,68 @@ class HttpCrawler(Crawler):
         """
         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
 
-    async def _save_cookies(self) -> None:
+    def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None:
+        if not self._shared_auth:
+            return
+
+        if self._shared_auth in shared:
+            self._shared_cookie_jar_paths = shared[self._shared_auth]
+        else:
+            self._shared_cookie_jar_paths = []
+            shared[self._shared_auth] = self._shared_cookie_jar_paths
+
+        self._shared_cookie_jar_paths.append(self._cookie_jar_path)
+
+    def _load_cookies(self) -> None:
+        log.explain_topic("Loading cookies")
+
+        cookie_jar_path: Optional[Path] = None
+
+        if self._shared_cookie_jar_paths is None:
+            log.explain("Not sharing any cookies")
+            cookie_jar_path = self._cookie_jar_path
+        else:
+            log.explain("Sharing cookies")
+            max_mtime: Optional[float] = None
+            for path in self._shared_cookie_jar_paths:
+                if not path.is_file():
+                    log.explain(f"{fmt_real_path(path)} is not a file")
+                    continue
+                mtime = path.stat().st_mtime
+                if max_mtime is None or mtime > max_mtime:
+                    log.explain(f"{fmt_real_path(path)} has newest mtime so far")
+                    max_mtime = mtime
+                    cookie_jar_path = path
+                else:
+                    log.explain(f"{fmt_real_path(path)} has older mtime")
+
+        if cookie_jar_path is None:
+            log.explain("Couldn't find a suitable cookie file")
+            return
+
+        log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
+        try:
+            self._current_cookie_jar = aiohttp.CookieJar()
+            self._current_cookie_jar.load(cookie_jar_path)
+        except Exception as e:
+            log.explain("Failed to load cookies")
+            log.explain(str(e))
+
+    def _save_cookies(self) -> None:
         log.explain_topic("Saving cookies")
         if not self._current_cookie_jar:
             log.explain("No cookie jar, save aborted")
             return
 
         try:
+            log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
             self._current_cookie_jar.save(self._cookie_jar_path)
-            log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}")
-        except Exception:
+        except Exception as e:
             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
+            log.warn(str(e))
 
     async def run(self) -> None:
-        self._current_cookie_jar = aiohttp.CookieJar()
         self._request_count = 0
+        self._load_cookies()
 
-        try:
-            self._current_cookie_jar.load(self._cookie_jar_path)
-        except Exception:
-            pass
-
         async with aiohttp.ClientSession(
                 headers={"User-Agent": f"{NAME}/{VERSION}"},
@@ -114,4 +162,4 @@ class HttpCrawler(Crawler):
         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
 
         # They are saved in authenticate, but a final save won't hurt
-        await self._save_cookies()
+        self._save_cookies()
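
To illustrate the sharing mechanism in isolation: crawlers constructed with the same authenticator all append their cookie-jar path to one shared list, and because each keeps a reference to that same list, every one of them later loads whichever jar was written most recently. A minimal, self-contained sketch of that pattern (the names and paths are made up for illustration, not PFERD API):

    from pathlib import Path
    from typing import Dict, List

    def register(shared: Dict[str, List[Path]], auth: str, jar: Path) -> None:
        # Reuse the per-authenticator list if it exists, otherwise create it,
        # then record this crawler's own cookie file in it.
        paths = shared.setdefault(auth, [])
        paths.append(jar)

    shared: Dict[str, List[Path]] = {}
    register(shared, "ilias-account", Path("lectures/.cookies"))
    register(shared, "ilias-account", Path("tutorials/.cookies"))

    # Both "crawlers" now see the same list and would pick whichever of the
    # two cookie files has the newest mtime when they start running.
    print(shared["ilias-account"])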

View File

@@ -152,12 +152,15 @@ class KitIliasWebCrawler(HttpCrawler):
             config: Config,
             authenticators: Dict[str, Authenticator]
     ):
-        super().__init__(name, section, config)
+        # Setting a main authenticator for cookie sharing
+        auth = section.auth(authenticators)
+        super().__init__(name, section, config, shared_auth=auth)
 
         self._shibboleth_login = KitShibbolethLogin(
-            section.auth(authenticators),
-            section.tfa_auth(authenticators)
+            auth,
+            section.tfa_auth(authenticators),
         )
 
         self._base_url = "https://ilias.studium.kit.edu"
         self._target = section.target()

View File

@@ -1,10 +1,11 @@
+from pathlib import Path
 from typing import Dict, List, Optional
 
 from rich.markup import escape
 
 from .auth import AUTHENTICATORS, Authenticator
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError
+from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
@@ -42,6 +43,9 @@ class Pferd:
     def _load_crawlers(self) -> List[str]:
         names = []
 
+        # Cookie sharing
+        kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {}
+
         for name, section in self._config.crawler_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
             names.append(name)
@@ -54,6 +58,10 @@ class Pferd:
             crawler = crawler_constructor(name, section, self._config, self._authenticators)
             self._crawlers[name] = crawler
 
+            if self._config.default_section.share_cookies():
+                if isinstance(crawler, KitIliasWebCrawler):
+                    crawler.share_cookies(kit_ilias_web_paths)
+
         return names
 
     def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]: