Implement cookie sharing

parent fca62541ca
commit c687d4a51a

Crawlers that use the same authenticator can now share cookies: each participating crawler registers its cookie jar in a per-authenticator list, and on startup loads whichever jar in that list was written most recently.
@@ -25,6 +25,11 @@ default values for the other sections.
   `yes`)
 - `report`: Whether PFERD should print a report of added, changed and deleted
   local files for all crawlers before exiting. (Default: `yes`)
+- `share_cookies`: Whether crawlers should share cookies where applicable. By
+  default, crawlers are isolated and don't interact with each other. This
+  includes their cookies. However, in situations where multiple crawlers crawl
+  the same website using the same account, sharing cookies between crawlers can
+  make sense. (Default: `yes`)
 
 ## The `crawl:*` sections
 
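Note: as an illustration, the new option would be set in the defaults section of the config file, along the lines of this minimal sketch (the section name and layout are assumptions based on the documentation above, not taken from this commit):

```ini
[DEFAULT]
; Disable cookie sharing for all crawlers (it defaults to `yes`)
share_cookies = no
```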
@@ -169,6 +169,11 @@ PARSER.add_argument(
     action=BooleanOptionalAction,
     help="print a report of all local changes before exiting"
 )
+PARSER.add_argument(
+    "--share-cookies",
+    action=BooleanOptionalAction,
+    help="whether crawlers should share cookies where applicable"
+)
 
 
 def load_default_section(
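Note: `BooleanOptionalAction` (in `argparse` since Python 3.9) auto-generates the negated `--no-share-cookies` flag and leaves the destination at `None` when neither flag is passed, which is what lets the config file value win by default. A minimal self-contained sketch (the throwaway parser here is not PFERD code):

```python
from argparse import ArgumentParser, BooleanOptionalAction

# Throwaway parser demonstrating the flag pair generated above
parser = ArgumentParser()
parser.add_argument("--share-cookies", action=BooleanOptionalAction)

print(parser.parse_args([]).share_cookies)                      # None (not given)
print(parser.parse_args(["--share-cookies"]).share_cookies)     # True
print(parser.parse_args(["--no-share-cookies"]).share_cookies)  # False
```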
@@ -180,7 +185,9 @@ def load_default_section(
     if args.working_dir is not None:
         section["working_dir"] = str(args.working_dir)
     if args.explain is not None:
-        section["explain"] = "true" if args.explain else "false"
+        section["explain"] = "yes" if args.explain else "no"
+    if args.share_cookies is not None:
+        section["share_cookies"] = "yes" if args.share_cookies else "no"
 
 
 SUBPARSERS = PARSER.add_subparsers(title="crawlers")
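Note: switching `explain` from `"true"`/`"false"` to `"yes"`/`"no"` is a consistency fix, not a behavior change: `configparser` treats `1`/`yes`/`true`/`on` and `0`/`no`/`false`/`off` as equivalent booleans, and `yes`/`no` is the spelling the documentation uses. A quick standalone check:

```python
import configparser

# Both spellings parse to the same booleans; "yes"/"no" just matches the docs
cp = configparser.ConfigParser()
cp.read_string("[DEFAULT]\nexplain = yes\nshare_cookies = no\n")
print(cp["DEFAULT"].getboolean("explain"))        # True
print(cp["DEFAULT"].getboolean("share_cookies"))  # False
```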
@@ -81,6 +81,9 @@ class DefaultSection(Section):
     def report(self) -> bool:
         return self.s.getboolean("report", fallback=True)
 
+    def share_cookies(self) -> bool:
+        return self.s.getboolean("share_cookies", fallback=True)
+
 
 class Config:
     @staticmethod
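Note: `fallback=True` is what makes sharing opt-out: when the key is absent from the section, the section proxy would otherwise return `None`. A standalone sketch of that behavior:

```python
import configparser

# With no share_cookies key present, the fallback keeps sharing enabled
cp = configparser.ConfigParser()
cp.read_string("[DEFAULT]\n")
print(cp["DEFAULT"].getboolean("share_cookies"))                 # None
print(cp["DEFAULT"].getboolean("share_cookies", fallback=True))  # True
```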
@@ -1,10 +1,11 @@
 import asyncio
-from pathlib import PurePath
-from typing import Optional
+from pathlib import Path, PurePath
+from typing import Dict, List, Optional
 
 import aiohttp
 from aiohttp.client import ClientTimeout
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import log
 from ..utils import fmt_real_path
@@ -25,17 +26,22 @@ class HttpCrawler(Crawler):
             name: str,
             section: HttpCrawlerSection,
             config: Config,
+            shared_auth: Optional[Authenticator] = None,
     ) -> None:
         super().__init__(name, section, config)
 
-        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
-        self._output_dir.register_reserved(self.COOKIE_FILE)
         self._authentication_id = 0
         self._authentication_lock = asyncio.Lock()
-        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
         self._request_count = 0
         self._http_timeout = section.http_timeout()
 
+        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
+        self._shared_cookie_jar_paths: Optional[List[Path]] = None
+        self._shared_auth = shared_auth
+        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
+
+        self._output_dir.register_reserved(self.COOKIE_FILE)
 
     async def _current_auth_id(self) -> int:
         """
         Returns the id for the current authentication, i.e. an identifier for the last
@@ -71,7 +77,7 @@ class HttpCrawler(Crawler):
             self._authentication_id += 1
             # Saving the cookies after the first auth ensures we won't need to re-authenticate
             # on the next run, should this one be aborted or crash
-            await self._save_cookies()
+            self._save_cookies()
 
     async def _authenticate(self) -> None:
         """
@@ -80,26 +86,68 @@ class HttpCrawler(Crawler):
         """
         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
 
-    async def _save_cookies(self) -> None:
+    def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None:
+        if not self._shared_auth:
+            return
+
+        if self._shared_auth in shared:
+            self._shared_cookie_jar_paths = shared[self._shared_auth]
+        else:
+            self._shared_cookie_jar_paths = []
+            shared[self._shared_auth] = self._shared_cookie_jar_paths
+
+        self._shared_cookie_jar_paths.append(self._cookie_jar_path)
+
+    def _load_cookies(self) -> None:
+        log.explain_topic("Loading cookies")
+        cookie_jar_path: Optional[Path] = None
+
+        if self._shared_cookie_jar_paths is None:
+            log.explain("Not sharing any cookies")
+            cookie_jar_path = self._cookie_jar_path
+        else:
+            log.explain("Sharing cookies")
+            max_mtime: Optional[float] = None
+            for path in self._shared_cookie_jar_paths:
+                if not path.is_file():
+                    log.explain(f"{fmt_real_path(path)} is not a file")
+                    continue
+                mtime = path.stat().st_mtime
+                if max_mtime is None or mtime > max_mtime:
+                    log.explain(f"{fmt_real_path(path)} has newest mtime so far")
+                    max_mtime = mtime
+                    cookie_jar_path = path
+                else:
+                    log.explain(f"{fmt_real_path(path)} has older mtime")
+
+        if cookie_jar_path is None:
+            log.explain("Couldn't find a suitable cookie file")
+            return
+
+        log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
+        try:
+            self._current_cookie_jar = aiohttp.CookieJar()
+            self._current_cookie_jar.load(cookie_jar_path)
+        except Exception as e:
+            log.explain("Failed to load cookies")
+            log.explain(str(e))
+
+    def _save_cookies(self) -> None:
+        log.explain_topic("Saving cookies")
         if not self._current_cookie_jar:
+            log.explain("No cookie jar, save aborted")
             return
 
         try:
+            log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
             self._current_cookie_jar.save(self._cookie_jar_path)
-            log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}")
-        except Exception:
+        except Exception as e:
             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
+            log.warn(str(e))
 
     async def run(self) -> None:
         self._current_cookie_jar = aiohttp.CookieJar()
         self._request_count = 0
 
-        try:
-            self._current_cookie_jar.load(self._cookie_jar_path)
-        except Exception:
-            pass
+        self._load_cookies()
 
         async with aiohttp.ClientSession(
             headers={"User-Agent": f"{NAME}/{VERSION}"},
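Note: stripped of logging and crawler state, the mechanism is two small pieces: a per-authenticator registry of cookie jar paths (`share_cookies`) and a newest-mtime scan over those paths (`_load_cookies`). A self-contained sketch of just that logic; plain strings stand in for `Authenticator` objects, and every name below is hypothetical rather than PFERD code:

```python
import tempfile
import time
from pathlib import Path
from typing import Dict, List, Optional


def register(shared: Dict[str, List[Path]], auth: str, jar: Path) -> List[Path]:
    """Like share_cookies(): crawlers with the same auth share one list of jars."""
    paths = shared.setdefault(auth, [])
    paths.append(jar)
    return paths


def newest_jar(paths: List[Path]) -> Optional[Path]:
    """Like the scan in _load_cookies(): pick the most recently written jar file."""
    existing = [p for p in paths if p.is_file()]
    return max(existing, key=lambda p: p.stat().st_mtime, default=None)


tmp = Path(tempfile.mkdtemp())
a = tmp / "crawler_a_cookies.txt"
a.write_text("jar a")
time.sleep(0.01)  # keep the mtimes distinguishable (coarse filesystems may need more)
b = tmp / "crawler_b_cookies.txt"
b.write_text("jar b")

shared: Dict[str, List[Path]] = {}
paths = register(shared, "ilias-account", a)
paths = register(shared, "ilias-account", b)
print(newest_jar(paths))  # .../crawler_b_cookies.txt, the most recently saved jar
```

Each crawler still saves only to its own jar path; the sharing happens purely at load time, by starting from whichever sibling crawler saved its cookies last.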
@@ -114,4 +162,4 @@ class HttpCrawler(Crawler):
         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
 
         # They are saved in authenticate, but a final save won't hurt
-        await self._save_cookies()
+        self._save_cookies()
@@ -152,12 +152,15 @@ class KitIliasWebCrawler(HttpCrawler):
             config: Config,
             authenticators: Dict[str, Authenticator]
     ):
-        super().__init__(name, section, config)
+        # Setting a main authenticator for cookie sharing
+        auth = section.auth(authenticators)
+        super().__init__(name, section, config, shared_auth=auth)
 
         self._shibboleth_login = KitShibbolethLogin(
-            section.auth(authenticators),
-            section.tfa_auth(authenticators)
+            auth,
+            section.tfa_auth(authenticators),
         )
 
         self._base_url = "https://ilias.studium.kit.edu"
 
         self._target = section.target()
@@ -1,10 +1,11 @@
+from pathlib import Path
 from typing import Dict, List, Optional
 
 from rich.markup import escape
 
 from .auth import AUTHENTICATORS, Authenticator
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError
+from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
@@ -42,6 +43,9 @@ class Pferd:
     def _load_crawlers(self) -> List[str]:
         names = []
 
+        # Cookie sharing
+        kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {}
+
         for name, section in self._config.crawler_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
             names.append(name)
@@ -54,6 +58,10 @@ class Pferd:
             crawler = crawler_constructor(name, section, self._config, self._authenticators)
             self._crawlers[name] = crawler
 
+            if self._config.default_section.share_cookies():
+                if isinstance(crawler, KitIliasWebCrawler):
+                    crawler.share_cookies(kit_ilias_web_paths)
+
         return names
 
     def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]:
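Note: because the registry is keyed by `Authenticator`, cookies can never leak across accounts: crawlers configured with the same authenticator join one list, while any other authenticator gets a fresh one. The `isinstance` check limits sharing to `KitIliasWebCrawler`, the only crawler type that passes a `shared_auth` in this commit. A tiny sketch of the resulting grouping (strings stand in for authenticators; the names and paths are made up):

```python
from pathlib import Path
from typing import Dict, List

shared: Dict[str, List[Path]] = {}

# Two crawlers using the same account end up sharing one list of jar paths...
shared.setdefault("auth:ilias", []).append(Path("sem1") / "cookies")
shared.setdefault("auth:ilias", []).append(Path("sem2") / "cookies")
# ...while a crawler with a different account stays isolated
shared.setdefault("auth:other", []).append(Path("misc") / "cookies")

print(len(shared["auth:ilias"]))  # 2
print(len(shared["auth:other"]))  # 1
```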