Implement cookie sharing

parent fca62541ca
commit c687d4a51a

Crawlers that use the same authenticator can now share cookies: each participating crawler registers its cookie jar in a per-authenticator list, and on startup loads whichever jar in that list was written most recently.
@@ -25,6 +25,11 @@ default values for the other sections.
   `yes`)
 - `report`: Whether PFERD should print a report of added, changed and deleted
   local files for all crawlers before exiting. (Default: `yes`)
+- `share_cookies`: Whether crawlers should share cookies where applicable. By
+  default, crawlers are isolated and don't interact with each other. This
+  includes their cookies. However, in situations where multiple crawlers crawl
+  the same website using the same account, sharing cookies between crawlers can
+  make sense. (Default: `yes`)
 
 ## The `crawl:*` sections
 
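Note: as an illustration, the new option would be set in the defaults section of the config file, along the lines of this minimal sketch (the section name and layout are assumptions based on the documentation above, not taken from this commit):

```ini
[DEFAULT]
; Disable cookie sharing for all crawlers (it defaults to `yes`)
share_cookies = no
```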
@@ -169,6 +169,11 @@ PARSER.add_argument(
     action=BooleanOptionalAction,
     help="print a report of all local changes before exiting"
 )
+PARSER.add_argument(
+    "--share-cookies",
+    action=BooleanOptionalAction,
+    help="whether crawlers should share cookies where applicable"
+)
 
 
 def load_default_section(
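Note: `BooleanOptionalAction` (in `argparse` since Python 3.9) auto-generates the negated `--no-share-cookies` flag and leaves the destination at `None` when neither flag is passed, which is what lets the config file value win by default. A minimal self-contained sketch (the throwaway parser here is not PFERD code):

```python
from argparse import ArgumentParser, BooleanOptionalAction

# Throwaway parser demonstrating the flag pair generated above
parser = ArgumentParser()
parser.add_argument("--share-cookies", action=BooleanOptionalAction)

print(parser.parse_args([]).share_cookies)                      # None (not given)
print(parser.parse_args(["--share-cookies"]).share_cookies)     # True
print(parser.parse_args(["--no-share-cookies"]).share_cookies)  # False
```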
@@ -180,7 +185,9 @@ def load_default_section(
     if args.working_dir is not None:
         section["working_dir"] = str(args.working_dir)
     if args.explain is not None:
-        section["explain"] = "true" if args.explain else "false"
+        section["explain"] = "yes" if args.explain else "no"
+    if args.share_cookies is not None:
+        section["share_cookies"] = "yes" if args.share_cookies else "no"
 
 
 SUBPARSERS = PARSER.add_subparsers(title="crawlers")
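Note: switching `explain` from `"true"`/`"false"` to `"yes"`/`"no"` is a consistency fix, not a behavior change: `configparser` treats `1`/`yes`/`true`/`on` and `0`/`no`/`false`/`off` as equivalent booleans, and `yes`/`no` is the spelling the documentation uses. A quick standalone check:

```python
import configparser

# Both spellings parse to the same booleans; "yes"/"no" just matches the docs
cp = configparser.ConfigParser()
cp.read_string("[DEFAULT]\nexplain = yes\nshare_cookies = no\n")
print(cp["DEFAULT"].getboolean("explain"))        # True
print(cp["DEFAULT"].getboolean("share_cookies"))  # False
```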
@@ -81,6 +81,9 @@ class DefaultSection(Section):
     def report(self) -> bool:
         return self.s.getboolean("report", fallback=True)
 
+    def share_cookies(self) -> bool:
+        return self.s.getboolean("share_cookies", fallback=True)
+
 
 class Config:
     @staticmethod
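Note: `fallback=True` is what makes sharing opt-out: when the key is absent from the section, the section proxy would otherwise return `None`. A standalone sketch of that behavior:

```python
import configparser

# With no share_cookies key present, the fallback keeps sharing enabled
cp = configparser.ConfigParser()
cp.read_string("[DEFAULT]\n")
print(cp["DEFAULT"].getboolean("share_cookies"))                 # None
print(cp["DEFAULT"].getboolean("share_cookies", fallback=True))  # True
```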
@@ -1,10 +1,11 @@
 import asyncio
-from pathlib import PurePath
-from typing import Optional
+from pathlib import Path, PurePath
+from typing import Dict, List, Optional
 
 import aiohttp
 from aiohttp.client import ClientTimeout
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import log
 from ..utils import fmt_real_path
@@ -25,17 +26,22 @@ class HttpCrawler(Crawler):
             name: str,
             section: HttpCrawlerSection,
             config: Config,
+            shared_auth: Optional[Authenticator] = None,
     ) -> None:
         super().__init__(name, section, config)
 
-        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
-        self._output_dir.register_reserved(self.COOKIE_FILE)
         self._authentication_id = 0
         self._authentication_lock = asyncio.Lock()
-        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
         self._request_count = 0
         self._http_timeout = section.http_timeout()
 
+        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
+        self._shared_cookie_jar_paths: Optional[List[Path]] = None
+        self._shared_auth = shared_auth
+        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
+
+        self._output_dir.register_reserved(self.COOKIE_FILE)
 
     async def _current_auth_id(self) -> int:
         """
         Returns the id for the current authentication, i.e. an identifier for the last
@@ -71,7 +77,7 @@ class HttpCrawler(Crawler):
             self._authentication_id += 1
             # Saving the cookies after the first auth ensures we won't need to re-authenticate
             # on the next run, should this one be aborted or crash
-            await self._save_cookies()
+            self._save_cookies()
 
     async def _authenticate(self) -> None:
         """
@@ -80,26 +86,68 @@ class HttpCrawler(Crawler):
         """
         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
 
-    async def _save_cookies(self) -> None:
+    def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None:
+        if not self._shared_auth:
+            return
+
+        if self._shared_auth in shared:
+            self._shared_cookie_jar_paths = shared[self._shared_auth]
+        else:
+            self._shared_cookie_jar_paths = []
+            shared[self._shared_auth] = self._shared_cookie_jar_paths
+
+        self._shared_cookie_jar_paths.append(self._cookie_jar_path)
+
+    def _load_cookies(self) -> None:
+        log.explain_topic("Loading cookies")
+        cookie_jar_path: Optional[Path] = None
+
+        if self._shared_cookie_jar_paths is None:
+            log.explain("Not sharing any cookies")
+            cookie_jar_path = self._cookie_jar_path
+        else:
+            log.explain("Sharing cookies")
+            max_mtime: Optional[float] = None
+            for path in self._shared_cookie_jar_paths:
+                if not path.is_file():
+                    log.explain(f"{fmt_real_path(path)} is not a file")
+                    continue
+                mtime = path.stat().st_mtime
+                if max_mtime is None or mtime > max_mtime:
+                    log.explain(f"{fmt_real_path(path)} has newest mtime so far")
+                    max_mtime = mtime
+                    cookie_jar_path = path
+                else:
+                    log.explain(f"{fmt_real_path(path)} has older mtime")
+
+        if cookie_jar_path is None:
+            log.explain("Couldn't find a suitable cookie file")
+            return
+
+        log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
+        try:
+            self._current_cookie_jar = aiohttp.CookieJar()
+            self._current_cookie_jar.load(cookie_jar_path)
+        except Exception as e:
+            log.explain("Failed to load cookies")
+            log.explain(str(e))
+
+    def _save_cookies(self) -> None:
+        log.explain_topic("Saving cookies")
         if not self._current_cookie_jar:
+            log.explain("No cookie jar, save aborted")
             return
 
         try:
+            log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
             self._current_cookie_jar.save(self._cookie_jar_path)
-            log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}")
-        except Exception:
+        except Exception as e:
             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
+            log.warn(str(e))
 
     async def run(self) -> None:
         self._current_cookie_jar = aiohttp.CookieJar()
         self._request_count = 0
 
-        try:
-            self._current_cookie_jar.load(self._cookie_jar_path)
-        except Exception:
-            pass
+        self._load_cookies()
 
         async with aiohttp.ClientSession(
             headers={"User-Agent": f"{NAME}/{VERSION}"},
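Note: stripped of logging and crawler state, the mechanism is two small pieces: a per-authenticator registry of cookie jar paths (`share_cookies`) and a newest-mtime scan over those paths (`_load_cookies`). A self-contained sketch of just that logic; plain strings stand in for `Authenticator` objects, and every name below is hypothetical rather than PFERD code:

```python
import tempfile
import time
from pathlib import Path
from typing import Dict, List, Optional


def register(shared: Dict[str, List[Path]], auth: str, jar: Path) -> List[Path]:
    """Like share_cookies(): crawlers with the same auth share one list of jars."""
    paths = shared.setdefault(auth, [])
    paths.append(jar)
    return paths


def newest_jar(paths: List[Path]) -> Optional[Path]:
    """Like the scan in _load_cookies(): pick the most recently written jar file."""
    existing = [p for p in paths if p.is_file()]
    return max(existing, key=lambda p: p.stat().st_mtime, default=None)


tmp = Path(tempfile.mkdtemp())
a = tmp / "crawler_a_cookies.txt"
a.write_text("jar a")
time.sleep(0.01)  # keep the mtimes distinguishable (coarse filesystems may need more)
b = tmp / "crawler_b_cookies.txt"
b.write_text("jar b")

shared: Dict[str, List[Path]] = {}
paths = register(shared, "ilias-account", a)
paths = register(shared, "ilias-account", b)
print(newest_jar(paths))  # .../crawler_b_cookies.txt, the most recently saved jar
```

Each crawler still saves only to its own jar path; the sharing happens purely at load time, by starting from whichever sibling crawler saved its cookies last.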
@@ -114,4 +162,4 @@ class HttpCrawler(Crawler):
         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
 
         # They are saved in authenticate, but a final save won't hurt
-        await self._save_cookies()
+        self._save_cookies()
@@ -152,12 +152,15 @@ class KitIliasWebCrawler(HttpCrawler):
             config: Config,
             authenticators: Dict[str, Authenticator]
     ):
-        super().__init__(name, section, config)
+        # Setting a main authenticator for cookie sharing
+        auth = section.auth(authenticators)
+        super().__init__(name, section, config, shared_auth=auth)
 
         self._shibboleth_login = KitShibbolethLogin(
-            section.auth(authenticators),
-            section.tfa_auth(authenticators)
+            auth,
+            section.tfa_auth(authenticators),
         )
 
         self._base_url = "https://ilias.studium.kit.edu"
 
         self._target = section.target()
@@ -1,10 +1,11 @@
+from pathlib import Path
 from typing import Dict, List, Optional
 
 from rich.markup import escape
 
 from .auth import AUTHENTICATORS, Authenticator
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError
+from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
@@ -42,6 +43,9 @@ class Pferd:
     def _load_crawlers(self) -> List[str]:
         names = []
 
+        # Cookie sharing
+        kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {}
+
         for name, section in self._config.crawler_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
             names.append(name)
@@ -54,6 +58,10 @@ class Pferd:
             crawler = crawler_constructor(name, section, self._config, self._authenticators)
             self._crawlers[name] = crawler
 
+            if self._config.default_section.share_cookies():
+                if isinstance(crawler, KitIliasWebCrawler):
+                    crawler.share_cookies(kit_ilias_web_paths)
+
         return names
 
     def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]:
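Note: because the registry is keyed by `Authenticator`, cookies can never leak across accounts: crawlers configured with the same authenticator join one list, while any other authenticator gets a fresh one. The `isinstance` check limits sharing to `KitIliasWebCrawler`, the only crawler type that passes a `shared_auth` in this commit. A tiny sketch of the resulting grouping (strings stand in for authenticators; the names and paths are made up):

```python
from pathlib import Path
from typing import Dict, List

shared: Dict[str, List[Path]] = {}

# Two crawlers using the same account end up sharing one list of jar paths...
shared.setdefault("auth:ilias", []).append(Path("sem1") / "cookies")
shared.setdefault("auth:ilias", []).append(Path("sem2") / "cookies")
# ...while a crawler with a different account stays isolated
shared.setdefault("auth:other", []).append(Path("misc") / "cookies")

print(len(shared["auth:ilias"]))  # 2
print(len(shared["auth:other"]))  # 1
```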