Mirror of https://github.com/Garmelon/PFERD.git (synced 2023-12-21 10:23:01 +01:00)

Implement cookie sharing
Commit c687d4a51a (parent fca62541ca)
@@ -25,6 +25,11 @@ default values for the other sections.
   `yes`)
 - `report`: Whether PFERD should print a report of added, changed and deleted
   local files for all crawlers before exiting. (Default: `yes`)
+- `share_cookies`: Whether crawlers should share cookies where applicable. By
+  default, crawlers are isolated and don't interact with each other. This
+  includes their cookies. However, in situations where multiple crawlers crawl
+  the same website using the same account, sharing cookies between crawlers can
+  make sense. (Default: `yes`)
 
 ## The `crawl:*` sections
 
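To make the new option concrete: disabling cookie sharing globally would look roughly like this in the config file (a minimal sketch; the `[DEFAULT]` section name follows configparser's convention, and only `share_cookies` is the key added by this commit):

    [DEFAULT]
    # keep every crawler's cookies isolated
    share_cookies = no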
@@ -169,6 +169,11 @@ PARSER.add_argument(
     action=BooleanOptionalAction,
     help="print a report of all local changes before exiting"
 )
+PARSER.add_argument(
+    "--share-cookies",
+    action=BooleanOptionalAction,
+    help="whether crawlers should share cookies where applicable"
+)
 
 
 def load_default_section(
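`BooleanOptionalAction` (standard library `argparse`, Python 3.9+) registers both a positive and a negated flag and leaves the destination at `None` when neither is given, which is what the `is not None` checks in the next hunk rely on. A minimal standalone sketch with a hypothetical parser, not PFERD's actual one:

    import argparse
    from argparse import BooleanOptionalAction

    # hypothetical stand-in parser, just to show the flag behaviour
    parser = argparse.ArgumentParser()
    parser.add_argument("--share-cookies", action=BooleanOptionalAction)

    print(parser.parse_args([]).share_cookies)                      # None: option not mentioned
    print(parser.parse_args(["--share-cookies"]).share_cookies)     # True
    print(parser.parse_args(["--no-share-cookies"]).share_cookies)  # False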
@@ -180,7 +185,9 @@ def load_default_section(
     if args.working_dir is not None:
         section["working_dir"] = str(args.working_dir)
     if args.explain is not None:
-        section["explain"] = "true" if args.explain else "false"
+        section["explain"] = "yes" if args.explain else "no"
+    if args.share_cookies is not None:
+        section["share_cookies"] = "yes" if args.share_cookies else "no"
 
 
 SUBPARSERS = PARSER.add_subparsers(title="crawlers")
@@ -81,6 +81,9 @@ class DefaultSection(Section):
     def report(self) -> bool:
         return self.s.getboolean("report", fallback=True)
 
+    def share_cookies(self) -> bool:
+        return self.s.getboolean("share_cookies", fallback=True)
+
 
 class Config:
     @staticmethod
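The `"yes"`/`"no"` strings written by `load_default_section` and the `fallback=True` in `share_cookies()` fit together because `configparser` accepts `yes`/`no` (as well as `true`/`false`, `on`/`off`, `1`/`0`) as boolean values. A small sketch of that behaviour, independent of PFERD's own `Section` wrapper:

    from configparser import ConfigParser

    parser = ConfigParser()
    parser.read_string("[DEFAULT]\nshare_cookies = no\n")
    section = parser["DEFAULT"]

    # "no" parses to False; a missing key falls back to the given default
    print(section.getboolean("share_cookies", fallback=True))   # False
    print(section.getboolean("does_not_exist", fallback=True))  # True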
@@ -1,10 +1,11 @@
 import asyncio
-from pathlib import PurePath
-from typing import Optional
+from pathlib import Path, PurePath
+from typing import Dict, List, Optional
 
 import aiohttp
 from aiohttp.client import ClientTimeout
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import log
 from ..utils import fmt_real_path
@@ -25,17 +26,22 @@ class HttpCrawler(Crawler):
             name: str,
             section: HttpCrawlerSection,
             config: Config,
+            shared_auth: Optional[Authenticator] = None,
     ) -> None:
         super().__init__(name, section, config)
 
-        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
-        self._output_dir.register_reserved(self.COOKIE_FILE)
         self._authentication_id = 0
         self._authentication_lock = asyncio.Lock()
-        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
         self._request_count = 0
         self._http_timeout = section.http_timeout()
 
+        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
+        self._shared_cookie_jar_paths: Optional[List[Path]] = None
+        self._shared_auth = shared_auth
+        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
+
+        self._output_dir.register_reserved(self.COOKIE_FILE)
+
     async def _current_auth_id(self) -> int:
         """
         Returns the id for the current authentication, i.e. an identifier for the last
@@ -71,7 +77,7 @@ class HttpCrawler(Crawler):
             self._authentication_id += 1
             # Saving the cookies after the first auth ensures we won't need to re-authenticate
             # on the next run, should this one be aborted or crash
-            await self._save_cookies()
+            self._save_cookies()
 
     async def _authenticate(self) -> None:
         """
@@ -80,26 +86,68 @@ class HttpCrawler(Crawler):
         """
         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
 
-    async def _save_cookies(self) -> None:
+    def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None:
+        if not self._shared_auth:
+            return
+
+        if self._shared_auth in shared:
+            self._shared_cookie_jar_paths = shared[self._shared_auth]
+        else:
+            self._shared_cookie_jar_paths = []
+            shared[self._shared_auth] = self._shared_cookie_jar_paths
+
+        self._shared_cookie_jar_paths.append(self._cookie_jar_path)
+
+    def _load_cookies(self) -> None:
+        log.explain_topic("Loading cookies")
+        cookie_jar_path: Optional[Path] = None
+
+        if self._shared_cookie_jar_paths is None:
+            log.explain("Not sharing any cookies")
+            cookie_jar_path = self._cookie_jar_path
+        else:
+            log.explain("Sharing cookies")
+            max_mtime: Optional[float] = None
+            for path in self._shared_cookie_jar_paths:
+                if not path.is_file():
+                    log.explain(f"{fmt_real_path(path)} is not a file")
+                    continue
+                mtime = path.stat().st_mtime
+                if max_mtime is None or mtime > max_mtime:
+                    log.explain(f"{fmt_real_path(path)} has newest mtime so far")
+                    max_mtime = mtime
+                    cookie_jar_path = path
+                else:
+                    log.explain(f"{fmt_real_path(path)} has older mtime")
+
+        if cookie_jar_path is None:
+            log.explain("Couldn't find a suitable cookie file")
+            return
+
+        log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
+        try:
+            self._current_cookie_jar = aiohttp.CookieJar()
+            self._current_cookie_jar.load(cookie_jar_path)
+        except Exception as e:
+            log.explain("Failed to load cookies")
+            log.explain(str(e))
+
+    def _save_cookies(self) -> None:
         log.explain_topic("Saving cookies")
         if not self._current_cookie_jar:
             log.explain("No cookie jar, save aborted")
             return
 
         try:
+            log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
             self._current_cookie_jar.save(self._cookie_jar_path)
-            log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}")
-        except Exception:
+        except Exception as e:
             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
+            log.warn(str(e))
 
     async def run(self) -> None:
-        self._current_cookie_jar = aiohttp.CookieJar()
         self._request_count = 0
-        try:
-            self._current_cookie_jar.load(self._cookie_jar_path)
-        except Exception:
-            pass
+        self._load_cookies()
 
         async with aiohttp.ClientSession(
             headers={"User-Agent": f"{NAME}/{VERSION}"},
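Stripped of logging and of the crawler class, the selection in `_load_cookies` boils down to "use the most recently written cookie jar among all paths shared for this authenticator". A simplified standalone sketch of that rule (not the method itself):

    from pathlib import Path
    from typing import List, Optional


    def newest_cookie_jar(candidates: List[Path]) -> Optional[Path]:
        """Return the candidate file with the newest mtime, or None if none exists yet."""
        newest: Optional[Path] = None
        max_mtime: Optional[float] = None
        for path in candidates:
            if not path.is_file():
                continue  # another crawler may not have saved its cookies yet
            mtime = path.stat().st_mtime
            if max_mtime is None or mtime > max_mtime:
                max_mtime = mtime
                newest = path
        return newest

Because each crawler saves its own jar right after authenticating, the newest file is the one most likely to still hold a valid session.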
@@ -114,4 +162,4 @@ class HttpCrawler(Crawler):
         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
 
         # They are saved in authenticate, but a final save won't hurt
-        await self._save_cookies()
+        self._save_cookies()
@@ -152,12 +152,15 @@ class KitIliasWebCrawler(HttpCrawler):
             config: Config,
             authenticators: Dict[str, Authenticator]
     ):
-        super().__init__(name, section, config)
+        # Setting a main authenticator for cookie sharing
+        auth = section.auth(authenticators)
+        super().__init__(name, section, config, shared_auth=auth)
 
         self._shibboleth_login = KitShibbolethLogin(
-            section.auth(authenticators),
-            section.tfa_auth(authenticators)
+            auth,
+            section.tfa_auth(authenticators),
         )
 
         self._base_url = "https://ilias.studium.kit.edu"
 
         self._target = section.target()
@@ -1,10 +1,11 @@
+from pathlib import Path
 from typing import Dict, List, Optional
 
 from rich.markup import escape
 
 from .auth import AUTHENTICATORS, Authenticator
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError
+from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
 
@@ -42,6 +43,9 @@ class Pferd:
     def _load_crawlers(self) -> List[str]:
         names = []
 
+        # Cookie sharing
+        kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {}
+
         for name, section in self._config.crawler_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
             names.append(name)
@@ -54,6 +58,10 @@ class Pferd:
             crawler = crawler_constructor(name, section, self._config, self._authenticators)
             self._crawlers[name] = crawler
 
+            if self._config.default_section.share_cookies():
+                if isinstance(crawler, KitIliasWebCrawler):
+                    crawler.share_cookies(kit_ilias_web_paths)
+
         return names
 
     def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]:
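The `kit_ilias_web_paths` dict is what actually links the crawlers: the first crawler seen for a given authenticator starts a list containing its own cookie path, and every later crawler with the same authenticator joins that list and appends its own path. A reduced model of this bookkeeping (plain strings stand in for `Authenticator` and `Path`, so this only illustrates the mechanism and is not the real API):

    from typing import Dict, List


    def register(shared: Dict[str, List[str]], auth: str, jar_path: str) -> List[str]:
        """Equivalent bookkeeping to HttpCrawler.share_cookies: join or start the group for this auth."""
        paths = shared.setdefault(auth, [])
        paths.append(jar_path)
        return paths


    shared: Dict[str, List[str]] = {}
    register(shared, "auth:ilias", "crawler-a-cookies")
    register(shared, "auth:ilias", "crawler-b-cookies")
    register(shared, "auth:other", "crawler-c-cookies")

    print(shared)
    # {'auth:ilias': ['crawler-a-cookies', 'crawler-b-cookies'],
    #  'auth:other': ['crawler-c-cookies']}

With sharing enabled, crawlers in the same group all end up loading from whichever of these paths was written most recently.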