From 2de4255a789c2446abdee9f3fe3bdc2f722cf057 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 23 Apr 2020 09:44:13 +0000 Subject: [PATCH] Add Pferd class --- PFERD/__init__.py | 2 ++ PFERD/downloaders.py | 3 +- PFERD/ilias/__init__.py | 5 +-- PFERD/ilias/authenticators.py | 3 ++ PFERD/ilias/crawler.py | 34 ++++++++++-------- PFERD/ilias/downloader.py | 19 +++++++--- PFERD/pferd.py | 65 +++++++++++++++++++++++++++++++++++ PFERD/tmp_dir.py | 16 ++++++--- 8 files changed, 118 insertions(+), 29 deletions(-) create mode 100644 PFERD/pferd.py diff --git a/PFERD/__init__.py b/PFERD/__init__.py index 8efbf95..87131ce 100644 --- a/PFERD/__init__.py +++ b/PFERD/__init__.py @@ -7,6 +7,8 @@ more complex configuration, you need to import the other submodules manually. import logging +from .pferd import Pferd + STYLE = "{" FORMAT = "[{levelname:<7}] {message}" DATE_FORMAT = "%F %T" diff --git a/PFERD/downloaders.py b/PFERD/downloaders.py index 5d193f8..48a82ee 100644 --- a/PFERD/downloaders.py +++ b/PFERD/downloaders.py @@ -3,7 +3,6 @@ General downloaders useful in many situations """ from dataclasses import dataclass, field -from pathlib import Path from typing import Any, Dict, List, Optional import requests @@ -67,7 +66,7 @@ class HttpDownloader: with self._session.get(info.url, params=info.parameters, stream=True) as response: if response.status_code == 200: - tmp_file = self._tmp_dir.new_file() + tmp_file = self._tmp_dir.new_path() stream_to_path(response, tmp_file) self._organizer.accept_file(tmp_file, info.path) else: diff --git a/PFERD/ilias/__init__.py b/PFERD/ilias/__init__.py index ee3cccc..3a983d0 100644 --- a/PFERD/ilias/__init__.py +++ b/PFERD/ilias/__init__.py @@ -2,5 +2,6 @@ Synchronizing files from ILIAS instances (https://www.ilias.de/). """ -from .authenticators import * -from .downloader import * +from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator +from .crawler import IliasCrawler, IliasFilter +from .downloader import IliasDownloader diff --git a/PFERD/ilias/authenticators.py b/PFERD/ilias/authenticators.py index 5443d99..e1da49d 100644 --- a/PFERD/ilias/authenticators.py +++ b/PFERD/ilias/authenticators.py @@ -15,6 +15,9 @@ from ..utils import soupify LOGGER = logging.getLogger(__name__) +# TODO save cookies whenever we know they're good + + class IliasAuthenticator(abc.ABC): # pylint: disable=too-few-public-methods diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index eea5aa0..f6b98c9 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -7,11 +7,12 @@ import json import logging import re from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit, urlunsplit) import bs4 +import requests from ..cookie_jar import CookieJar from ..utils import soupify @@ -22,23 +23,36 @@ from .downloader import IliasDownloadInfo LOGGER = logging.getLogger(__name__) +IliasFilter = Callable[[Path], bool] + + class IliasCrawler: # pylint: disable=too-few-public-methods + + # TODO use the filter as appropriate + # TODO log the things that were discovered to the console on INFO + """ A crawler for ILIAS. """ - def __init__(self, authenticator: IliasAuthenticator, base_url: str, course_id: str): + def __init__( + self, + base_url: str, + course_id: str, + session: requests.Session, + authenticator: IliasAuthenticator, + filter_: IliasFilter + ): """ Create a new ILIAS crawler. """ - self._cookie_jar = CookieJar(Path("/tmp/test/cookies")) - self._cookie_jar.load_cookies() self._base_url = base_url self._course_id = course_id - self._session = self._cookie_jar.create_session() + self._session = session self._authenticator = authenticator + self._filter = filter_ def _abs_url_from_link(self, link_tag: bs4.Tag) -> str: """ @@ -342,8 +356,6 @@ class IliasCrawler: self._authenticator.authenticate(self._session) - self._cookie_jar.save_cookies("Authed") - return self._get_page(url, params) @staticmethod @@ -369,11 +381,3 @@ class IliasCrawler: LOGGER.debug("Auth: Found #playerContainer") return True return False - - -def run_as_test(ilias_url: str, course_id: int) -> List[IliasDownloadInfo]: - from ..organizer import Organizer - from .authenticators import KitShibbolethAuthenticator - - crawler = IliasCrawler(KitShibbolethAuthenticator(), ilias_url, str(course_id)) - return crawler.crawl() diff --git a/PFERD/ilias/downloader.py b/PFERD/ilias/downloader.py index 66243d1..88e6792 100644 --- a/PFERD/ilias/downloader.py +++ b/PFERD/ilias/downloader.py @@ -33,12 +33,21 @@ class IliasDownloadInfo(Transformable): class IliasDownloader: """A downloader for ILIAS.""" - def __init__(self, tmp_dir: TmpDir, organizer: Organizer, authenticator: IliasAuthenticator): - """Create a new IliasDownloader.""" - self._authenticator = authenticator - self._session = requests.Session() + def __init__( + self, + tmp_dir: TmpDir, + organizer: Organizer, + session: requests.Session, + authenticator: IliasAuthenticator, + ): + """ + Create a new IliasDownloader. + """ + self._tmp_dir = tmp_dir self._organizer = organizer + self._session = session + self._authenticator = authenticator def download_all(self, infos: List[IliasDownloadInfo]) -> None: """ @@ -55,7 +64,7 @@ class IliasDownloader: Retries authentication until eternity if it could not fetch the file. """ - tmp_file = self._tmp_dir.new_file() + tmp_file = self._tmp_dir.new_path() while not self._try_download(info, tmp_file): self._authenticator.authenticate(self._session) diff --git a/PFERD/pferd.py b/PFERD/pferd.py new file mode 100644 index 0000000..5fd39ef --- /dev/null +++ b/PFERD/pferd.py @@ -0,0 +1,65 @@ +from pathlib import Path +from typing import Optional + +from .cookie_jar import CookieJar +from .ilias import (IliasAuthenticator, IliasCrawler, IliasDownloader, + IliasFilter, KitShibbolethAuthenticator) +from .organizer import Organizer +from .tmp_dir import TmpDir +from .transform import Transform, apply_transform +from .utils import Location + + +class Pferd(Location): + # pylint: disable=too-many-arguments + + def __init__(self, base_dir: Path, tmp_dir: Path = Path(".tmp")): + super().__init__(Path(base_dir)) + + self._tmp_dir = TmpDir(self.resolve(tmp_dir)) + + def _ilias( + self, + target: Path, + base_url: str, + course_id: str, + authenticator: IliasAuthenticator, + cookies: Optional[Path], + filter_: IliasFilter, + transform: Transform, + ) -> None: + cookie_jar = CookieJar(cookies) + session = cookie_jar.create_session() + tmp_dir = self._tmp_dir.new_subdir() + organizer = Organizer(self.resolve(target)) + + crawler = IliasCrawler(base_url, course_id, session, authenticator, filter_) + downloader = IliasDownloader(tmp_dir, organizer, session, authenticator) + + cookie_jar.load_cookies() + info = crawler.crawl() + cookie_jar.save_cookies() + downloader.download_all(apply_transform(transform, info)) + cookie_jar.save_cookies() + + def ilias_kit( + self, + target: Path, + course_id: str, + filter_: IliasFilter = lambda x: True, + transform: Transform = lambda x: x, + cookies: Optional[Path] = None, + username: Optional[str] = None, + password: Optional[str] = None, + ) -> None: + # This authenticator only works with the KIT ilias instance. + authenticator = KitShibbolethAuthenticator(username=username, password=password) + self._ilias( + target=target, + base_url="https://ilias.studium.kit.edu/", + course_id=course_id, + authenticator=authenticator, + cookies=cookies, + filter_=filter_, + transform=transform, + ) diff --git a/PFERD/tmp_dir.py b/PFERD/tmp_dir.py index c528c30..850d886 100644 --- a/PFERD/tmp_dir.py +++ b/PFERD/tmp_dir.py @@ -39,18 +39,24 @@ class TmpDir(Location): self.cleanup() return None - def new_file(self, prefix: Optional[str] = None) -> Path: - """Return a unique path inside the folder, but don't create a file.""" + def new_path(self, prefix: Optional[str] = None) -> Path: + """ + Return a unique path inside the directory. Doesn't create a file or + directory. + """ + name = f"{prefix if prefix else 'tmp'}-{self._inc_and_get_counter():03}" LOGGER.debug("Creating temp file %s", name) return self.resolve(Path(name)) - def new_folder(self, prefix: Optional[str] = None) -> 'TmpDir': - """Create a new nested temporary folder and return its path.""" - name = f"{prefix if prefix else 'tmp'}-{self._inc_and_get_counter():03}" + def new_subdir(self, prefix: Optional[str] = None) -> 'TmpDir': + """ + Create a new nested temporary folder and return it. + """ + name = f"{prefix if prefix else 'tmp'}-{self._inc_and_get_counter():03}" sub_path = self.resolve(Path(name)) sub_path.mkdir(parents=True)