Rename the ilias crawler to ilias web crawler
PFERD/crawlers/ilias/kit_ilias_web_crawler.py (new file, 369 lines added)
@@ -0,0 +1,369 @@
import asyncio
import re
from pathlib import PurePath
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
from typing import Any, Dict, Optional, Set, Union

import aiohttp
from bs4 import BeautifulSoup, Tag

from PFERD.authenticators import Authenticator
from PFERD.config import Config
from PFERD.crawler import CrawlerSection, HttpCrawler, anoncritical, arepeat
from PFERD.output_dir import Redownload
from PFERD.utils import soupify, url_set_query_param

from .file_templates import link_template_plain, link_template_rich
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement

TargetType = Union[str, int]

class KitIliasWebCrawlerSection(CrawlerSection):
    def target(self) -> TargetType:
        target = self.s.get("target")
        if not target:
            self.missing_value("target")

        if re.fullmatch(r"\d+", target):
            # Course id
            return int(target)
        if target == "desktop":
            # Full personal desktop
            return target
        if target.startswith("https://ilias.studium.kit.edu"):
            # ILIAS URL
            return target

        self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")

    def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
        value = self.s.get("tfa_auth")
        if not value:
            return None

        auth = authenticators.get(f"auth:{value}")
        if auth is None:
            self.invalid_value("tfa_auth", value, "No such auth section exists")
        return auth

    def link_file_redirect_delay(self) -> int:
        return self.s.getint("link_file_redirect_delay", fallback=-1)

    def link_file_use_plaintext(self) -> bool:
        return self.s.getboolean("link_file_plain_text", fallback=False)
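
For reference, a sketch of the config section these accessors read from (INI-style, since the values come from a configparser section). The section name, the auth section name, and the example values are hypothetical; only the option keys target, tfa_auth, link_file_redirect_delay and link_file_plain_text are taken from the code above:

    [crawl:my-ilias-course]
    # A course id, the literal string "desktop", or a KIT ILIAS URL (see target())
    target = 1234567
    # Optional: name of the auth section used for two-factor tokens (hypothetical name)
    tfa_auth = my-tfa
    # Seconds before a generated link file redirects; the fallback is -1
    link_file_redirect_delay = 5
    # Use the plain-text link template instead of the rich one; the fallback is false
    link_file_plain_text = false
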
_DIRECTORY_PAGES: Set[IliasElementType] = set([
    IliasElementType.EXERCISE,
    IliasElementType.FOLDER,
    IliasElementType.MEETING,
    IliasElementType.VIDEO_FOLDER,
    IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
])

class KitIliasWebCrawler(HttpCrawler):
    def __init__(
        self,
        name: str,
        section: KitIliasWebCrawlerSection,
        config: Config,
        authenticators: Dict[str, Authenticator]
    ):
        super().__init__(name, section, config)

        self._shibboleth_login = KitShibbolethLogin(
            section.auth(authenticators),
            section.tfa_auth(authenticators)
        )
        self._base_url = "https://ilias.studium.kit.edu"

        self._target = section.target()
        self._link_file_redirect_delay = section.link_file_redirect_delay()
        self._link_file_use_plaintext = section.link_file_use_plaintext()

    async def crawl(self) -> None:
        if isinstance(self._target, int):
            await self._crawl_course(self._target)
        elif self._target == "desktop":
            await self._crawl_desktop()
        else:
            await self._crawl_url(self._target)

        if self.error_free:
            await self.cleanup()

    async def _crawl_course(self, course_id: int) -> None:
        # Start crawling at the given course
        root_url = url_set_query_param(
            self._base_url + "/goto.php", "target", f"crs_{course_id}"
        )

        await self._crawl_url(root_url, expected_id=course_id)

    async def _crawl_desktop(self) -> None:
        await self._crawl_url(self._base_url)
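
As a concrete example of the URL construction above: for course id 1234567, _crawl_course asks url_set_query_param for https://ilias.studium.kit.edu/goto.php?target=crs_1234567 and starts crawling there, using expected_id to verify that the page really belongs to a course.
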
    @arepeat(3)
    async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
        tasks = []

        async with self.crawl_bar(PurePath("Root element")):
            soup = await self._get_page(url)

            if expected_id is not None:
                perma_link_element: Tag = soup.find(id="current_perma_link")
                if not perma_link_element or "crs_" not in (perma_link_element.get("value") or ""):
                    # TODO: Properly handle error
                    raise RuntimeError(
                        "Invalid course id? I didn't find anything looking like a course!")

            # Duplicated code, but the root page is special - we want to avoid fetching it twice!
            page = IliasPage(soup, url, None)
            for child in page.get_child_elements():
                tasks.append(self._handle_ilias_element(PurePath("."), child))

        await asyncio.gather(*tasks)

    @arepeat(3)
    @anoncritical
    async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
        tasks = []
        async with self.crawl_bar(path):
            soup = await self._get_page(url)
            page = IliasPage(soup, url, parent)

            for child in page.get_child_elements():
                tasks.append(self._handle_ilias_element(path, child))

        await asyncio.gather(*tasks)

    @anoncritical
    async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
        element_path = PurePath(parent_path, element.name)

        if not self.should_crawl(element_path):
            return

        if element.type == IliasElementType.FILE:
            await self._download_file(element, element_path)
        elif element.type == IliasElementType.FORUM:
            # TODO: Delete
            print(f"Skipping forum [green]{element_path}[/]")
        elif element.type == IliasElementType.LINK:
            await self._download_link(element, element_path)
        elif element.type == IliasElementType.VIDEO:
            await self._download_file(element, element_path)
        elif element.type == IliasElementType.VIDEO_PLAYER:
            await self._download_video(element, element_path)
        elif element.type in _DIRECTORY_PAGES:
            await self._handle_ilias_page(element.url, element, element_path)
        else:
            # TODO: Proper exception
            raise RuntimeError(f"Unknown type: {element.type!r}")

    @arepeat(3)
    async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
        dl = await self.download(element_path, mtime=element.mtime)
        if not dl:
            return

        async with self.download_bar(element_path):
            export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
            async with self.session.get(export_url) as response:
                html_page: BeautifulSoup = soupify(await response.read())
                real_url: str = html_page.select_one("a").get("href").strip()

            async with dl as sink:
                content = link_template_plain if self._link_file_use_plaintext else link_template_rich
                content = content.replace("{{link}}", real_url)
                content = content.replace("{{name}}", element.name)
                content = content.replace("{{description}}", str(element.description))
                content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
                sink.file.write(content.encode("utf-8"))
                sink.done()
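
The two link templates come from file_templates, which is not part of this diff. Purely as an illustration of the placeholder scheme substituted above, a plain-text template could look like this (the real link_template_plain almost certainly differs):

    # Hypothetical stand-in for file_templates.link_template_plain
    link_template_plain = (
        "{{name}}\n"
        "{{description}}\n"
        "\n"
        "{{link}}\n"
    )
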
    @arepeat(3)
    async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None:
        # Videos will NOT be redownloaded - their content doesn't really change and they are chunky
        dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER)
        if not dl:
            return

        async with self.download_bar(element_path) as bar:
            page = IliasPage(await self._get_page(element.url), element.url, element)
            real_element = page.get_child_elements()[0]

            async with dl as sink, self.session.get(real_element.url) as resp:
                if resp.content_length:
                    bar.set_total(resp.content_length)

                async for data in resp.content.iter_chunked(1024):
                    sink.file.write(data)
                    bar.advance(len(data))

                sink.done()

    @arepeat(3)
    async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
        dl = await self.download(element_path, mtime=element.mtime)
        if not dl:
            return

        async with self.download_bar(element_path) as bar:
            async with dl as sink, self.session.get(element.url) as resp:
                if resp.content_length:
                    bar.set_total(resp.content_length)

                async for data in resp.content.iter_chunked(1024):
                    sink.file.write(data)
                    bar.advance(len(data))

                sink.done()

    async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup:
        # This function will retry itself a few times if it is not logged in - it won't handle
        # connection errors
        if retries_left < 0:
            # TODO: Proper exception
            raise RuntimeError("Get page failed too often")
        print(url, "retries left", retries_left)
        async with self.session.get(url) as request:
            soup = soupify(await request.read())
            if self._is_logged_in(soup):
                return soup

        await self._shibboleth_login.login(self.session)

        return await self._get_page(url, retries_left - 1)

    @staticmethod
    def _is_logged_in(soup: BeautifulSoup) -> bool:
        # Normal ILIAS pages
        userlog = soup.find("li", {"id": "userlog"})
        if userlog is not None:
            return True
        # Video listing embeds do not have complete ILIAS html. Try to match them by
        # their video listing table
        video_table = soup.find(
            recursive=True,
            name="table",
            attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
        )
        if video_table is not None:
            return True
        # The individual video player wrapper page has nothing of the above.
        # Match it by its playerContainer.
        if soup.select_one("#playerContainer") is not None:
            return True
        return False


class KitShibbolethLogin:
    """
    Login via KIT's shibboleth system.
    """

    def __init__(self, authenticator: Authenticator, tfa_authenticator: Optional[Authenticator]) -> None:
        self._auth = authenticator
        self._tfa_auth = tfa_authenticator

    async def login(self, sess: aiohttp.ClientSession) -> None:
        """
        Performs the ILIAS Shibboleth authentication dance and saves the login
        cookies it receives.

        This function should only be called whenever it is detected that you're
        not logged in. The cookies obtained should be good for a few minutes,
        maybe even an hour or two.
        """

        # Equivalent: Click on "Mit KIT-Account anmelden" button in
        # https://ilias.studium.kit.edu/login.php
        url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
        data = {
            "sendLogin": "1",
            "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
            "target": "/shib_login.php",
            "home_organization_selection": "Mit KIT-Account anmelden",
        }
        soup: BeautifulSoup = await _post(sess, url, data)

        # Attempt to log in using credentials, if necessary
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
            form = soup.find("form", {"class": "full content", "method": "post"})
            action = form["action"]

            csrf_token = form.find("input", {"name": "csrf_token"})["value"]

            # Equivalent: Enter credentials in
            # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
            url = "https://idp.scc.kit.edu" + action
            username, password = await self._auth.credentials()
            data = {
                "_eventId_proceed": "",
                "j_username": username,
                "j_password": password,
                "csrf_token": csrf_token
            }
            soup = await _post(sess, url, data)

            if self._tfa_required(soup):
                soup = await self._authenticate_tfa(sess, soup)

            if not self._login_successful(soup):
                self._auth.invalidate_credentials()

        # Equivalent: Being redirected via JS automatically
        # (or clicking "Continue" if you have JS disabled)
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
        data = {  # using the info obtained in the while loop above
            "RelayState": relay_state["value"],
            "SAMLResponse": saml_response["value"],
        }
        await sess.post(url, data=data)
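
In short, login() performs the usual Shibboleth round trip: request Shibboleth.sso/Login with the KIT IdP preselected, POST username, password and CSRF token to the IdP (retrying while _login_successful finds no RelayState/SAMLResponse inputs, and invalidating the cached credentials on failure), run the TFA step if _tfa_required matches, and finally POST the SAML response back to ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST so the session picks up its ILIAS cookies.
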
    async def _authenticate_tfa(
        self,
        session: aiohttp.ClientSession,
        soup: BeautifulSoup
    ) -> BeautifulSoup:
        if not self._tfa_auth:
            raise RuntimeError("No 'tfa_auth' present but you use two-factor authentication!")

        tfa_token = await self._tfa_auth.password()

        # Searching the form here so that this fails before asking for
        # credentials rather than after asking.
        form = soup.find("form", {"method": "post"})
        action = form["action"]

        # Equivalent: Enter token in
        # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
        url = "https://idp.scc.kit.edu" + action
        data = {
            "_eventId_proceed": "",
            "j_tokenNumber": tfa_token
        }
        return await _post(session, url, data)

    @staticmethod
    def _login_successful(soup: BeautifulSoup) -> bool:
        relay_state = soup.find("input", {"name": "RelayState"})
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        return relay_state is not None and saml_response is not None

    @staticmethod
    def _tfa_required(soup: BeautifulSoup) -> bool:
        return soup.find(id="j_tokenNumber") is not None


async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
    async with session.post(url, data=data) as response:
        return soupify(await response.read())
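
For completeness, a minimal wiring sketch. How name, section, config and the authenticators dict are constructed is PFERD-internal and not shown in this diff, so every value below is hypothetical:

    # Hypothetical wiring, for illustration only (not PFERD's actual startup code):
    #
    #   section = KitIliasWebCrawlerSection(...)  # parsed from the user's config file
    #   crawler = KitIliasWebCrawler("ilias", section, config, authenticators)
    #   asyncio.run(crawler.crawl())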