From db1219d4a9cd8bb0522803c84e7f1e6203a6b262 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen <i-al-istannen@users.noreply.github.com>
Date: Mon, 17 May 2021 21:31:22 +0200
Subject: [PATCH] Create a link file in ILIAS crawler

This allows us to crawl links and represent them in the file system.
Users can choose between an ILIAS-imitation (that optionally
auto-redirects) and a plain text variant.
---
 CONFIG.md               |   6 ++
 PFERD/crawlers/ilias.py | 139 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 141 insertions(+), 4 deletions(-)
diff --git a/CONFIG.md b/CONFIG.md
index bd24b16..6149ef5 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -128,6 +128,12 @@ This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor S
   - `<course id>` if you want to crawl the course with the given id
   - `<url>` if you want to crawl a given element by URL (preferably the permanent URL linked at the bottom of an ILIAS page)
 - `tfa_auth`: Like `auth` but only used for two-factor authentication
+- `link_file_redirect_delay`: PFERD will create local HTML for external links. 
+   If this property is set to a non-negative value it configures the amount of seconds after which the local HTML
+   file will redirect you to the link target.
+- `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link
+   target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional 
+   HTML page instead.
 ## Authenticator types
 
 ### The `simple` authenticator
diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py
index 09bad09..4d81976 100644
--- a/PFERD/crawlers/ilias.py
+++ b/PFERD/crawlers/ilias.py
@@ -52,6 +52,12 @@ class KitIliasCrawlerSection(CrawlerSection):
             self.invalid_value("auth", value, "No such auth section exists")
         return auth
 
+    def link_file_redirect_delay(self) -> int:
+        return self.s.getint("link_file_redirect_delay", fallback=-1)
+
+    def link_file_use_plaintext(self) -> bool:
+        return self.s.getboolean("link_file_plain_text", fallback=False)
+
 
 class IliasElementType(Enum):
     EXERCISE = "exercise"
@@ -72,6 +78,7 @@ class IliasPageElement:
     url: str
     name: str
     mtime: Optional[datetime] = None
+    description: Optional[str] = None
 
 
 class IliasPage:
@@ -279,6 +286,7 @@ class IliasPage:
             abs_url = self._abs_url_from_link(link)
             element_name = _sanitize_path_name(link.getText())
             element_type = self._find_type_from_link(element_name, link, abs_url)
+            description = self._find_link_description(link)
 
             if not element_type:
                 continue
@@ -288,10 +296,19 @@ class IliasPage:
                 result.append(self._file_to_element(element_name, abs_url, link))
                 continue
 
-            result.append(IliasPageElement(element_type, abs_url, element_name, None))
+            result.append(IliasPageElement(element_type, abs_url, element_name, description=description))
 
         return result
 
+    def _find_link_description(self, link: Tag) -> Optional[str]:
+        tile: Tag = link.findParent("div", {"class": lambda x: x and "il_ContainerListItem" in x})
+        if not tile:
+            return None
+        description_element: Tag = tile.find("div", {"class": lambda x: x and "il_Description" in x})
+        if not description_element:
+            return None
+        return description_element.getText().strip()
+
     def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement:
         # Files have a list of properties (type, modification date, size, etc.)
         # In a series of divs.
@@ -528,6 +545,8 @@ class KitIliasCrawler(HttpCrawler):
         self._base_url = "https://ilias.studium.kit.edu"
 
         self._target = section.target()
+        self._link_file_redirect_delay = section.link_file_redirect_delay()
+        self._link_file_use_plaintext = section.link_file_use_plaintext()
 
     async def crawl(self) -> None:
         if isinstance(self._target, int):
@@ -598,8 +617,7 @@ class KitIliasCrawler(HttpCrawler):
             # TODO: Delete
             self.print(f"Skipping forum [green]{element_path}[/]")
         elif element.type == IliasElementType.LINK:
-            # TODO: Write in meta-redirect file
-            self.print(f"Skipping link [green]{element_path}[/]")
+            await self._download_link(element, element_path)
         elif element.type == IliasElementType.VIDEO:
             await self._download_file(element, element_path)
         elif element.type == IliasElementType.VIDEO_PLAYER:
@@ -610,6 +628,30 @@ class KitIliasCrawler(HttpCrawler):
             # TODO: Proper exception
             raise RuntimeError(f"Unknown type: {element.type!r}")
 
+    @arepeat(3)
+    async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
+        dl = await self.download(element_path, mtime=element.mtime)
+        if not dl:
+            return
+
+        async with self.download_bar(element_path, 2) as bar:
+            export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
+            async with self.session.get(export_url) as response:
+                html_page: BeautifulSoup = soupify(await response.read())
+                real_url: str = html_page.select_one("a").get("href").strip()
+
+            bar.advance(1)
+
+            async with dl as sink:
+                content = _link_template_plain if self._link_file_use_plaintext else _link_template_rich
+                content = content.replace("{{link}}", real_url)
+                content = content.replace("{{name}}", element.name)
+                content = content.replace("{{description}}", str(element.description))
+                content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
+                sink.file.write(content.encode("utf-8"))
+                bar.advance(1)
+                sink.done()
+
     @arepeat(3)
     async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None:
         # Videos will NOT be redownloaded - their content doesn't really change and they are chunky
@@ -654,7 +696,7 @@ class KitIliasCrawler(HttpCrawler):
         if retries_left < 0:
             # TODO: Proper exception
             raise RuntimeError("Get page failed too often")
-        print(url)
+        print(url, "retries left", retries_left)
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
@@ -792,3 +834,92 @@ class KitShibbolethLogin:
 async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
     async with session.post(url, data=data) as response:
         return soupify(await response.read())
+
+_link_template_plain = "{{link}}"
+# flake8: noqa E501
+_link_template_rich = """
+<!DOCTYPE html>
+<html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <title>ILIAS - Link: {{ name}}</title>
+        <meta http-equiv = "refresh" content = "{{redirect_delay}}; url = {{link}}" />
+    </head>
+
+    <style>
+    * {
+        box-sizing: border-box;
+    }
+    .center-flex {
+        display: flex;
+        align-items: center;
+        justify-content: center;
+    }
+    body {
+        padding: 0;
+        margin: 0;
+        background-color: #f0f0f0;
+        font-family: "Open Sans", Verdana, Arial, Helvetica, sans-serif;
+        height: 100vh;
+    }
+    .row {
+        background-color: white;
+        min-width: 500px;
+        max-width: 90vw;
+        display: flex;
+        padding: 1em;
+    }
+    .logo {
+        flex: 0 1;
+        margin-right: 1em;
+        fill: #009682;
+    }
+    .tile {
+        flex: 1 0;
+        display: flex;
+        flex-direction: column;
+        justify-content: center;
+    }
+    .top-row {
+        padding-bottom: 5px;
+        font-size: 15px;
+    }
+    a {
+        color: #009682;
+        text-decoration: none;
+    }
+    a:hover {
+        text-decoration: underline;
+    }
+    .bottom-row {
+        font-size: 13px;
+    }
+    .menu-button {
+        border: 1px solid black;
+        margin-left: 4em;
+        width: 25px;
+        height: 25px;
+        flex: 0 0 25px;
+        background-color: #b3e0da;
+        font-size: 13px;
+        color: #222;
+    }
+    </style>
+    <body class="center-flex">
+        <div class="row">
+            <div class="logo center-flex">
+                <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
+                    <path d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm9.567 9.098c-.059-.058-.127-.108-.206-.138-.258-.101-1.35.603-1.515.256-.108-.231-.327.148-.578.008-.121-.067-.459-.52-.611-.465-.312.112.479.974.694 1.087.203-.154.86-.469 1.002-.039.271.812-.745 1.702-1.264 2.171-.775.702-.63-.454-1.159-.86-.277-.213-.274-.667-.555-.824-.125-.071-.7-.732-.694-.821l-.017.167c-.095.072-.297-.27-.319-.325 0 .298.485.772.646 1.011.273.409.42 1.005.756 1.339.179.18.866.923 1.045.908l.921-.437c.649.154-1.531 3.237-1.738 3.619-.171.321.139 1.112.114 1.49-.029.437-.374.579-.7.817-.35.255-.268.752-.562.934-.521.321-.897 1.366-1.639 1.361-.219-.001-1.151.364-1.273.007-.095-.258-.223-.455-.356-.71-.131-.25-.015-.51-.175-.731-.11-.154-.479-.502-.513-.684-.002-.157.118-.632.283-.715.231-.118.044-.462.016-.663-.048-.357-.27-.652-.535-.859-.393-.302-.189-.542-.098-.974 0-.206-.126-.476-.402-.396-.57.166-.396-.445-.812-.417-.299.021-.543.211-.821.295-.349.104-.707-.083-1.053-.126-1.421-.179-1.885-1.804-1.514-2.976.037-.192-.115-.547-.048-.696.159-.352.485-.752.768-1.021.16-.152.365-.113.553-.231.29-.182.294-.558.578-.789.404-.328.956-.321 1.482-.392.281-.037 1.35-.268 1.518-.06 0 .039.193.611-.019.578.438.023 1.061.756 1.476.585.213-.089.135-.744.573-.427.265.19 1.45.275 1.696.07.152-.125.236-.939.053-1.031.117.116-.618.125-.686.099-.122-.044-.235.115-.43.025.117.055-.651-.358-.22-.674-.181.132-.349-.037-.544.109-.135.109.062.181-.13.277-.305.155-.535-.53-.649-.607-.118-.077-1.024-.713-.777-.298l.797.793c-.04.026-.209-.289-.209-.059.053-.136.02.585-.105.35-.056-.09.091-.14.006-.271 0-.085-.23-.169-.275-.228-.126-.157-.462-.502-.644-.585-.05-.024-.771.088-.832.111-.071.099-.131.203-.181.314-.149.055-.29.127-.423.216l-.159.356c-.068.061-.772.294-.776.303.03-.076-.492-.172-.457-.324.038-.167.215-.687.169-.877-.048-.199 1.085.287 1.158-.238.029-.227.047-.492-.316-.531.069.008.702-.249.807-.364.148-.169.486-.447.731-.447.286 0 .225-.417.356-.622.133.053-.071.38.088.512-.01-.104.45.057.494.033.105-.056.691-.023.601-.299-.101-.28.052-.197.183-.255-.02.008.248-.458.363-.456-.104-.089-.398.112-.516.103-.308-.024-.177-.525-.061-.672.09-.116-.246-.258-.25-.036-.006.332-.314.633-.243 1.075.109.666-.743-.161-.816-.115-.283.172-.515-.216-.368-.449.149-.238.51-.226.659-.48.104-.179.227-.389.388-.524.541-.454.689-.091 1.229-.042.526.048.178.125.105.327-.07.192.289.261.413.1.071-.092.232-.326.301-.499.07-.175.578-.2.527-.365 2.72 1.148 4.827 3.465 5.694 6.318zm-11.113-3.779l.068-.087.073-.019c.042-.034.086-.118.151-.104.043.009.146.095.111.148-.037.054-.066-.049-.081.101-.018.169-.188.167-.313.222-.087.037-.175-.018-.09-.104l.088-.108-.007-.049zm.442.245c.046-.045.138-.008.151-.094.014-.084.078-.178-.008-.335-.022-.042.116-.082.051-.137l-.109.032s.155-.668.364-.366l-.089.103c.135.134.172.47.215.687.127.066.324.078.098.192.117-.02-.618.314-.715.178-.072-.083.317-.139.307-.173-.004-.011-.317-.02-.265-.087zm1.43-3.547l-.356.326c-.36.298-1.28.883-1.793.705-.524-.18-1.647.667-1.826.673-.067.003.002-.641.36-.689-.141.021.993-.575 1.185-.805.678-.146 1.381-.227 2.104-.227l.326.017zm-5.086 1.19c.07.082.278.092-.026.288-.183.11-.377.809-.548.809-.51.223-.542-.439-1.109.413-.078.115-.395.158-.644.236.685-.688 1.468-1.279 2.327-1.746zm-5.24 8.793c0-.541.055-1.068.139-1.586l.292.185c.113.135.113.719.169.911.139.482.484.751.748 1.19.155.261.414.923.332 1.197.109-.179 1.081.824 1.259 1.033.418.492.74 1.088.061 1.574-.219.158.334 1.14.049 1.382l-.365.094c-.225.138-.235.397-.166.631-1.562-1.765-2.518-4.076-2.518-6.611zm14.347-5.823c.083-.01-.107.167-.107.167.033.256.222.396.581.527.437.157.038.455-.213.385-.139-.039-.854-.255-.879.025 0 .167-.679.001-.573-.175.073-.119.05-.387.186-.562.193-.255.38-.116.386.032-.001.394.398-.373.619-.399z"/>
+                </svg>
+            </div>
+            <div class="tile">
+                <div class="top-row">
+                    <a href="{{link}}">{{name}}</a>
+                </div>
+                <div class="bottom-row">{{description}}</div>
+            </div>
+            <div class="menu-button center-flex"> ⯆ </div>
+        </div>
+    </body>
+</html>
+"""