From db1219d4a9cd8bb0522803c84e7f1e6203a6b262 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 17 May 2021 21:31:22 +0200 Subject: [PATCH] Create a link file in ILIAS crawler This allows us to crawl links and represent them in the file system. Users can choose between an ILIAS-imitation (that optionally auto-redirects) and a plain text variant. --- CONFIG.md | 6 ++ PFERD/crawlers/ilias.py | 139 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 141 insertions(+), 4 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index bd24b16..6149ef5 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -128,6 +128,12 @@ This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor S - `<course id>` if you want to crawl the course with the given id - `<url>` if you want to crawl a given element by URL (preferably the permanent URL linked at the bottom of an ILIAS page) - `tfa_auth`: Like `auth` but only used for two-factor authentication +- `link_file_redirect_delay`: PFERD will create local HTML files for external links. + If this property is set to a non-negative value it configures the number of seconds after which the local HTML + file will redirect you to the link target. +- `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link + target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional + HTML page instead.
## Authenticator types ### The `simple` authenticator diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 09bad09..4d81976 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -52,6 +52,12 @@ class KitIliasCrawlerSection(CrawlerSection): self.invalid_value("auth", value, "No such auth section exists") return auth + def link_file_redirect_delay(self) -> int: + return self.s.getint("link_file_redirect_delay", fallback=-1) + + def link_file_use_plaintext(self) -> bool: + return self.s.getboolean("link_file_plain_text", fallback=False) + class IliasElementType(Enum): EXERCISE = "exercise" @@ -72,6 +78,7 @@ class IliasPageElement: url: str name: str mtime: Optional[datetime] = None + description: Optional[str] = None class IliasPage: @@ -279,6 +286,7 @@ class IliasPage: abs_url = self._abs_url_from_link(link) element_name = _sanitize_path_name(link.getText()) element_type = self._find_type_from_link(element_name, link, abs_url) + description = self._find_link_description(link) if not element_type: continue @@ -288,10 +296,19 @@ class IliasPage: result.append(self._file_to_element(element_name, abs_url, link)) continue - result.append(IliasPageElement(element_type, abs_url, element_name, None)) + result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) return result + def _find_link_description(self, link: Tag) -> Optional[str]: + tile: Tag = link.findParent("div", {"class": lambda x: x and "il_ContainerListItem" in x}) + if not tile: + return None + description_element: Tag = tile.find("div", {"class": lambda x: x and "il_Description" in x}) + if not description_element: + return None + return description_element.getText().strip() + def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement: # Files have a list of properties (type, modification date, size, etc.) # In a series of divs. 
@@ -528,6 +545,8 @@ class KitIliasCrawler(HttpCrawler): self._base_url = "https://ilias.studium.kit.edu" self._target = section.target() + self._link_file_redirect_delay = section.link_file_redirect_delay() + self._link_file_use_plaintext = section.link_file_use_plaintext() async def crawl(self) -> None: if isinstance(self._target, int): @@ -598,8 +617,7 @@ class KitIliasCrawler(HttpCrawler): # TODO: Delete self.print(f"Skipping forum [green]{element_path}[/]") elif element.type == IliasElementType.LINK: - # TODO: Write in meta-redirect file - self.print(f"Skipping link [green]{element_path}[/]") + await self._download_link(element, element_path) elif element.type == IliasElementType.VIDEO: await self._download_file(element, element_path) elif element.type == IliasElementType.VIDEO_PLAYER: @@ -610,6 +628,30 @@ class KitIliasCrawler(HttpCrawler): # TODO: Proper exception raise RuntimeError(f"Unknown type: {element.type!r}") + @arepeat(3) + async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None: + dl = await self.download(element_path, mtime=element.mtime) + if not dl: + return + + async with self.download_bar(element_path, 2) as bar: + export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") + async with self.session.get(export_url) as response: + html_page: BeautifulSoup = soupify(await response.read()) + real_url: str = html_page.select_one("a").get("href").strip() + + bar.advance(1) + + async with dl as sink: + content = _link_template_plain if self._link_file_use_plaintext else _link_template_rich + content = content.replace("{{link}}", real_url) + content = content.replace("{{name}}", element.name) + content = content.replace("{{description}}", element.description or "") + content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) + sink.file.write(content.encode("utf-8")) + bar.advance(1) + sink.done() + @arepeat(3) async def _download_video(self, element: IliasPageElement,
element_path: PurePath) -> None: # Videos will NOT be redownloaded - their content doesn't really change and they are chunky @@ -654,7 +696,7 @@ class KitIliasCrawler(HttpCrawler): if retries_left < 0: # TODO: Proper exception raise RuntimeError("Get page failed too often") - print(url) + print(url, "retries left", retries_left) async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): @@ -792,3 +834,92 @@ class KitShibbolethLogin: async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: async with session.post(url, data=data) as response: return soupify(await response.read()) + +_link_template_plain = "{{link}}" +# flake8: noqa E501 +_link_template_rich = """ + + + + + ILIAS - Link: {{name}} + + + + + +
+ +
+
+ {{name}} +
+
{{description}}
+
+ +
+ + +"""