Start implementing crawling in ILIAS crawler

The ILIAS crawler can now crawl quite a few file types, splits off
folders, and crawls them concurrently.
I-Al-Istannen 2021-05-15 20:42:18 +02:00
parent 1123c8884d
commit c7494e32ce
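
The interesting part of the diff below is the fan-out: every folder-like child element becomes its own coroutine, and the parent awaits them all with asyncio.gather, which is what makes sibling folders crawl concurrently. A minimal, self-contained sketch of that pattern; the names fetch_children and handle are illustrative, not taken from the commit:

import asyncio
from typing import List


async def fetch_children(url: str) -> List[str]:
    # Stand-in for "fetch the page, parse it, extract child element URLs"
    await asyncio.sleep(0.01)
    return []


async def handle(url: str) -> None:
    children = await fetch_children(url)
    # One coroutine per child; gather awaits them concurrently
    await asyncio.gather(*(handle(child) for child in children))


asyncio.run(handle("https://example.com/root"))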


@@ -1,3 +1,4 @@
+import asyncio
 import json
 import re
 from configparser import SectionProxy
@@ -5,8 +6,9 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
 from pathlib import PurePath
-from typing import Any, Dict, List, Optional
-from urllib.parse import urljoin, urlparse
+from typing import Any, Dict, List, Optional, Set, Union
+from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
+                          urlunsplit)
 
 import aiohttp
 from bs4 import BeautifulSoup, Tag
@@ -18,23 +20,27 @@ from ..config import Config
 from ..crawler import (Crawler, CrawlerSection, HttpCrawler, anoncritical,
                        arepeat)
 
+TargetType = Union[str, int]
+
 
 class IliasCrawlerSection(CrawlerSection):
-    def __init__(self, section: SectionProxy):
-        super().__init__(section)
-
-        if not self.course_id() and not self.element_url():
-            self.missing_value("course_id or element_url")
-
-    def course_id(self) -> Optional[str]:
-        return self.s.get("course_id")
-
-    def element_url(self) -> Optional[str]:
-        return self.s.get("element_url")
-
-    def base_url(self) -> str:
-        return self.s.get("ilias_url", "https://ilias.studium.kit.edu/")
+    def target(self) -> TargetType:
+        target = self.s.get("target")
+        if not target:
+            self.missing_value("target")
+
+        if re.fullmatch(r"\d+", target):
+            # Course id
+            return int(target)
+        if target == "desktop":
+            # Full personal desktop
+            return target
+        if target.startswith("https://ilias.studium.kit.edu"):
+            # ILIAS URL
+            return target
+
+        self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")
 
     def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
         value = self.s.get("tfa_auth")
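
Usage note: the separate course_id and element_url options collapse into a single target option with three accepted forms. A sketch of the dispatch logic on a plain configparser section; the section name crawl:ilias is an assumption, only the target semantics come from the diff:

import re
from configparser import ConfigParser

parser = ConfigParser()
parser.read_string("""
[crawl:ilias]
target = 1337
""")

target = parser["crawl:ilias"]["target"]
if re.fullmatch(r"\d+", target):
    print(int(target))            # all digits: treated as a course id
elif target == "desktop":
    print("personal desktop")     # crawl the full personal desktop
elif target.startswith("https://ilias.studium.kit.edu"):
    print("direct ILIAS URL")     # crawl a single element by URL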
@@ -66,7 +72,6 @@ class IliasPageElement:
     url: str
     name: str
     mtime: Optional[datetime] = None
-    query_parameter: Dict[str, str] = field(default_factory=dict)
 
 
 class IliasPage:
@@ -91,11 +96,17 @@ class IliasPage:
         return "paella_config_file" in str(self._soup)
 
     def _is_video_listing(self) -> bool:
+        # ILIAS fluff around it
         if self._soup.find(id="headerimage"):
             element: Tag = self._soup.find(id="headerimage")
             if "opencast" in element.attrs["src"].lower():
                 return True
-        return False
+
+        # Raw listing without ILIAS fluff
+        video_element_table: Tag = self._soup.find(
+            name="table", id=re.compile(r"tbl_xoct_.+")
+        )
+        return video_element_table is not None
 
     def _player_to_video(self) -> List[IliasPageElement]:
         # Fetch the actual video page. This is a small wrapper page initializing a javascript
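
The raw-listing branch above leans on a BeautifulSoup detail that is easy to miss: find() accepts a compiled regex as an attribute value. A tiny illustration; the HTML snippet is made up:

import re
from bs4 import BeautifulSoup

html = '<table id="tbl_xoct_abc123"><tr><td>video</td></tr></table>'
soup = BeautifulSoup(html, "html.parser")
# Matches any id beginning with tbl_xoct_, like the listing check above
table = soup.find(name="table", id=re.compile(r"tbl_xoct_.+"))
print(table is not None)  # True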
@@ -137,9 +148,8 @@ class IliasPage:
         content_link: Tag = self._soup.select_one("#tab_series a")
         url: str = self._abs_url_from_link(content_link)
         query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
-        return [IliasPageElement(
-            IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "", query_parameter=query_params
-        )]
+        url = _url_set_query_params(url, query_params)
+        return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")]
 
         is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None
@@ -173,9 +183,8 @@ class IliasPage:
         query_params = {f"tbl_xoct_{table_id}_trows": "800",
                         "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
-        return [IliasPageElement(
-            IliasElementType.VIDEO_FOLDER, self._page_url, "", query_parameter=query_params
-        )]
+        url = _url_set_query_params(self._page_url, query_params)
+        return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")]
 
     def _find_video_entries_no_paging(self) -> List[IliasPageElement]:
         """
@@ -363,6 +372,7 @@
         """
         return urljoin(self._page_url, link_tag.get("href"))
 
+
 def demangle_date(date_str: str) -> Optional[datetime]:
     return None
@@ -371,6 +381,36 @@ def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-").strip()
 
 
+def _url_set_query_param(url: str, param: str, value: str) -> str:
+    """
+    Set a query parameter in an url, overwriting existing ones with the same name.
+    """
+    scheme, netloc, path, query, fragment = urlsplit(url)
+    query_parameters = parse_qs(query)
+    query_parameters[param] = [value]
+    new_query_string = urlencode(query_parameters, doseq=True)
+
+    return urlunsplit((scheme, netloc, path, new_query_string, fragment))
+
+
+def _url_set_query_params(url: str, params: Dict[str, str]) -> str:
+    result = url
+
+    for key, val in params.items():
+        result = _url_set_query_param(result, key, val)
+
+    return result
+
+
+_DIRECTORY_PAGES: Set[IliasElementType] = set([
+    IliasElementType.EXERCISE,
+    IliasElementType.FOLDER,
+    IliasElementType.MEETING,
+    IliasElementType.VIDEO_FOLDER,
+    IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
+])
+
+
 class IliasCrawler(HttpCrawler):
     def __init__(
         self,
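
What _url_set_query_param does, reduced to the stdlib calls it wraps: split the URL, overwrite one key in the parsed query, re-encode everything else unchanged. The example URL is made up; the limit value mirrors the pagination workaround above:

from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

url = "https://ilias.studium.kit.edu/ilias.php?cmd=view&limit=20"
scheme, netloc, path, query, fragment = urlsplit(url)
params = parse_qs(query)      # {'cmd': ['view'], 'limit': ['20']}
params["limit"] = ["800"]     # overwrites the existing value
print(urlunsplit((scheme, netloc, path, urlencode(params, doseq=True), fragment)))
# https://ilias.studium.kit.edu/ilias.php?cmd=view&limit=800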
@@ -386,22 +426,104 @@ class IliasCrawler(HttpCrawler):
             section.auth(authenticators),
             section.tfa_auth(authenticators)
         )
-        self._base_url = section.base_url()
-        self._course_id = section.course_id()
-        self._element_url = section.element_url()
+        self._base_url = "https://ilias.studium.kit.edu"
+        self._target = section.target()
 
     async def crawl(self) -> None:
-        async with self.crawl_bar(PurePath("/")) as bar:
-            soup = await self._get_page(self._base_url)
-            page = IliasPage(soup, self._base_url, None)
-            for element in page.get_child_elements():
-                self.print(element.name + " " + str(element.type))
+        if isinstance(self._target, int):
+            await self._crawl_course(self._target)
+        elif self._target == "desktop":
+            await self._crawl_desktop()
+        else:
+            await self._crawl_url(self._target)
+
+    async def _crawl_course(self, course_id: int) -> None:
+        # Start crawling at the given course
+        root_url = _url_set_query_param(
+            self._base_url + "/goto.php", "target", f"crs_{course_id}"
+        )
+
+        await self._crawl_url(root_url, expected_id=course_id)
+
+    async def _crawl_desktop(self) -> None:
+        await self._crawl_url(self._base_url)
+
+    async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
+        tasks = []
+
+        async with self.crawl_bar(PurePath("Root element")):
+            soup = await self._get_page(url)
+
+            if expected_id is not None:
+                perma_link_element: Tag = soup.find(id="current_perma_link")
+                if not perma_link_element or "crs_" not in perma_link_element.get("value"):
+                    # TODO: Properly handle error
+                    raise RuntimeError(
+                        "Invalid course id? I didn't find anything looking like a course!")
+
+            # Duplicated code, but the root page is special - we want to avoid fetching it twice!
+            page = IliasPage(soup, url, None)
+
+            for child in page.get_child_elements():
+                tasks.append(self._handle_ilias_element(PurePath("."), child))
+
+        await asyncio.gather(*tasks)
+
+    async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
+        tasks = []
+
+        async with self.crawl_bar(path):
+            soup = await self._get_page(url)
+            page = IliasPage(soup, url, parent)
+
+            for child in page.get_child_elements():
+                tasks.append(self._handle_ilias_element(path, child))
+
+        await asyncio.gather(*tasks)
+
+    async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
+        element_path = PurePath(parent_path, element.name)
+
+        if element.type == IliasElementType.FILE:
+            await self._download_element(element, element_path)
+        elif element.type == IliasElementType.FORUM:
+            # TODO: Delete
+            self.print(f"Skipping forum [green]{element_path}[/]")
+        elif element.type == IliasElementType.LINK:
+            # TODO: Write in meta-redirect file
+            self.print(f"Skipping link [green]{element_path}[/]")
+        elif element.type == IliasElementType.VIDEO:
+            await self._download_element(element, element_path)
+        elif element.type == IliasElementType.VIDEO_PLAYER:
+            # FIXME: Check if we should look at this and if not bail out already!
+            # This saves us a request for each video, if we skip them anyways
+            raise RuntimeError("IMPLEMENT ME")
+        elif element.type in _DIRECTORY_PAGES:
+            await self._handle_ilias_page(element.url, element, element_path)
+        else:
+            # TODO: Proper exception
+            raise RuntimeError(f"Unknown type: {element.type!r}")
+
+    async def _download_element(self, element: IliasPageElement, element_path: PurePath) -> None:
+        dl = await self.download(element_path, mtime=element.mtime)
+        if not dl:
+            return
+
+        async with self.download_bar(element_path) as bar, dl as sink,\
+                self.session.get(element.url) as resp:
+            if resp.content_length:
+                bar.set_total(resp.content_length)
+
+            async for data in resp.content.iter_chunked(1024):
+                sink.file.write(data)
+                bar.advance(len(data))
+
+            sink.done()
 
     async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup:
         if retries_left < 0:
             # TODO: Proper exception
             raise RuntimeError("Get page failed too often")
+        print(url)
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
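
A note on the download path: _download_element streams the response body in 1 KiB chunks via aiohttp's resp.content.iter_chunked() instead of buffering whole files in memory, and uses content_length (when the server sends it) to size the progress bar. A stripped-down sketch without the PFERD progress/sink plumbing; URL and file name are placeholders:

import asyncio

import aiohttp


async def download(url: str, dest: str) -> None:
    async with aiohttp.ClientSession() as session, session.get(url) as resp:
        total = resp.content_length  # None if the server omits Content-Length
        done = 0
        with open(dest, "wb") as file:
            async for chunk in resp.content.iter_chunked(1024):
                file.write(chunk)
                done += len(chunk)
                if total:
                    print(f"\r{done}/{total} bytes", end="")


asyncio.run(download("https://example.com/file.pdf", "file.pdf"))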