Start implementing crawling in ILIAS crawler
The ILIAS crawler can now crawl quite a few file types; it splits off folders and crawls them concurrently.
parent 1123c8884d
commit c7494e32ce
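The concurrent crawling the message describes shows up below in crawl(), _crawl_url() and _handle_ilias_page(): every child element of a page becomes its own task, and asyncio.gather runs those tasks side by side. A minimal, self-contained sketch of that fan-out pattern (fetch_children and the example URL are made-up placeholders, not PFERD code):

import asyncio
from typing import List


async def fetch_children(url: str) -> List[str]:
    # Stand-in for an HTTP request plus HTML parsing.
    await asyncio.sleep(0.1)
    return [f"{url}/child{i}" for i in range(3)]


async def crawl_folder(url: str, depth: int = 0) -> None:
    children = await fetch_children(url)
    if depth >= 2:
        return
    # Each child folder becomes its own task; gather awaits them concurrently,
    # mirroring how the crawler gathers its per-page _handle_ilias_element tasks.
    await asyncio.gather(*(crawl_folder(child, depth + 1) for child in children))


asyncio.run(crawl_folder("https://example.invalid/root"))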
@@ -1,3 +1,4 @@
+import asyncio
 import json
 import re
 from configparser import SectionProxy
@@ -5,8 +6,9 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
 from pathlib import PurePath
-from typing import Any, Dict, List, Optional
-from urllib.parse import urljoin, urlparse
+from typing import Any, Dict, List, Optional, Set, Union
+from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
+                          urlunsplit)
 
 import aiohttp
 from bs4 import BeautifulSoup, Tag
@@ -18,23 +20,27 @@ from ..config import Config
 from ..crawler import (Crawler, CrawlerSection, HttpCrawler, anoncritical,
                         arepeat)
 
+TargetType = Union[str, int]
+
 
 class IliasCrawlerSection(CrawlerSection):
 
-    def __init__(self, section: SectionProxy):
-        super().__init__(section)
-
-        if not self.course_id() and not self.element_url():
-            self.missing_value("course_id or element_url")
-
-    def course_id(self) -> Optional[str]:
-        return self.s.get("course_id")
-
-    def element_url(self) -> Optional[str]:
-        return self.s.get("element_url")
-
-    def base_url(self) -> str:
-        return self.s.get("ilias_url", "https://ilias.studium.kit.edu/")
+    def target(self) -> TargetType:
+        target = self.s.get("target")
+        if not target:
+            self.missing_value("target")
+
+        if re.fullmatch(r"\d+", target):
+            # Course id
+            return int(target)
+        if target == "desktop":
+            # Full personal desktop
+            return target
+        if target.startswith("https://ilias.studium.kit.edu"):
+            # ILIAS URL
+            return target
+
+        self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")
 
     def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
         value = self.s.get("tfa_auth")
@@ -66,7 +72,6 @@ class IliasPageElement:
     url: str
     name: str
     mtime: Optional[datetime] = None
-    query_parameter: Dict[str, str] = field(default_factory=dict)
 
 
 class IliasPage:
@@ -91,11 +96,17 @@ class IliasPage:
         return "paella_config_file" in str(self._soup)
 
     def _is_video_listing(self) -> bool:
+        # ILIAS fluff around it
         if self._soup.find(id="headerimage"):
             element: Tag = self._soup.find(id="headerimage")
             if "opencast" in element.attrs["src"].lower():
                 return True
-        return False
+
+        # Raw listing without ILIAS fluff
+        video_element_table: Tag = self._soup.find(
+            name="table", id=re.compile(r"tbl_xoct_.+")
+        )
+        return video_element_table is not None
 
     def _player_to_video(self) -> List[IliasPageElement]:
         # Fetch the actual video page. This is a small wrapper page initializing a javscript
@@ -137,9 +148,8 @@ class IliasPage:
             content_link: Tag = self._soup.select_one("#tab_series a")
             url: str = self._abs_url_from_link(content_link)
             query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
-            return [IliasPageElement(
-                IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "", query_parameter=query_params
-            )]
+            url = _url_set_query_params(url, query_params)
+            return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")]
 
         is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None
 
@@ -173,9 +183,8 @@ class IliasPage:
 
         query_params = {f"tbl_xoct_{table_id}_trows": "800",
                         "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
-        return [IliasPageElement(
-            IliasElementType.VIDEO_FOLDER, self._page_url, "", query_parameter=query_params
-        )]
+        url = _url_set_query_params(self._page_url, query_params)
+        return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")]
 
     def _find_video_entries_no_paging(self) -> List[IliasPageElement]:
         """
@@ -363,6 +372,7 @@ class IliasPage:
         """
         return urljoin(self._page_url, link_tag.get("href"))
 
+
 def demangle_date(date_str: str) -> Optional[datetime]:
     return None
 
@@ -371,6 +381,36 @@ def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-").strip()
 
 
+def _url_set_query_param(url: str, param: str, value: str) -> str:
+    """
+    Set a query parameter in an url, overwriting existing ones with the same name.
+    """
+    scheme, netloc, path, query, fragment = urlsplit(url)
+    query_parameters = parse_qs(query)
+    query_parameters[param] = [value]
+    new_query_string = urlencode(query_parameters, doseq=True)
+
+    return urlunsplit((scheme, netloc, path, new_query_string, fragment))
+
+
+def _url_set_query_params(url: str, params: Dict[str, str]) -> str:
+    result = url
+
+    for key, val in params.items():
+        result = _url_set_query_param(result, key, val)
+
+    return result
+
+
+_DIRECTORY_PAGES: Set[IliasElementType] = set([
+    IliasElementType.EXERCISE,
+    IliasElementType.FOLDER,
+    IliasElementType.MEETING,
+    IliasElementType.VIDEO_FOLDER,
+    IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
+])
+
+
 class IliasCrawler(HttpCrawler):
     def __init__(
             self,
@@ -386,22 +426,104 @@ class IliasCrawler(HttpCrawler):
             section.auth(authenticators),
             section.tfa_auth(authenticators)
         )
-        self._base_url = section.base_url()
+        self._base_url = "https://ilias.studium.kit.edu"
 
-        self._course_id = section.course_id()
-        self._element_url = section.element_url()
+        self._target = section.target()
 
     async def crawl(self) -> None:
-        async with self.crawl_bar(PurePath("/")) as bar:
-            soup = await self._get_page(self._base_url)
-            page = IliasPage(soup, self._base_url, None)
-            for element in page.get_child_elements():
-                self.print(element.name + " " + str(element.type))
+        if isinstance(self._target, int):
+            await self._crawl_course(self._target)
+        elif self._target == "desktop":
+            await self._crawl_desktop()
+        else:
+            await self._crawl_url(self._target)
+
+    async def _crawl_course(self, course_id: int) -> None:
+        # Start crawling at the given course
+        root_url = _url_set_query_param(
+            self._base_url + "/goto.php", "target", f"crs_{course_id}"
+        )
+
+        await self._crawl_url(root_url, expected_id=course_id)
+
+    async def _crawl_desktop(self) -> None:
+        await self._crawl_url(self._base_url)
+
+    async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
+        tasks = []
+
+        async with self.crawl_bar(PurePath("Root element")):
+            soup = await self._get_page(url)
+
+            if expected_id is not None:
+                perma_link_element: Tag = soup.find(id="current_perma_link")
+                if not perma_link_element or "crs_" not in perma_link_element.get("value"):
+                    # TODO: Properly handle error
+                    raise RuntimeError(
+                        "Invalid course id? I didn't find anything looking like a course!")
+
+            # Duplicated code, but the root page is special - we want to void fetching it twice!
+            page = IliasPage(soup, url, None)
+            for child in page.get_child_elements():
+                tasks.append(self._handle_ilias_element(PurePath("."), child))
+        await asyncio.gather(*tasks)
+
+    async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
+        tasks = []
+        async with self.crawl_bar(path):
+            soup = await self._get_page(url)
+            page = IliasPage(soup, url, parent)
+
+            for child in page.get_child_elements():
+                tasks.append(self._handle_ilias_element(path, child))
+
+        await asyncio.gather(*tasks)
+
+    async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
+        element_path = PurePath(parent_path, element.name)
+
+        if element.type == IliasElementType.FILE:
+            await self._download_element(element, element_path)
+        elif element.type == IliasElementType.FORUM:
+            # TODO: Delete
+            self.print(f"Skipping forum [green]{element_path}[/]")
+        elif element.type == IliasElementType.LINK:
+            # TODO: Write in meta-redirect file
+            self.print(f"Skipping link [green]{element_path}[/]")
+        elif element.type == IliasElementType.VIDEO:
+            await self._download_element(element, element_path)
+        elif element.type == IliasElementType.VIDEO_PLAYER:
+            # FIXME: Check if we should look at this and if not bail out already!
+            # This saves us a request for each video, if we skip them anyways
+            raise RuntimeError("IMPLEMENT ME")
+        elif element.type in _DIRECTORY_PAGES:
+            await self._handle_ilias_page(element.url, element, element_path)
+        else:
+            # TODO: Proper exception
+            raise RuntimeError(f"Unknown type: {element.type!r}")
+
+    async def _download_element(self, element: IliasPageElement, element_path: PurePath) -> None:
+        dl = await self.download(element_path, mtime=element.mtime)
+        if not dl:
+            return
+
+        async with self.download_bar(element_path) as bar, dl as sink,\
+                self.session.get(element.url) as resp:
+
+            if resp.content_length:
+                bar.set_total(resp.content_length)
+
+            async for data in resp.content.iter_chunked(1024):
+                sink.file.write(data)
+                bar.advance(len(data))
+
+            sink.done()
 
     async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup:
         if retries_left < 0:
             # TODO: Proper exception
             raise RuntimeError("Get page failed too often")
+        print(url)
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
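For reference, the query-parameter helpers added in this commit can be exercised on their own. The snippet below repeats _url_set_query_param from the diff and shows the kind of course entry point _crawl_course builds; the concrete course id 12345 is only an example, not taken from the commit:

from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit


def _url_set_query_param(url: str, param: str, value: str) -> str:
    # Overwrite (or add) a single query parameter, keeping the rest of the URL intact.
    scheme, netloc, path, query, fragment = urlsplit(url)
    query_parameters = parse_qs(query)
    query_parameters[param] = [value]
    return urlunsplit((scheme, netloc, path, urlencode(query_parameters, doseq=True), fragment))


print(_url_set_query_param("https://ilias.studium.kit.edu/goto.php", "target", "crs_12345"))
# https://ilias.studium.kit.edu/goto.php?target=crs_12345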