mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Fix personal desktop crawling
This commit is contained in:
parent
462d993fbc
commit
6f3cfd4396
@ -39,7 +39,12 @@ class IliasPageElement:
|
|||||||
description: Optional[str] = None
|
description: Optional[str] = None
|
||||||
|
|
||||||
def id(self) -> str:
|
def id(self) -> str:
|
||||||
regexes = [r"eid=(?P<id>[0-9a-z\-]+)", r"file_(?P<id>\d+)", r"ref_id=(?P<id>\d+)"]
|
regexes = [
|
||||||
|
r"eid=(?P<id>[0-9a-z\-]+)",
|
||||||
|
r"file_(?P<id>\d+)",
|
||||||
|
r"ref_id=(?P<id>\d+)",
|
||||||
|
r"target=[a-z]+_(?P<id>\d+)"
|
||||||
|
]
|
||||||
|
|
||||||
for regex in regexes:
|
for regex in regexes:
|
||||||
if match := re.search(regex, self.url):
|
if match := re.search(regex, self.url):
|
||||||
@ -71,6 +76,9 @@ class IliasPage:
|
|||||||
if self._is_exercise_file():
|
if self._is_exercise_file():
|
||||||
log.explain("Page is an exercise, searching for elements")
|
log.explain("Page is an exercise, searching for elements")
|
||||||
return self._find_exercise_entries()
|
return self._find_exercise_entries()
|
||||||
|
if self._is_personal_desktop():
|
||||||
|
log.explain("Page is the personal desktop")
|
||||||
|
return self._find_personal_desktop_entries()
|
||||||
log.explain("Page is a normal folder, searching for elements")
|
log.explain("Page is a normal folder, searching for elements")
|
||||||
return self._find_normal_entries()
|
return self._find_normal_entries()
|
||||||
|
|
||||||
@ -115,6 +123,9 @@ class IliasPage:
|
|||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _is_personal_desktop(self) -> bool:
    """Report whether this page is the personal desktop.

    The page counts as the personal desktop when it contains at least one
    anchor whose ``href`` includes ``block_type=pditems``.

    Returns:
        True if such an anchor exists, False otherwise.
    """
    # find() hands back the matching tag or None; compare against None so the
    # method returns the bool its signature promises instead of leaking a tag.
    marker = self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
    return marker is not None
|
||||||
|
|
||||||
def _player_to_video(self) -> List[IliasPageElement]:
|
def _player_to_video(self) -> List[IliasPageElement]:
|
||||||
# Fetch the actual video page. This is a small wrapper page initializing a JavaScript
|
# Fetch the actual video page. This is a small wrapper page initializing a JavaScript
|
||||||
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
|
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
|
||||||
@ -149,6 +160,26 @@ class IliasPage:
|
|||||||
|
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
def _find_personal_desktop_entries(self) -> List[IliasPageElement]:
    """Collect the entries listed on the personal desktop page.

    Every element matching the ``.il-item-title`` selector is expected to
    wrap a link. The stripped link text is passed through
    ``_sanitize_path_name`` to form the element name, and the link is
    resolved to a URL via ``_abs_url_from_link``.

    Titles without a link, and links whose type cannot be determined, are
    reported with a warning and skipped.

    Returns:
        The page elements found, in document order.
    """
    items: List[IliasPageElement] = []

    titles: List[Tag] = self._soup.select(".il-item-title")
    for title in titles:
        link = title.find("a")
        if link is None:
            # A title without an anchor has nothing to crawl; dereferencing
            # it below would raise AttributeError and abort the whole page.
            _unexpected_html_warning()
            log.warn_contd(f"Could not find a link in {title}")
            continue

        name = _sanitize_path_name(link.text.strip())
        url = self._abs_url_from_link(link)

        # Named element_type so the builtin type() is not shadowed.
        element_type = self._find_type_from_link(name, link, url)
        if not element_type:
            _unexpected_html_warning()
            log.warn_contd(f"Could not extract type for {link}")
            continue

        log.explain(f"Found {name!r}")
        items.append(IliasPageElement(element_type, url, name))

    return items
|
||||||
|
|
||||||
def _find_video_entries(self) -> List[IliasPageElement]:
|
def _find_video_entries(self) -> List[IliasPageElement]:
|
||||||
# ILIAS has three stages for video pages
|
# ILIAS has three stages for video pages
|
||||||
# 1. The initial dummy page without any videos. This page contains the link to the listing
|
# 1. The initial dummy page without any videos. This page contains the link to the listing
|
||||||
@ -551,9 +582,30 @@ class IliasPage:
|
|||||||
if "target=file_" in parsed_url.query:
|
if "target=file_" in parsed_url.query:
|
||||||
return IliasElementType.FILE
|
return IliasElementType.FILE
|
||||||
|
|
||||||
|
if "target=grp_" in parsed_url.query:
|
||||||
|
return IliasElementType.FOLDER
|
||||||
|
|
||||||
|
if "target=crs_" in parsed_url.query:
|
||||||
|
return IliasElementType.FOLDER
|
||||||
|
|
||||||
|
if "baseClass=ilExerciseHandlerGUI" in parsed_url.query:
|
||||||
|
return IliasElementType.EXERCISE
|
||||||
|
|
||||||
|
if "baseClass=ilLinkResourceHandlerGUI" in parsed_url.query and "calldirectlink" in parsed_url.query:
|
||||||
|
return IliasElementType.LINK
|
||||||
|
|
||||||
|
if "cmd=showThreads" in parsed_url.query or "target=frm_" in parsed_url.query:
|
||||||
|
return IliasElementType.FORUM
|
||||||
|
|
||||||
|
if "cmdClass=ilobjtestgui" in parsed_url.query:
|
||||||
|
return IliasElementType.TEST
|
||||||
|
|
||||||
|
# Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
|
||||||
|
# try to guess it from the image.
|
||||||
|
|
||||||
# Everything with a ref_id can *probably* be opened to reveal nested things
|
# Everything with a ref_id can *probably* be opened to reveal nested things
|
||||||
# video groups, directories, exercises, etc
|
# video groups, directories, exercises, etc
|
||||||
if "ref_id=" in parsed_url.query:
|
if "ref_id=" in parsed_url.query or "goto.php" in parsed_url.path:
|
||||||
return IliasPage._find_type_from_folder_like(link_element, url)
|
return IliasPage._find_type_from_folder_like(link_element, url)
|
||||||
|
|
||||||
_unexpected_html_warning()
|
_unexpected_html_warning()
|
||||||
@ -574,7 +626,7 @@ class IliasPage:
|
|||||||
# We look for the outer div of our inner link, to find information around it
|
# We look for the outer div of our inner link, to find information around it
|
||||||
# (mostly the icon)
|
# (mostly the icon)
|
||||||
for parent in link_element.parents:
|
for parent in link_element.parents:
|
||||||
if "ilContainerListItemOuter" in parent["class"]:
|
if "ilContainerListItemOuter" in parent["class"] or "il-std-item" in parent["class"]:
|
||||||
found_parent = parent
|
found_parent = parent
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -586,6 +638,9 @@ class IliasPage:
|
|||||||
# Find the small descriptive icon to figure out the type
|
# Find the small descriptive icon to figure out the type
|
||||||
img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon")
|
img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon")
|
||||||
|
|
||||||
|
if img_tag is None:
|
||||||
|
img_tag = found_parent.select_one("img.icon")
|
||||||
|
|
||||||
if img_tag is None:
|
if img_tag is None:
|
||||||
_unexpected_html_warning()
|
_unexpected_html_warning()
|
||||||
log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
|
log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
|
||||||
|
@ -203,7 +203,9 @@ instance's greatest bottleneck.
|
|||||||
await self._crawl_url(root_url, expected_id=course_id)
|
await self._crawl_url(root_url, expected_id=course_id)
|
||||||
|
|
||||||
async def _crawl_desktop(self) -> None:
|
async def _crawl_desktop(self) -> None:
|
||||||
await self._crawl_url(self._base_url)
|
appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items"
|
||||||
|
appendix = appendix.encode("ASCII").hex()
|
||||||
|
await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix)
|
||||||
|
|
||||||
async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
|
async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
|
||||||
maybe_cl = await self.crawl(PurePath("."))
|
maybe_cl = await self.crawl(PurePath("."))
|
||||||
@ -622,6 +624,11 @@ instance's greatest bottleneck.
|
|||||||
if mainbar is not None:
|
if mainbar is not None:
|
||||||
login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x})
|
login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x})
|
||||||
return not login_button
|
return not login_button
|
||||||
|
|
||||||
|
# Personal Desktop
|
||||||
|
if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
|
||||||
|
return True
|
||||||
|
|
||||||
# Video listing embeds do not have complete ILIAS html. Try to match them by
|
# Video listing embeds do not have complete ILIAS html. Try to match them by
|
||||||
# their video listing table
|
# their video listing table
|
||||||
video_table = soup.find(
|
video_table = soup.find(
|
||||||
|
Loading…
Reference in New Issue
Block a user