Add support for exercises in ILIAS crawler

This commit is contained in:
I-Al-Istannen 2021-05-15 21:40:17 +02:00
parent 7d323ec62b
commit c454fabc9d

View File

@ -93,6 +93,8 @@ class IliasPage:
return self._player_to_video() return self._player_to_video()
if self._is_video_listing(): if self._is_video_listing():
return self._find_video_entries() return self._find_video_entries()
if self._is_exercise_file():
return self._find_exercise_entries()
return self._find_normal_entries() return self._find_normal_entries()
def _is_video_player(self) -> bool: def _is_video_player(self) -> bool:
@ -111,6 +113,19 @@ class IliasPage:
) )
return video_element_table is not None return video_element_table is not None
def _is_exercise_file(self) -> bool:
# we know it from before
if self._page_type == IliasElementType.EXERCISE:
return True
# We have no suitable parent - let's guesss
if self._soup.find(id="headerimage"):
element: Tag = self._soup.find(id="headerimage")
if "exc" in element.attrs["src"].lower():
return True
return False
def _player_to_video(self) -> List[IliasPageElement]: def _player_to_video(self) -> List[IliasPageElement]:
# Fetch the actual video page. This is a small wrapper page initializing a javscript # Fetch the actual video page. This is a small wrapper page initializing a javscript
# player. Sadly we can not execute that JS. The actual video stream url is nowhere # player. Sadly we can not execute that JS. The actual video stream url is nowhere
@ -223,6 +238,40 @@ class IliasPage:
return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time)
def _find_exercise_entries(self) -> List[IliasPageElement]:
results: List[IliasPageElement] = []
# Each assignment is in an accordion container
assignment_containers: List[Tag] = self._soup.select(".il_VAccordionInnerContainer")
for container in assignment_containers:
# Fetch the container name out of the header to use it in the path
container_name = container.select_one(".ilAssignmentHeader").getText().strip()
# Find all download links in the container (this will contain all the files)
files: List[Tag] = container.findAll(
name="a",
# download links contain the given command class
attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x},
text="Download"
)
# Grab each file as you now have the link
for file_link in files:
# Two divs, side by side. Left is the name, right is the link ==> get left
# sibling
file_name = file_link.parent.findPrevious(name="div").getText().strip()
file_name = _sanitize_path_name(file_name)
url = self._abs_url_from_link(file_link)
results.append(IliasPageElement(
IliasElementType.FILE,
url,
container_name + "/" + file_name,
None # We do not have any timestamp
))
return results
def _find_normal_entries(self) -> List[IliasPageElement]: def _find_normal_entries(self) -> List[IliasPageElement]:
result: List[IliasPageElement] = [] result: List[IliasPageElement] = []