Handle exercise detail containers in ILIAS html parser

This commit is contained in:
I-Al-Istannen 2021-05-24 16:22:51 +02:00
parent d44f6966c2
commit 342076ee0e
2 changed files with 62 additions and 4 deletions

View File

@ -16,6 +16,7 @@ TargetType = Union[str, int]
class IliasElementType(Enum): class IliasElementType(Enum):
EXERCISE = "exercise" EXERCISE = "exercise"
EXERCISE_FILES = "exercise_files" # own submitted files
FILE = "file" FILE = "file"
FOLDER = "folder" FOLDER = "folder"
FORUM = "forum" FORUM = "forum"
@ -197,6 +198,43 @@ class IliasPage:
return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time)
def _find_exercise_entries(self) -> List[IliasPageElement]:
    """
    Dispatch to the matching exercise parser for the current page.

    A page that contains an element with the id "tab_submission" is parsed as
    an exercise detail page; every other page is parsed as an exercise root
    page.
    """
    submission_tab = self._soup.find(id="tab_submission")

    if not submission_tab:
        log.explain("Found no submission tab. This is an exercise root page")
        return self._find_exercise_entries_root_page()

    log.explain("Found submission tab. This is an exercise detail page")
    return self._find_exercise_entries_detail_page()
def _find_exercise_entries_detail_page(self) -> List[IliasPageElement]:
    """
    Collect the downloadable files listed on an exercise detail page.

    Every link whose href contains "cmd=download" and whose text is "Download"
    is expected to sit in a table row laid out as
    <checkbox> <name> <uploader> <date> <download>. Rows that do not match
    this layout are reported as unexpected HTML and skipped, so that a single
    odd row does not abort the whole page with an AttributeError/IndexError.

    Returns one FILE element per well-formed row, carrying the sanitized file
    name and the result of demangle_date for the date column.
    """
    results: List[IliasPageElement] = []

    # Find all download links on the page (this will contain all the files)
    download_links: List[Tag] = self._soup.findAll(
        name="a",
        # download links carry the download command in their href
        attrs={"href": lambda x: x and "cmd=download" in x},
        text="Download"
    )

    for link in download_links:
        url = self._abs_url_from_link(link)

        # findParent returns None when the link is not nested inside a table row
        parent_row = link.findParent("tr")
        if parent_row is None:
            _unexpected_html_warning()
            log.warn_contd(f"Found exercise download link outside of a table row for {url}")
            continue

        children: List[Tag] = parent_row.findChildren("td")
        # <checkbox> <name> <uploader> <date> <download>
        #     0        1        2         3        4
        # Only the name (1) and date (3) cells are read below
        if len(children) < 4:
            _unexpected_html_warning()
            log.warn_contd(f"Found exercise download row with too few columns for {url}")
            continue

        name = _sanitize_path_name(children[1].getText().strip())
        date = demangle_date(children[3].getText().strip())

        log.explain(f"Found exercise detail entry {name!r}")
        results.append(IliasPageElement(
            IliasElementType.FILE,
            url,
            name,
            date
        ))

    return results
def _find_exercise_entries_root_page(self) -> List[IliasPageElement]:
results: List[IliasPageElement] = [] results: List[IliasPageElement] = []
# Each assignment is in an accordion container # Each assignment is in an accordion container
@ -205,6 +243,8 @@ class IliasPage:
for container in assignment_containers: for container in assignment_containers:
# Fetch the container name out of the header to use it in the path # Fetch the container name out of the header to use it in the path
container_name = container.select_one(".ilAssignmentHeader").getText().strip() container_name = container.select_one(".ilAssignmentHeader").getText().strip()
log.explain(f"Found exercise container {container_name!r}")
# Find all download links in the container (this will contain all the files) # Find all download links in the container (this will contain all the files)
files: List[Tag] = container.findAll( files: List[Tag] = container.findAll(
name="a", name="a",
@ -213,8 +253,6 @@ class IliasPage:
text="Download" text="Download"
) )
log.explain(f"Found exercise container {container_name!r}")
# Grab each file as you now have the link # Grab each file as you now have the link
for file_link in files: for file_link in files:
# Two divs, side by side. Left is the name, right is the link ==> get left # Two divs, side by side. Left is the name, right is the link ==> get left
@ -231,6 +269,25 @@ class IliasPage:
None # We do not have any timestamp None # We do not have any timestamp
)) ))
# Find all links to file listings (e.g. "Submitted Files" for groups)
file_listings: List[Tag] = container.findAll(
name="a",
# file listing links contain the given command class
attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
)
# Add each listing as a new EXERCISE_FILES element so it is crawled like a directory
for listing in file_listings:
file_name = _sanitize_path_name(listing.getText().strip())
url = self._abs_url_from_link(listing)
log.explain(f"Found exercise detail {file_name!r} at {url}")
results.append(IliasPageElement(
IliasElementType.EXERCISE_FILES,
url,
container_name + "/" + file_name,
None # we do not have any timestamp
))
return results return results
def _find_normal_entries(self) -> List[IliasPageElement]: def _find_normal_entries(self) -> List[IliasPageElement]:
@ -349,7 +406,7 @@ class IliasPage:
if found_parent is None: if found_parent is None:
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url!r}") log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url}")
return None return None
# Find the small descriptive icon to figure out the type # Find the small descriptive icon to figure out the type
@ -357,7 +414,7 @@ class IliasPage:
if img_tag is None: if img_tag is None:
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an image for {url!r}") log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
return None return None
if "opencast" in str(img_tag["alt"]).lower(): if "opencast" in str(img_tag["alt"]).lower():

View File

@ -61,6 +61,7 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
_DIRECTORY_PAGES: Set[IliasElementType] = set([ _DIRECTORY_PAGES: Set[IliasElementType] = set([
IliasElementType.EXERCISE, IliasElementType.EXERCISE,
IliasElementType.EXERCISE_FILES,
IliasElementType.FOLDER, IliasElementType.FOLDER,
IliasElementType.MEETING, IliasElementType.MEETING,
IliasElementType.VIDEO_FOLDER, IliasElementType.VIDEO_FOLDER,