WIP: ilias crawler: Also crawl assignments

This commit is contained in:
I-Al-Istannen 2020-04-22 14:32:20 +02:00
parent 23db59e733
commit eb7df036df
2 changed files with 56 additions and 8 deletions

View File

@ -78,6 +78,7 @@ class IliasCrawler:
LOGGER.debug("Parsed url: %r", parsed_url)
if "target=file_" in parsed_url.query:
LOGGER.debug("Interpreted as file.")
return self._crawl_file(path, link_element, url)
# Skip forums
@ -153,14 +154,18 @@ class IliasCrawler:
LOGGER.debug("Skipping forum at %r", url)
return []
element_path = Path(path, link_element.getText().strip())
if str(img_tag["src"]).endswith("icon_exc.svg"):
LOGGER.debug("Crawling exercises at %r", url)
return self._crawl_exercises(element_path, url)
if "opencast" in str(img_tag["alt"]).lower():
LOGGER.debug("Found video site: %r", url)
return self._crawl_video_directory(path, url)
return self._crawl_video_directory(element_path, url)
# Assume it is a folder
folder_name = link_element.getText()
folder_path = Path(path, folder_name)
return self._crawl_folder(folder_path, self._abs_url_from_link(link_element))
return self._crawl_folder(element_path, self._abs_url_from_link(link_element))
def _crawl_video_directory(self, path: Path, url: str) -> List[IliasDownloadInfo]:
initial_soup = self._get_page(url, {})
@ -210,6 +215,43 @@ class IliasCrawler:
return [IliasDownloadInfo(Path(path, title), video_url, modification_time)]
def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasDownloadInfo]:
soup = self._get_page(url, {})
results: List[IliasDownloadInfo] = []
assignment_containers: List[bs4.Tag] = soup.select(".il_VAccordionInnerContainer")
for container in assignment_containers:
container_name = container.select_one(".ilAssignmentHeader").getText().strip()
files: List[bs4.Tag] = container.findAll(
name="a",
attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x},
text="Download"
)
LOGGER.debug("Found exercise container %r", container_name)
end_date: datetime.datetime = datetime.datetime.now()
end_date_header: bs4.Tag = container.find(name="div", text="Abgabetermin")
if end_date_header is not None:
end_date_text = end_date_header.findNext("div").getText().strip()
end_date = demangle_date(end_date_text)
for file_link in files:
file_name = file_link.parent.findPrevious(name="div").getText().strip()
url = self._abs_url_from_link(file_link)
LOGGER.debug("Found file %r at %r", file_name, url)
results.append(IliasDownloadInfo(
Path(element_path, container_name, file_name),
url,
end_date
))
return results
def _crawl_folder(self, path: Path, url: str) -> List[IliasDownloadInfo]:
soup = self._get_page(url, {})

View File

@ -3,6 +3,7 @@ Helper methods to demangle an ILIAS date.
"""
import datetime
import locale
import re
@ -13,11 +14,16 @@ def demangle_date(date: str) -> datetime.datetime:
"Heute, HH:MM"
"dd. mon.yyyy, HH:MM
"""
date = re.sub(r"\s+", " ", date)
date = date.replace("Gestern", _yesterday().strftime("%d. %b %Y"))
date = date.replace("Heute", datetime.date.today().strftime("%d. %b %Y"))
saved = locale.setlocale(locale.LC_ALL)
try:
locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
return datetime.datetime.strptime(date, "%d. %b %Y, %H:%M")
date = re.sub(r"\s+", " ", date)
date = date.replace("Gestern", _yesterday().strftime("%d. %b %Y"))
date = date.replace("Heute", datetime.date.today().strftime("%d. %b %Y"))
return datetime.datetime.strptime(date, "%d. %b %Y, %H:%M")
finally:
locale.setlocale(locale.LC_ALL, saved)
def _yesterday() -> datetime.date: