WIP: ilias-crawler: Demangle dates

This commit is contained in:
I-Al-Istannen 2020-04-22 12:44:41 +02:00
parent ac65b06a8e
commit 23db59e733
2 changed files with 40 additions and 9 deletions

View File

@ -16,6 +16,7 @@ import bs4
from ..cookie_jar import CookieJar from ..cookie_jar import CookieJar
from ..utils import soupify from ..utils import soupify
from .authenticators import IliasAuthenticator from .authenticators import IliasAuthenticator
from .date_demangler import demangle_date
from .downloader import IliasDownloadInfo from .downloader import IliasDownloadInfo
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
@ -102,10 +103,17 @@ class IliasCrawler:
).select_one(".il_ItemProperties") ).select_one(".il_ItemProperties")
file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip()
modifcation_date = datetime.datetime.now()
all_properties_text = properties_parent.getText().strip() all_properties_text = properties_parent.getText().strip()
print("Property text is", all_properties_text) modification_date_match = re.search(
# todo demangle date from text above r"(((\d+\. \w+ \d+)|(Gestern)|(Heute)), \d+:\d+)",
all_properties_text
)
if modification_date_match is None:
modification_date = datetime.datetime.now()
LOGGER.warning("Could not extract start date from %r", all_properties_text)
else:
modification_date_str = modification_date_match.group(1)
modification_date = demangle_date(modification_date_str)
name = link_element.getText() name = link_element.getText()
full_path = Path(path, name + "." + file_type) full_path = Path(path, name + "." + file_type)
@ -116,7 +124,7 @@ class IliasCrawler:
LOGGER.warning("Could not download file %r", url) LOGGER.warning("Could not download file %r", url)
return [] return []
return [IliasDownloadInfo(full_path, url, modifcation_date)] return [IliasDownloadInfo(full_path, url, modification_date)]
def _switch_on_folder_like( def _switch_on_folder_like(
self, self,
@ -184,6 +192,7 @@ class IliasCrawler:
title = link.parent.parent.parent.select_one( title = link.parent.parent.parent.select_one(
"td.std:nth-child(3)" "td.std:nth-child(3)"
).getText().strip() ).getText().strip()
title += ".mp4"
video_page_soup = self._get_page(video_page_url, {}) video_page_soup = self._get_page(video_page_url, {})
regex: re.Pattern = re.compile( regex: re.Pattern = re.compile(
@ -243,7 +252,7 @@ class IliasCrawler:
def _is_logged_in(soup: bs4.BeautifulSoup) -> bool: def _is_logged_in(soup: bs4.BeautifulSoup) -> bool:
userlog = soup.find("li", {"id": "userlog"}) userlog = soup.find("li", {"id": "userlog"})
if userlog is not None: if userlog is not None:
print("Found userlog") LOGGER.debug("Auth: Found #userlog")
return True return True
video_table = soup.find( video_table = soup.find(
recursive=True, recursive=True,
@ -251,19 +260,17 @@ class IliasCrawler:
attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
) )
if video_table is not None: if video_table is not None:
print("Found video") LOGGER.debug("Auth: Found #tbl_xoct.+")
return True return True
if soup.select_one("#playerContainer") is not None: if soup.select_one("#playerContainer") is not None:
print("Found player") LOGGER.debug("Auth: Found #playerContainer")
return True return True
print("Ooops: ", soup)
return False return False
def run_as_test(ilias_url: str, course_id: int) -> List[IliasDownloadInfo]: def run_as_test(ilias_url: str, course_id: int) -> List[IliasDownloadInfo]:
from ..organizer import Organizer from ..organizer import Organizer
from .authenticators import KitShibbolethAuthenticator from .authenticators import KitShibbolethAuthenticator
organizer = Organizer(Path("/tmp/test/inner"))
crawler = IliasCrawler(KitShibbolethAuthenticator(), ilias_url, str(course_id)) crawler = IliasCrawler(KitShibbolethAuthenticator(), ilias_url, str(course_id))
return crawler.crawl() return crawler.crawl()

View File

@ -0,0 +1,24 @@
"""
Helper methods to demangle an ILIAS date.
"""
import datetime
import re
# Month lookup keyed by the lower-cased first three letters of the month name.
# Both the German spellings used by ILIAS and the English ones are listed so
# that parsing does not depend on the locale of the running process.
_MONTHS_BY_PREFIX = {
    "jan": 1,
    "feb": 2,
    "mär": 3, "mrz": 3, "mar": 3,
    "apr": 4,
    "mai": 5, "may": 5,
    "jun": 6,
    "jul": 7,
    "aug": 8,
    "sep": 9,
    "okt": 10, "oct": 10,
    "nov": 11,
    "dez": 12, "dec": 12,
}


def demangle_date(date: str) -> datetime.datetime:
    """
    Demangle a given date in one of the following formats:
    "Gestern, HH:MM"
    "Heute, HH:MM"
    "dd. mon yyyy, HH:MM"

    "Gestern" (yesterday) and "Heute" (today) are resolved against the
    current local date. Month names are matched by their first three letters
    against a fixed German/English table instead of relying on the
    locale-dependent "%b" directive of strptime, so "Okt", "Dez", "Mär" or
    "Mai" parse correctly no matter which locale the process runs in.

    Any run of whitespace in the input is treated as a single space.

    Returns a naive datetime with seconds set to zero.
    Raises ValueError if the text has an unknown shape, names an unknown
    month or describes an impossible date or time.
    """
    normalized = re.sub(r"\s+", " ", date).strip()
    match = re.fullmatch(
        r"(?:(Gestern|Heute)|(\d{1,2})\. ?([^\W\d_]+)\.? ?(\d{4})), ?(\d{1,2}):(\d{1,2})",
        normalized
    )
    if match is None:
        raise ValueError(f"Unrecognized ILIAS date: {date!r}")

    relative_day, day, month_name, year, hour, minute = match.groups()
    # datetime.time validates the hour/minute range and raises ValueError itself
    time_of_day = datetime.time(int(hour), int(minute))

    if relative_day == "Heute":
        return datetime.datetime.combine(datetime.date.today(), time_of_day)
    if relative_day == "Gestern":
        # Only look up yesterday's date when the input actually refers to it
        return datetime.datetime.combine(_yesterday(), time_of_day)

    month = _MONTHS_BY_PREFIX.get(month_name[:3].lower())
    if month is None:
        raise ValueError(f"Unknown month {month_name!r} in ILIAS date: {date!r}")

    # datetime.date rejects impossible calendar days (e.g. 31. Feb) with ValueError
    calendar_day = datetime.date(int(year), month, int(day))
    return datetime.datetime.combine(calendar_day, time_of_day)
def _yesterday() -> datetime.date:
    """Return the local calendar date one day before today."""
    one_day = datetime.timedelta(days=1)
    today = datetime.date.today()
    return today - one_day