From 23db59e733ec2fb5d76b915fb79540aaa7531f47 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 22 Apr 2020 12:44:41 +0200 Subject: [PATCH] WIP: ilias-crawler: Demangle dates --- PFERD/ilias/crawler.py | 25 ++++++++++++++++--------- PFERD/ilias/date_demangler.py | 24 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 PFERD/ilias/date_demangler.py diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index 451b601..6fecd57 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -16,6 +16,7 @@ import bs4 from ..cookie_jar import CookieJar from ..utils import soupify from .authenticators import IliasAuthenticator +from .date_demangler import demangle_date from .downloader import IliasDownloadInfo LOGGER = logging.getLogger(__name__) @@ -102,10 +103,17 @@ class IliasCrawler: ).select_one(".il_ItemProperties") file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() - modifcation_date = datetime.datetime.now() all_properties_text = properties_parent.getText().strip() - print("Property text is", all_properties_text) - # todo demangle date from text above + modification_date_match = re.search( + r"(((\d+\. \w+ \d+)|(Gestern)|(Heute)), \d+:\d+)", + all_properties_text + ) + if modification_date_match is None: + modification_date = datetime.datetime.now() + LOGGER.warning("Could not extract start date from %r", all_properties_text) + else: + modification_date_str = modification_date_match.group(1) + modification_date = demangle_date(modification_date_str) name = link_element.getText() full_path = Path(path, name + "." 
+ file_type) @@ -116,7 +124,7 @@ class IliasCrawler: LOGGER.warning("Could not download file %r", url) return [] - return [IliasDownloadInfo(full_path, url, modifcation_date)] + return [IliasDownloadInfo(full_path, url, modification_date)] def _switch_on_folder_like( self, @@ -184,6 +192,7 @@ class IliasCrawler: title = link.parent.parent.parent.select_one( "td.std:nth-child(3)" ).getText().strip() + title += ".mp4" video_page_soup = self._get_page(video_page_url, {}) regex: re.Pattern = re.compile( @@ -243,7 +252,7 @@ class IliasCrawler: def _is_logged_in(soup: bs4.BeautifulSoup) -> bool: userlog = soup.find("li", {"id": "userlog"}) if userlog is not None: - print("Found userlog") + LOGGER.debug("Auth: Found #userlog") return True video_table = soup.find( recursive=True, @@ -251,19 +260,17 @@ class IliasCrawler: attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} ) if video_table is not None: - print("Found video") + LOGGER.debug("Auth: Found #tbl_xoct.+") return True if soup.select_one("#playerContainer") is not None: - print("Found player") + LOGGER.debug("Auth: Found #playerContainer") return True - print("Ooops: ", soup) return False def run_as_test(ilias_url: str, course_id: int) -> List[IliasDownloadInfo]: from ..organizer import Organizer from .authenticators import KitShibbolethAuthenticator - organizer = Organizer(Path("/tmp/test/inner")) crawler = IliasCrawler(KitShibbolethAuthenticator(), ilias_url, str(course_id)) return crawler.crawl() diff --git a/PFERD/ilias/date_demangler.py b/PFERD/ilias/date_demangler.py new file mode 100644 index 0000000..1043f09 --- /dev/null +++ b/PFERD/ilias/date_demangler.py @@ -0,0 +1,24 @@ +""" +Helper methods to demangle an ILIAS date. +""" + +import datetime +import re + + +def demangle_date(date: str) -> datetime.datetime: + """ + Demangle a given date in one of the following formats: + "Gestern, HH:MM" + "Heute, HH:MM" + "dd. 
mon yyyy, HH:MM" + """ + date = re.sub(r"\s+", " ", date) + date = date.replace("Gestern", _yesterday().strftime("%d. %b %Y")) + date = date.replace("Heute", datetime.date.today().strftime("%d. %b %Y")) + + return datetime.datetime.strptime(date, "%d. %b %Y, %H:%M") + + +def _yesterday() -> datetime.date: + return datetime.date.today() - datetime.timedelta(days=1)