diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d93684c..6d063b6 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -746,17 +746,26 @@ class IliasPage: Normalizes meeting names, which have a relative time as their first part, to their date in ISO format. """ - date_portion_str = meeting_name.split(" - ")[0] + + # This checks whether we can reach a `:` without passing a `-` + if re.search(r"^[^-]+: ", meeting_name): + # Meeting name only contains date: "05. Jan 2000:" + split_delimiter = ":" + else: + # Meeting name contains date and start/end times: "05. Jan 2000, 16:00 - 17:30:" + split_delimiter = ", " + + # We have a meeting day without time + date_portion_str = meeting_name.split(split_delimiter)[0] date_portion = demangle_date(date_portion_str) + # We failed to parse the date, bail out if not date_portion: return meeting_name - rest_of_name = meeting_name - if rest_of_name.startswith(date_portion_str): - rest_of_name = rest_of_name[len(date_portion_str):] - - return datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") + rest_of_name + # Replace the first section with the absolute date + rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) + return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name def _abs_url_from_link(self, link_tag: Tag) -> str: """ @@ -781,17 +790,15 @@ english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[datetime]: """ - Demangle a given date in one of the following formats: + Demangle a given date in one of the following formats (hour/minute part is optional): "Gestern, HH:MM" "Heute, HH:MM" "Morgen, HH:MM" "dd. mon yyyy, HH:MM """ try: + # Normalize whitespace because users date_str = re.sub(r"\s+", " ", date_str) - date_str = re.sub("(Gestern|Yesterday):", "", date_str, re.I) - date_str = re.sub("(Heute|Today):", "", date_str, re.I) - date_str = re.sub("(Morgen|Tomorrow):", "", date_str, re.I) date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) @@ -802,19 +809,28 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" date_str = date_str.replace(english + ".", english) - # We now have a nice english String in the format: "dd. mmm yyyy, hh:mm" - day_part, time_part = date_str.split(",") + # We now have a nice english String in the format: "dd. mmm yyyy, hh:mm" or "dd. mmm yyyy" + + # Check if we have a time as well + if ", " in date_str: + day_part, time_part = date_str.split(",") + else: + day_part = date_str.split(",")[0] + time_part = None + day_str, month_str, year_str = day_part.split(" ") day = int(day_str.strip().replace(".", "")) month = english_months.index(month_str.strip()) + 1 year = int(year_str.strip()) - hour_str, minute_str = time_part.split(":") - hour = int(hour_str) - minute = int(minute_str) + if time_part: + hour_str, minute_str = time_part.split(":") + hour = int(hour_str) + minute = int(minute_str) + return datetime(year, month, day, hour, minute) - return datetime(year, month, day, hour, minute) + return datetime(year, month, day) except Exception: if not fail_silently: log.warn(f"Date parsing failed for {date_str!r}")