mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Implement date-demangling in ILIAS crawler
This commit is contained in:
parent
cf6903d109
commit
9ec0d3e16a
@ -3,7 +3,7 @@ import json
|
|||||||
import re
|
import re
|
||||||
from configparser import SectionProxy
|
from configparser import SectionProxy
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime
|
from datetime import date, datetime, timedelta
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
|
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
|
||||||
@ -424,10 +424,56 @@ class IliasPage:
|
|||||||
"""
|
"""
|
||||||
return urljoin(self._page_url, link_tag.get("href"))
|
return urljoin(self._page_url, link_tag.get("href"))
|
||||||
|
|
||||||
|
german_months = ['Jan', 'Feb', 'Mär', 'Apr', 'Mai', 'Jun', 'Jul', 'Aug', 'Sep', 'Okt', 'Nov', 'Dez']
|
||||||
|
english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||||
|
|
||||||
def demangle_date(date_str: str) -> Optional[datetime]:
|
def demangle_date(date_str: str) -> Optional[datetime]:
|
||||||
|
"""
|
||||||
|
Demangle a given date in one of the following formats:
|
||||||
|
"Gestern, HH:MM"
|
||||||
|
"Heute, HH:MM"
|
||||||
|
"Morgen, HH:MM"
|
||||||
|
"dd. mon yyyy, HH:MM
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
date_str = re.sub(r"\s+", " ", date_str)
|
||||||
|
date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I)
|
||||||
|
date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I)
|
||||||
|
date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
|
||||||
|
for german, english in zip(german_months, english_months):
|
||||||
|
date_str = date_str.replace(german, english)
|
||||||
|
# Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020"
|
||||||
|
date_str = date_str.replace(english + ".", english)
|
||||||
|
|
||||||
|
# We now have a nice english String in the format: "dd. mmm yyyy, hh:mm"
|
||||||
|
day_part, time_part = date_str.split(",")
|
||||||
|
day_str, month_str, year_str = day_part.split(" ")
|
||||||
|
|
||||||
|
day = int(day_str.strip().replace(".", ""))
|
||||||
|
month = english_months.index(month_str.strip()) + 1
|
||||||
|
year = int(year_str.strip())
|
||||||
|
|
||||||
|
hour_str, minute_str = time_part.split(":")
|
||||||
|
hour = int(hour_str)
|
||||||
|
minute = int(minute_str)
|
||||||
|
|
||||||
|
return datetime(year, month, day, hour, minute)
|
||||||
|
except Exception:
|
||||||
|
# TODO: Properly log this
|
||||||
|
print(f"Could not parse date {date_str!r}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _format_date_english(date: date) -> str:
|
||||||
|
month = english_months[date.month - 1]
|
||||||
|
return f"{date.day:02d}. {month} {date.year:04d}"
|
||||||
|
|
||||||
|
def _yesterday() -> date:
|
||||||
|
return date.today() - timedelta(days=1)
|
||||||
|
|
||||||
|
|
||||||
|
def _tomorrow() -> date:
|
||||||
|
return date.today() + timedelta(days=1)
|
||||||
|
|
||||||
|
|
||||||
def _sanitize_path_name(name: str) -> str:
|
def _sanitize_path_name(name: str) -> str:
|
||||||
return name.replace("/", "-").replace("\\", "-").strip()
|
return name.replace("/", "-").replace("\\", "-").strip()
|
||||||
|
Loading…
Reference in New Issue
Block a user