From 458cc1c6d62af007c1eb71ccad56652930980f88 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Tue, 15 Oct 2019 15:34:59 +0200
Subject: [PATCH 1/2] Add support for TGI website

---
 PFERD/__init__.py |  2 ++
 PFERD/tgi.py      | 75 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 PFERD/tgi.py

diff --git a/PFERD/__init__.py b/PFERD/__init__.py
index cd7ae16..5ed1ad9 100644
--- a/PFERD/__init__.py
+++ b/PFERD/__init__.py
@@ -3,6 +3,7 @@ import logging
 from .ffm import *
 from .ilias import *
 from .norbert import *
+from .tgi import *
 from .ti import *
 from .utils import *
 
@@ -11,6 +12,7 @@ __all__ = ["STYLE", "FORMAT", "DATE_FORMAT", "FORMATTER", "enable_logging"]
 __all__ += ffm.__all__
 __all__ += ilias.__all__
 __all__ += norbert.__all__
+__all__ += tgi.__all__
 __all__ += ti.__all__
 __all__ += utils.__all__
 
diff --git a/PFERD/tgi.py b/PFERD/tgi.py
new file mode 100644
index 0000000..e6c9e09
--- /dev/null
+++ b/PFERD/tgi.py
@@ -0,0 +1,75 @@
+# TGI Lecture slides
+
+import logging
+import pathlib
+import re
+import zipfile
+
+import bs4
+import requests
+
+from .organizer import Organizer
+from .utils import rename, stream_to_path, PrettyLogger
+
+__all__ = ["TGI"]
+logger = logging.getLogger(__name__)
+pretty = PrettyLogger(logger)
+
+class TGI:
+    CRAWL_URL = "https://i11www.iti.kit.edu/teaching/winter2019/tgi/index"
+    BASE_URL = "https://i11www.iti.kit.edu"
+    LINK_RE = re.compile(r"^/_media/teaching/.*?/(tgi-\d+-\d+-)([^/]*\.pdf)$")
+
+    def __init__(self, base_path):
+        self.base_path = base_path
+
+        self._session = requests.Session()
+
+    def synchronize(self, to_dir, transform=lambda x: x):
+        pretty.starting_synchronizer(to_dir, "TGI")
+
+        sync_path = pathlib.Path(self.base_path, to_dir)
+        orga = Organizer(self.base_path, sync_path)
+
+        orga.clean_temp_dir()
+
+        files = self._crawl()
+        self._download(orga, files, transform)
+
+        orga.clean_sync_dir()
+        orga.clean_temp_dir()
+
+    def _crawl(self):
+        url = self.CRAWL_URL
+        r = self._session.get(url)
+
+        text = r.text
+        soup = bs4.BeautifulSoup(text, "html.parser")
+
+        files = []
+
+        for found in soup.find_all("a", href=self.LINK_RE):
+            url = found["href"]
+            full_url = self.BASE_URL + url
+
+            filename = re.search(self.LINK_RE, url).group(2)
+            path = pathlib.PurePath(filename)
+
+            logger.debug(f"Found file {filename} at {full_url}")
+            files.append((path, full_url))
+
+        return files
+
+    def _download(self, orga, files, transform):
+        for path, url in sorted(files):
+            logger.debug(f"Downloading {path}")
+
+            new_path = transform(path)
+            if new_path is not None:
+                temp_file = orga.temp_file()
+                self._download_file(url, temp_file)
+                orga.add_file(temp_file, new_path)
+
+    def _download_file(self, url, to_path):
+        with self._session.get(url, stream=True) as r:
+            stream_to_path(r, to_path)

From 1973c931bdccc27c50a8e4174bbfbc273dd44e74 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Tue, 15 Oct 2019 15:37:52 +0200
Subject: [PATCH 2/2] Add support for other years in TGI downloader

---
 PFERD/tgi.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/PFERD/tgi.py b/PFERD/tgi.py
index e6c9e09..9661e9c 100644
--- a/PFERD/tgi.py
+++ b/PFERD/tgi.py
@@ -16,14 +16,15 @@ logger = logging.getLogger(__name__)
 pretty = PrettyLogger(logger)
 
 class TGI:
-    CRAWL_URL = "https://i11www.iti.kit.edu/teaching/winter2019/tgi/index"
+    CRAWL_URL = "https://i11www.iti.kit.edu/teaching/{year}/tgi/index"
     BASE_URL = "https://i11www.iti.kit.edu"
     LINK_RE = re.compile(r"^/_media/teaching/.*?/(tgi-\d+-\d+-)([^/]*\.pdf)$")
 
-    def __init__(self, base_path):
+    def __init__(self, base_path, year="winter2019"):
         self.base_path = base_path
 
         self._session = requests.Session()
+        self.year = year
 
     def synchronize(self, to_dir, transform=lambda x: x):
         pretty.starting_synchronizer(to_dir, "TGI")
@@ -40,7 +41,7 @@ class TGI:
         orga.clean_temp_dir()
 
     def _crawl(self):
-        url = self.CRAWL_URL
+        url = self.CRAWL_URL.replace("{year}", self.year)
         r = self._session.get(url)
 
         text = r.text