# pferd/PFERD/ti.py

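# Synchronizer for the Ti website (http://ti.ira.uka.de/).
# NOTE(review): this header used to read "Fakultät für Mathematik (FfM)",
# which looks like a leftover copied from another module - confirm.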

import getpass
import logging
import pathlib
import re
from urllib.parse import urljoin

import bs4
import requests

from .organizer import Organizer
from .utils import stream_to_path, PrettyLogger

# Only the synchronizer class is exported by `from ... import *`.
__all__ = ["Ti"]
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# PrettyLogger (from .utils) wrapped around the module logger; Ti.synchronize
# calls its `starting_synchronizer` method at the start of every run.
pretty = PrettyLogger(logger)

class Ti:
    """Synchronize PDF files linked from a course page below ``BASE_URL``.

    A run looks at the course page, detects which of the known sections
    (see ``_SECTIONS``) are linked from it, and downloads every ``*.pdf``
    link found on each section page. Downloads are authenticated with a
    username/password pair that is asked for interactively and cached only
    for the duration of one ``synchronize`` call.
    """

    # NOTE(review): plain HTTP - the credentials passed as ``auth=`` in
    # `_download` are sent unencrypted. Switch to https if the server
    # supports it.
    BASE_URL = "http://ti.ira.uka.de/"
    # Matches hrefs that end in ".pdf" and have at least one character before.
    FILE_RE = re.compile(r"^.+\.pdf$")

    # Known sections as (local folder name, name shown in the "Found ..." log
    # line, remote directory). The remote directory doubles as the name of the
    # PHP page that is linked from the course page.
    # The local name "AltKlausuren" intentionally differs from its log label:
    # it is what `filter`/`transform` callbacks receive, so it must not change.
    # NOTE(review): per the commit history, the AlteKlausuren link is not on
    # the course root page but on a "Klausur" sub-page - confirm.
    _SECTIONS = (
        ("Folien", "Folien", "Vorlesung"),
        ("Blätter", "Blätter", "Uebungen"),
        ("Tutorien", "Tutorien", "Tutorien"),
        ("AltKlausuren", "AlteKlausuren", "AlteKlausuren"),
    )

    # HTTP status codes that are answered by asking for new credentials.
    _AUTH_FAILURE_CODES = (401, 403)

    def __init__(self, base_path):
        """Create a synchronizer.

        base_path -- directory that `synchronize` target directories are
                     resolved against; also handed to the Organizer.
        """
        self.base_path = base_path

        self._session = requests.Session()
        self._credentials = None

    def synchronize(self, urlpart, to_dir, transform=lambda x: x,
            filter=lambda x: True):
        """Download all PDFs of the course at ``BASE_URL + urlpart``.

        urlpart   -- path of the course page relative to ``BASE_URL``.
        to_dir    -- target directory, relative to ``base_path``.
        transform -- called with ``PurePath(section) / href`` for every PDF
                     link; returns the path to store the file under, or
                     ``None`` to skip the file.
        filter    -- called with ``PurePath(section)``; a falsy result skips
                     the whole section.

        Raises whatever `_download` raises for non-authentication HTTP
        errors. Cached credentials are cleared when the call ends, whether
        it succeeded or not.
        """
        pretty.starting_synchronizer(to_dir, "Ti", urlpart)

        sync_path = pathlib.Path(self.base_path, to_dir)

        orga = Organizer(self.base_path, sync_path)
        orga.clean_temp_dir()

        # Never reuse credentials from an earlier run.
        self._reset_credentials()
        try:
            available = self._find_available(urlpart)

            for name, address in sorted(available.items()):
                path = pathlib.PurePath(name)
                if filter(path):
                    self._crawl(urlpart + address, path, orga, transform)
                else:
                    logger.info(f"Skipping {name}/")

            # Only reached when every section was crawled without an error.
            orga.clean_sync_dir()
            orga.clean_temp_dir()
        finally:
            # Do not keep the password in memory longer than necessary, even
            # when the crawl was aborted by an exception.
            self._reset_credentials()

    def _find_available(self, urlpart):
        """Return ``{local section name: remote sub-path}`` for the course.

        A section counts as available when the course page contains an
        element whose ``href`` is exactly ``./<dir>/<dir>.php``.
        """
        url = self.BASE_URL + urlpart
        r = self._session.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        available = {}

        for name, label, directory in self._SECTIONS:
            if soup.find(href=f"./{directory}/{directory}.php"):
                logger.info(f"Found {label}/")
                available[name] = f"/{directory}/"

        return available

    def _crawl(self, urlpart, path, orga, transform):
        """Download every PDF linked from the page at ``BASE_URL + urlpart``.

        path      -- PurePath prefix (the section name) for the local files.
        orga      -- Organizer that supplies temp files and receives results.
        transform -- see `synchronize`.
        """
        url = self.BASE_URL + urlpart
        r = self._session.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        for filelink in soup.find_all("a", href=self.FILE_RE):
            href = filelink["href"]
            filepath = path / href
            # hrefs may be relative to the section page, so resolve them
            # against the page URL instead of concatenating strings.
            fileurl = urljoin(url, href)

            new_path = transform(filepath)
            if new_path is None:
                continue
            logger.debug(f"Transformed from {filepath} to {new_path}")

            temp_path = orga.temp_file()
            self._download(fileurl, temp_path)
            orga.add_file(temp_path, new_path)

    def _get_credentials(self):
        """Return the cached ``(username, password)``, prompting if unset."""
        if self._credentials is None:
            print("Please enter Ti credentials.")
            # NOTE(review): getpass also hides the username while typing;
            # confirm whether that is intended or `input` was meant.
            username = getpass.getpass(prompt="Username: ")
            password = getpass.getpass(prompt="Password: ")
            self._credentials = (username, password)
        return self._credentials

    def _reset_credentials(self):
        """Forget the cached credentials so the next use prompts again."""
        self._credentials = None

    def _download(self, url, to_path):
        """Stream ``url`` to ``to_path``, re-prompting on rejected logins.

        Credentials are requested again only when the server answers with an
        authentication failure (401/403). Any other unsuccessful response
        (404, 5xx, ...) is raised via ``raise_for_status()``: asking for a new
        password cannot fix it and would otherwise loop forever.
        """
        while True:
            username, password = self._get_credentials()
            with self._session.get(url, stream=True, auth=(username, password)) as r:
                if r.ok:
                    stream_to_path(r, to_path)
                    return
                if r.status_code not in self._AUTH_FAILURE_CODES:
                    # Always raises here, because the response is not ok.
                    r.raise_for_status()
                print("Incorrect credentials.")
                self._reset_credentials()
Add basic Ti downloader 2019-05-06 13:54:36 +02:00			`# Fakultät für Mathematik (FfM)`

			`import getpass`
			`import logging`
			`import pathlib`
			`import re`
Fix relative url joining in ti downloader 2019-07-26 10:06:01 +02:00			`from urllib.parse import urljoin`
Add basic Ti downloader 2019-05-06 13:54:36 +02:00
			`import bs4`
			`import requests`

			`from .organizer import Organizer`
Add colorful log output Highlight the important operations (new, modified) in different colours. 2019-06-07 13:26:23 +02:00			`from .utils import stream_to_path, PrettyLogger`
Add basic Ti downloader 2019-05-06 13:54:36 +02:00
			`__all__ = ["Ti"]`
			`logger = logging.getLogger(__name__)`
Add colorful log output Highlight the important operations (new, modified) in different colours. 2019-06-07 13:26:23 +02:00			`pretty = PrettyLogger(logger)`
Add basic Ti downloader 2019-05-06 13:54:36 +02:00
			`class Ti:`
			`BASE_URL = "http://ti.ira.uka.de/"`
			`FILE_RE = re.compile(r"^.+\.pdf$")`

			`def __init__(self, base_path):`
			`self.base_path = base_path`

			`self._session = requests.Session()`
			`self._credentials = None`

			`def synchronize(self, urlpart, to_dir, transform=lambda x: x,`
			`filter=lambda x: True):`
Add colorful log output Highlight the important operations (new, modified) in different colours. 2019-06-07 13:26:23 +02:00			`pretty.starting_synchronizer(to_dir, "Ti", urlpart)`
Add basic Ti downloader 2019-05-06 13:54:36 +02:00
			`sync_path = pathlib.Path(self.base_path, to_dir)`

			`orga = Organizer(self.base_path, sync_path)`
			`orga.clean_temp_dir()`

			`self._reset_credentials()`

			`available = self._find_available(urlpart)`

Crawl more of the TI page 2019-05-09 13:04:24 +02:00			`for name, address in sorted(available.items()):`
			`path = pathlib.PurePath(name)`
Add basic Ti downloader 2019-05-06 13:54:36 +02:00			`if filter(path):`
Crawl more of the TI page 2019-05-09 13:04:24 +02:00			`self._crawl(urlpart + address, path, orga, transform)`
Add basic Ti downloader 2019-05-06 13:54:36 +02:00			`else:`
Fix relative url joining in ti downloader 2019-07-26 10:06:01 +02:00			`logger.info(f"Skipping {name}/")`
Add basic Ti downloader 2019-05-06 13:54:36 +02:00
			`orga.clean_sync_dir()`
			`orga.clean_temp_dir()`

			`self._reset_credentials()`

			`def _find_available(self, urlpart):`
			`url = self.BASE_URL + urlpart`
			`r = self._session.get(url)`
			`soup = bs4.BeautifulSoup(r.text, "html.parser")`

			`available = {}`

			`if soup.find(href="./Vorlesung/Vorlesung.php"):`
			`logger.info("Found Folien/")`
			`available["Folien"] = "/Vorlesung/"`
			`if soup.find(href="./Uebungen/Uebungen.php"):`
			`logger.info("Found Blätter/")`
			`available["Blätter"] = "/Uebungen/"`
Crawl more of the TI page 2019-05-09 13:04:24 +02:00			`if soup.find(href="./Tutorien/Tutorien.php"):`
			`logger.info("Found Tutorien/")`
			`available["Tutorien"] = "/Tutorien/"`
Hack in support for TI exams This just adds an additional crawl check for AlteKlausuren. This is not present on the root site but at the suffix `/Klausuren`. Example config: ```py # The "Klausur" needs to be copied verbatim! ti.synchronize("Klausur", "sync dir name", transform=ro_19_klausur_transform, filter=ro_19_klausur_filter) ``` 2020-02-24 20:56:43 +01:00			`if soup.find(href="./AlteKlausuren/AlteKlausuren.php"):`
			`logger.info("Found AlteKlausuren/")`
			`available["AltKlausuren"] = "/AlteKlausuren/"`
Add basic Ti downloader 2019-05-06 13:54:36 +02:00
			`return available`

			`def _crawl(self, urlpart, path, orga, transform):`
			`url = self.BASE_URL + urlpart`
			`r = self._session.get(url)`
			`soup = bs4.BeautifulSoup(r.text, "html.parser")`

			`for filelink in soup.find_all("a", href=self.FILE_RE):`
			`filepath = path / filelink["href"]`
Fix relative url joining in ti downloader 2019-07-26 10:06:01 +02:00			`fileurl = urljoin(url, filelink["href"])`
Add basic Ti downloader 2019-05-06 13:54:36 +02:00
			`new_path = transform(filepath)`
			`if new_path is None:`
			`continue`
			`logger.debug(f"Transformed from {filepath} to {new_path}")`

			`temp_path = orga.temp_file()`
			`self._download(fileurl, temp_path)`
			`orga.add_file(temp_path, new_path)`


			`def _get_credentials(self):`
			`if self._credentials is None:`
			`print("Please enter Ti credentials.")`
			`username = getpass.getpass(prompt="Username: ")`
			`password = getpass.getpass(prompt="Password: ")`
			`self._credentials = (username, password)`
			`return self._credentials`

			`def _reset_credentials(self):`
			`self._credentials = None`

			`def _download(self, url, to_path):`
Make Ti downloader authentication more robust 2019-05-06 14:04:01 +02:00			`while True:`
			`username, password = self._get_credentials()`
			`with self._session.get(url, stream=True, auth=(username, password)) as r:`
			`if r.ok:`
			`stream_to_path(r, to_path)`
			`return`
			`else:`
			`print("Incorrect credentials.")`
			`self._reset_credentials()`