Compare commits

...

18 Commits

Author SHA1 Message Date
52852d11a6 Bump version to 1.1.8 2019-09-22 11:56:41 +00:00
f94629a7fa Fix exceptions with weird content types
(hopefully)
2019-09-22 11:55:47 +00:00
c8ee456d33 Bump version to 1.1.7 2019-07-26 08:14:55 +00:00
2752e98621 Fix relative url joining in ti downloader 2019-07-26 10:06:01 +02:00
1572e11da8 Bump version to 1.1.6 2019-07-05 08:49:26 +00:00
ea01dc7cb2 Allow even more types of files 2019-07-05 08:48:43 +00:00
aba8d46d26 Bump version to 1.1.5 2019-07-04 12:17:33 +00:00
77056e6f8d Allow more types of files 2019-07-04 12:16:42 +00:00
064f12c14c Ignore mypy files 2019-07-04 12:16:26 +00:00
2eb834afc3 Bump version to 1.1.4 2019-06-11 12:46:40 +00:00
d468a45662 Allow wolfram files 2019-06-11 12:42:55 +00:00
50e25346e5 Bump version to 1.1.3 2019-06-07 11:36:41 +00:00
67da4e69fa Add colorful log output
Highlight the important operations (new, modified) in different colours.
2019-06-07 13:28:55 +02:00
da602366f8 Bump version to 1.1.2 2019-05-17 07:43:32 +00:00
2016f61bf8 Crawl more of the TI page 2019-05-09 11:04:24 +00:00
59c278da2c Bump version to 1.1.1 2019-05-06 12:07:12 +00:00
c72e92db18 Make Ti downloader authentication more robust 2019-05-06 12:04:01 +00:00
44b4204517 Add basic Ti downloader 2019-05-06 11:54:36 +00:00
11 changed files with 167 additions and 31 deletions

1
.gitignore vendored
View File

@ -10,3 +10,4 @@ pyvenv.cfg
.tmp/ .tmp/
pip-selfcheck.json pip-selfcheck.json
.mypy_cache/

View File

@ -3,6 +3,7 @@ import logging
from .ffm import * from .ffm import *
from .ilias import * from .ilias import *
from .norbert import * from .norbert import *
from .ti import *
from .utils import * from .utils import *
__all__ = ["STYLE", "FORMAT", "DATE_FORMAT", "FORMATTER", "enable_logging"] __all__ = ["STYLE", "FORMAT", "DATE_FORMAT", "FORMATTER", "enable_logging"]
@ -10,6 +11,7 @@ __all__ = ["STYLE", "FORMAT", "DATE_FORMAT", "FORMATTER", "enable_logging"]
__all__ += ffm.__all__ __all__ += ffm.__all__
__all__ += ilias.__all__ __all__ += ilias.__all__
__all__ += norbert.__all__ __all__ += norbert.__all__
__all__ += ti.__all__
__all__ += utils.__all__ __all__ += utils.__all__
STYLE = "{" STYLE = "{"

View File

@ -8,10 +8,11 @@ import bs4
import requests import requests
from .organizer import Organizer from .organizer import Organizer
from .utils import stream_to_path from .utils import stream_to_path, PrettyLogger
__all__ = ["FfM"] __all__ = ["FfM"]
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
pretty = PrettyLogger(logger)
class FfM: class FfM:
BASE_URL = "http://www.math.kit.edu/" BASE_URL = "http://www.math.kit.edu/"
@ -23,7 +24,7 @@ class FfM:
self._session = requests.Session() self._session = requests.Session()
def synchronize(self, urlpart, to_dir, transform=lambda x: x): def synchronize(self, urlpart, to_dir, transform=lambda x: x):
logger.info(f" Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.") pretty.starting_synchronizer(to_dir, "FfM", urlpart)
sync_path = pathlib.Path(self.base_path, to_dir) sync_path = pathlib.Path(self.base_path, to_dir)

View File

@ -4,13 +4,13 @@ import logging
import pathlib import pathlib
import re import re
import bs4
from .ilias_authenticators import ShibbolethAuthenticator from .ilias_authenticators import ShibbolethAuthenticator
from .organizer import Organizer from .organizer import Organizer
from .utils import PrettyLogger
__all__ = ["Ilias"] __all__ = ["Ilias"]
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
pretty = PrettyLogger(logger)
class Ilias: class Ilias:
FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)") FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)")
@ -22,7 +22,7 @@ class Ilias:
self._auth = ShibbolethAuthenticator(base_path / cookie_file) self._auth = ShibbolethAuthenticator(base_path / cookie_file)
def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True): def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
logger.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the Ilias synchronizer.") pretty.starting_synchronizer(to_dir, "ILIAS", f"ref_id {ref_id}")
sync_path = pathlib.Path(self.base_path, to_dir) sync_path = pathlib.Path(self.base_path, to_dir)
orga = Organizer(self.base_path, sync_path) orga = Organizer(self.base_path, sync_path)

View File

@ -23,16 +23,6 @@ logger = logging.getLogger(__name__)
class ShibbolethAuthenticator: class ShibbolethAuthenticator:
ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php" ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php"
ALLOWED_CONTENT_TYPES = [
"application/pdf",
"application/zip",
"application/msword",
"text/xml",
"text/plain",
"image/jpeg",
"image/png",
]
def __init__(self, cookie_file) -> None: def __init__(self, cookie_file) -> None:
# Because LWPCookieJar insists on the path being str-like instead of # Because LWPCookieJar insists on the path being str-like instead of
# Path-like. # Path-like.
@ -147,11 +137,7 @@ class ShibbolethAuthenticator:
with self._session.get(url, params=params, stream=True) as r: with self._session.get(url, params=params, stream=True) as r:
content_type = r.headers["content-type"] content_type = r.headers["content-type"]
if content_type in self.ALLOWED_CONTENT_TYPES: if content_type.startswith("text/html"):
# Yay, we got the file :)
stream_to_path(r, to_path)
return True
elif content_type == "text/html":
# Dangit, we're probably not logged in. # Dangit, we're probably not logged in.
soup = bs4.BeautifulSoup(r.text, "html.parser") soup = bs4.BeautifulSoup(r.text, "html.parser")
if self._is_logged_in(soup): if self._is_logged_in(soup):
@ -159,9 +145,9 @@ class ShibbolethAuthenticator:
"Attempting to download a web page, not a file") "Attempting to download a web page, not a file")
return False return False
else: else:
# What *did* we get? # Yay, we got the file :)
raise ContentTypeException( stream_to_path(r, to_path)
f"Unknown file of type {content_type}") return True
def download_file(self, file_id, to_path): def download_file(self, file_id, to_path):
params = {"target": file_id} params = {"target": file_id}

View File

@ -9,10 +9,11 @@ import bs4
import requests import requests
from .organizer import Organizer from .organizer import Organizer
from .utils import rename, stream_to_path from .utils import rename, stream_to_path, PrettyLogger
__all__ = ["Norbert"] __all__ = ["Norbert"]
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
pretty = PrettyLogger(logger)
class Norbert: class Norbert:
BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/" BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/"
@ -24,7 +25,7 @@ class Norbert:
self._session = requests.Session() self._session = requests.Session()
def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True): def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True):
logger.info(f" Synchronizing to {to_dir} using the Norbert synchronizer.") pretty.starting_synchronizer(to_dir, "Norbert")
sync_path = pathlib.Path(self.base_path, to_dir) sync_path = pathlib.Path(self.base_path, to_dir)
orga = Organizer(self.base_path, sync_path) orga = Organizer(self.base_path, sync_path)

View File

@ -7,6 +7,7 @@ from . import utils
__all__ = ["Organizer"] __all__ = ["Organizer"]
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
pretty = utils.PrettyLogger(logger)
class Organizer: class Organizer:
def __init__(self, base_dir, sync_dir): def __init__(self, base_dir, sync_dir):
@ -64,7 +65,7 @@ class Organizer:
if to_path.exists(): if to_path.exists():
if filecmp.cmp(from_path, to_path, shallow=False): if filecmp.cmp(from_path, to_path, shallow=False):
logger.info(f"Ignored {to_path}") pretty.ignored_file(to_path)
# remember path for later reference # remember path for later reference
self._added_files.add(to_path.resolve()) self._added_files.add(to_path.resolve())
@ -73,9 +74,9 @@ class Organizer:
# No further action needed, especially not overwriting symlinks... # No further action needed, especially not overwriting symlinks...
return return
else: else:
logger.info(f"Different file at {to_path}") pretty.modified_file(to_path)
else: else:
logger.info(f"New file at {to_path}") pretty.new_file(to_path)
# copy the file from from_path to sync_dir/to_path # copy the file from from_path to sync_dir/to_path
# If the file being replaced was a symlink, the link itself is overwritten, # If the file being replaced was a symlink, the link itself is overwritten,

112
PFERD/ti.py Normal file
View File

@ -0,0 +1,112 @@
# Theoretische Informatik (Ti)
import getpass
import logging
import pathlib
import re
from urllib.parse import urljoin
import bs4
import requests
from .organizer import Organizer
from .utils import stream_to_path, PrettyLogger
__all__ = ["Ti"]
logger = logging.getLogger(__name__)
pretty = PrettyLogger(logger)
class Ti:
    """Synchronizer for Theoretische Informatik (Ti) course material.

    Looks for the known sub-pages of a course page (lecture slides,
    exercise sheets, tutorials), crawls each one for links to PDF files
    and downloads them via an Organizer, authenticating with HTTP basic
    auth credentials asked from the user on demand.
    """

    BASE_URL = "http://ti.ira.uka.de/"
    # Any link whose href ends in ".pdf" is considered a downloadable file.
    FILE_RE = re.compile(r"^.+\.pdf$")

    def __init__(self, base_path):
        # base_path: directory under which the sync directories live.
        self.base_path = base_path
        self._session = requests.Session()
        # Lazily-filled (username, password) tuple; None means "not asked yet".
        self._credentials = None

    def synchronize(self, urlpart, to_dir, transform=lambda x: x,
                    filter=lambda x: True):
        """Synchronize the course page at BASE_URL + urlpart into to_dir.

        transform maps a remote PurePath to a local path, or returns None
        to skip a single file. filter decides per sub-section (as a
        PurePath) whether to crawl it at all.

        NOTE: the parameter name "filter" shadows the builtin, but it is
        part of the public keyword interface and therefore kept.
        """
        pretty.starting_synchronizer(to_dir, "Ti", urlpart)

        sync_path = pathlib.Path(self.base_path, to_dir)
        orga = Organizer(self.base_path, sync_path)

        orga.clean_temp_dir()
        self._reset_credentials()

        available = self._find_available(urlpart)

        for name, address in sorted(available.items()):
            path = pathlib.PurePath(name)
            if filter(path):
                # NOTE(review): "address" begins with a slash, so this
                # produces "urlpart//Section/" — relies on the server
                # tolerating the double slash; confirm against the site.
                self._crawl(urlpart + address, path, orga, transform)
            else:
                logger.info(f"Skipping {name}/")

        orga.clean_sync_dir()
        orga.clean_temp_dir()
        self._reset_credentials()

    def _find_available(self, urlpart):
        """Return a dict mapping local section names to URL sub-paths."""
        url = self.BASE_URL + urlpart
        r = self._session.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        available = {}
        # Each known section is detected by its characteristic nav link.
        if soup.find(href="./Vorlesung/Vorlesung.php"):
            logger.info("Found Folien/")
            available["Folien"] = "/Vorlesung/"
        if soup.find(href="./Uebungen/Uebungen.php"):
            logger.info("Found Blätter/")
            available["Blätter"] = "/Uebungen/"
        if soup.find(href="./Tutorien/Tutorien.php"):
            logger.info("Found Tutorien/")
            available["Tutorien"] = "/Tutorien/"

        return available

    def _crawl(self, urlpart, path, orga, transform):
        """Download every PDF linked from the page at BASE_URL + urlpart."""
        url = self.BASE_URL + urlpart
        r = self._session.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")

        for filelink in soup.find_all("a", href=self.FILE_RE):
            filepath = path / filelink["href"]
            fileurl = urljoin(url, filelink["href"])

            new_path = transform(filepath)
            if new_path is None:
                continue
            logger.debug(f"Transformed from {filepath} to {new_path}")

            temp_path = orga.temp_file()
            self._download(fileurl, temp_path)
            orga.add_file(temp_path, new_path)

    def _get_credentials(self):
        """Ask the user for credentials once and cache them."""
        if self._credentials is None:
            print("Please enter Ti credentials.")
            # Fix: the username is not a secret — read it with input() so
            # the user can see what they type. getpass (which hides the
            # input) is only appropriate for the password.
            username = input("Username: ")
            password = getpass.getpass(prompt="Password: ")
            self._credentials = (username, password)
        return self._credentials

    def _reset_credentials(self):
        """Forget cached credentials so the next download asks again."""
        self._credentials = None

    def _download(self, url, to_path):
        """Stream url to to_path, re-prompting until basic auth succeeds."""
        while True:
            username, password = self._get_credentials()
            with self._session.get(url, stream=True, auth=(username, password)) as r:
                if r.ok:
                    stream_to_path(r, to_path)
                    return
                else:
                    print("Incorrect credentials.")
                    self._reset_credentials()

View File

@ -1,5 +1,8 @@
import os import os
import sys
import pathlib import pathlib
from colorama import Style
from colorama import Fore
__all__ = [ __all__ = [
"get_base_dir", "get_base_dir",
@ -8,6 +11,7 @@ __all__ = [
"stream_to_path", "stream_to_path",
"ContentTypeException", "ContentTypeException",
"FileNotFoundException", "FileNotFoundException",
"PrettyLogger",
] ]
def get_base_dir(script_file): def get_base_dir(script_file):
@ -26,8 +30,35 @@ def stream_to_path(response, to_path, chunk_size=1024**2):
for chunk in response.iter_content(chunk_size=chunk_size): for chunk in response.iter_content(chunk_size=chunk_size):
fd.write(chunk) fd.write(chunk)
def isOutputPipe():
    """Return True if this program's stdout is attached to a pipe.

    A pipe (or any non-interactive redirection) is the opposite of an
    interactive terminal, hence the negation of ``isatty()``.

    Fix: the original returned the bound ``sys.stdout.isatty`` method
    itself (always truthy) instead of calling it — and even when called,
    ``isatty()`` answers "is a terminal", the inverse of the documented
    "is a pipe" check.
    """
    return not sys.stdout.isatty()
class ContentTypeException(Exception): class ContentTypeException(Exception):
pass pass
class FileNotFoundException(Exception): class FileNotFoundException(Exception):
pass pass
class PrettyLogger:
    """Decorates a plain logger with colourised, human-friendly status lines."""

    def __init__(self, logger):
        self.logger = logger

    def modified_file(self, file_name):
        # Bright magenta: an existing file whose content changed.
        text = f"{Fore.MAGENTA}{Style.BRIGHT}Modified {file_name}.{Style.RESET_ALL}"
        self.logger.info(text)

    def new_file(self, file_name):
        # Bright green: a file seen for the first time.
        text = f"{Fore.GREEN}{Style.BRIGHT}Created {file_name}.{Style.RESET_ALL}"
        self.logger.info(text)

    def ignored_file(self, file_name):
        # Dim: file is unchanged, nothing was done.
        text = f"{Style.DIM}Ignored {file_name}.{Style.RESET_ALL}"
        self.logger.info(text)

    def starting_synchronizer(self, target_directory, synchronizer_name, subject=None):
        # A blank line, then a bright cyan banner announcing the run.
        prefix = f"{subject} " if subject else ""
        self.logger.info("")
        self.logger.info(
            f"{Fore.CYAN}{Style.BRIGHT}Synchronizing {prefix}to {target_directory}"
            f" using the {synchronizer_name} synchronizer.{Style.RESET_ALL}"
        )

View File

@ -10,7 +10,7 @@ test it though).
To install PFERD or update your installation to the latest version, run this To install PFERD or update your installation to the latest version, run this
wherever you want to install/have installed PFERD: wherever you want to install/have installed PFERD:
``` ```
$ pip install git+https://github.com/Garmelon/PFERD@v1.1.0 $ pip install git+https://github.com/Garmelon/PFERD@v1.1.8
``` ```
The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
@ -25,7 +25,7 @@ $ mkdir Vorlesungen
$ cd Vorlesungen $ cd Vorlesungen
$ python3 -m venv . $ python3 -m venv .
$ . bin/activate $ . bin/activate
$ pip install git+https://github.com/Garmelon/PFERD@v1.1.0 $ pip install git+https://github.com/Garmelon/PFERD@v1.1.8
$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/master/example_config.py $ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/master/example_config.py
$ python3 example_config.py $ python3 example_config.py
$ deactivate $ deactivate

View File

@ -2,11 +2,12 @@ from setuptools import setup
setup( setup(
name="PFERD", name="PFERD",
version="1.1.0", version="1.1.8",
packages=["PFERD"], packages=["PFERD"],
install_requires=[ install_requires=[
"requests>=2.21.0", "requests>=2.21.0",
"beautifulsoup4>=4.7.1", "beautifulsoup4>=4.7.1",
"colorama>=0.4.1"
], ],
) )