Switch from tabs to spaces

Joscha 2019-04-24 12:34:20 +00:00
parent c3e64da570
commit 5a1bf2188b
7 changed files with 504 additions and 504 deletions
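The change is purely mechanical: every leading tab becomes spaces, which is why the diff adds and removes exactly the same number of lines (504 each). The commit itself does not record how the conversion was performed; below is a minimal sketch of one way to script it, assuming four spaces per tab and that only leading tabs should be rewritten.

import pathlib

TAB_WIDTH = 4  # assumption: one leading tab becomes four spaces

for path in pathlib.Path(".").rglob("*.py"):
    converted = []
    for line in path.read_text(encoding="utf-8").splitlines(keepends=True):
        body = line.lstrip("\t")                      # strip leading tabs only
        tabs = len(line) - len(body)                  # how many tabs were stripped
        converted.append(" " * (TAB_WIDTH * tabs) + body)
    path.write_text("".join(converted), encoding="utf-8")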

View File

@@ -4,11 +4,11 @@ from .norbert import *
from .utils import *

__all__ = (
    ffm.__all__ +
    ilias.__all__ +
    norbert.__all__ +
    utils.__all__ +
    []
)

LOG_FORMAT = "[%(levelname)s] %(message)s"

View File

@@ -11,69 +11,69 @@ from .organizer import Organizer
from . import utils

__all__ = [
    "FfM",
]

logger = logging.getLogger(__name__)


class FfM:
    BASE_URL = "http://www.math.kit.edu/"
    LINK_RE = re.compile(r"^https?://www.math.kit.edu/.*/(.*\.pdf)$")

    RETRY_ATTEMPTS = 5
    RETRY_DELAY = 1 # seconds

    def __init__(self, base_path):
        self.base_path = base_path

        self._session = aiohttp.ClientSession()

    async def synchronize(self, urlpart, to_dir, transform=lambda x: x):
        logging.info(f" Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.")

        sync_path = pathlib.Path(self.base_path, to_dir)
        orga = Organizer(self.base_path, sync_path)

        orga.clean_temp_dir()

        await self._crawl(orga, urlpart, transform)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    async def close(self):
        await self._session.close()

    async def _crawl(self, orga, urlpart, transform):
        url = self.BASE_URL + urlpart
        async with self._session.get(url) as resp:
            text = await resp.text()
        soup = bs4.BeautifulSoup(text, "html.parser")

        for found in soup.find_all("a", href=self.LINK_RE):
            url = found["href"]
            filename = re.match(self.LINK_RE, url).group(1).replace("/", ".")
            logger.debug(f"Found file {filename} at {url}")

            old_path = pathlib.PurePath(filename)
            new_path = transform(old_path)
            if new_path is None:
                continue
            logger.debug(f"Transformed from {old_path} to {new_path}")

            temp_path = orga.temp_file()
            await self._download(url, temp_path)
            orga.add_file(temp_path, new_path)

    async def _download(self, url, to_path):
        for t in range(self.RETRY_ATTEMPTS):
            try:
                async with self._session.get(url) as resp:
                    await utils.stream_to_path(resp, to_path)
            except aiohttp.client_exceptions.ServerDisconnectedError:
                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                await asyncio.sleep(self.RETRY_DELAY)
            else:
                return
        else:
            logger.error(f"Could not download {url}")
            raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
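The retry loop in _download (the same shape recurs in the other synchronizers and in the authenticator below) leans on Python's for/else: the try's else returns on success, and the for's else only runs once every attempt has been used up. A stripped-down, self-contained sketch of that idiom follows; the fetch callable and GiveUpError name are invented for illustration and are not part of this commit.

import time

class GiveUpError(Exception):
    pass

def fetch_with_retries(fetch, attempts=5, delay=1):
    for attempt in range(attempts):
        try:
            return fetch()  # success: leave the loop (and the function) immediately
        except ConnectionError:
            print(f"Try {attempt + 1} out of {attempts} failed, retrying in {delay} s")
            time.sleep(delay)
    else:
        # Runs only when the loop finished without returning, i.e. every attempt failed.
        raise GiveUpError(f"Try {attempts} out of {attempts} failed.")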

View File

@@ -12,105 +12,105 @@ from .ilias_authenticators import ShibbolethAuthenticator
from . import utils

__all__ = [
    "ILIAS",
]

logger = logging.getLogger(__name__)


class ILIAS:
    FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)")
    DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)")

    def __init__(self, base_path, cookie_file):
        self.base_path = base_path

        self._auth = ShibbolethAuthenticator(base_path / cookie_file)

    async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
        logging.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.")

        sync_path = pathlib.Path(self.base_path, to_dir)
        orga = Organizer(self.base_path, sync_path)

        orga.clean_temp_dir()

        files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter)
        await self._download(orga, files, transform)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    async def close(self):
        await self._auth.close()

    async def _crawl(self, dir_path, dir_id, filter_):
        soup = await self._auth.get_webpage(dir_id)

        found_files = []

        files = self._find_files(soup)
        for (name, file_id) in files:
            path = dir_path / name
            found_files.append((path, file_id))
            logger.debug(f"Found file {path}")

        dirs = self._find_dirs(soup)
        for (name, ref_id) in dirs:
            path = dir_path / name
            logger.debug(f"Found dir {path}")
            if filter_(path):
                logger.info(f"Searching {path}")
                files = await self._crawl(path, ref_id, filter_)
                found_files.extend(files)
            else:
                logger.info(f"Not searching {path}")

        return found_files

    async def _download(self, orga, files, transform):
        for (path, file_id) in sorted(files):
            to_path = transform(path)
            if to_path is not None:
                temp_path = orga.temp_file()
                await self._auth.download_file(file_id, temp_path)
                orga.add_file(temp_path, to_path)

    def _find_files(self, soup):
        files = []
        file_names = set()

        found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.FILE_RE})
        for element in found:
            file_stem = element.string.strip().replace("/", ".")
            file_type = element.parent.parent.parent.find("div", {"class": "il_ItemProperties"}).find("span").string.strip()
            file_id = re.search(self.FILE_RE, element.get("href")).group(1)

            file_name = f"{file_stem}.{file_type}"
            if file_name in file_names:
                counter = 1
                while True:
                    file_name = f"{file_stem} (duplicate {counter}).{file_type}"
                    if file_name in file_names:
                        counter += 1
                    else:
                        break

            files.append((file_name, file_id))
            file_names.add(file_name)

        return files

    def _find_dirs(self, soup):
        dirs = []

        found = soup.find_all("div", {"class": "alert", "role": "alert"})
        if found:
            return []

        found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.DIR_RE})
        for element in found:
            dir_name = element.string.strip().replace("/", ".")
            ref_id = re.search(self.DIR_RE, element.get("href")).group(1)
            dir_id = f"fold_{ref_id}"
            dirs.append((dir_name, dir_id))

        return dirs

View File

@@ -19,211 +19,211 @@ from .read_write_lock import ReadWriteLock
from . import utils

__all__ = [
    "ShibbolethAuthenticator",
]

logger = logging.getLogger(__name__)


class ShibbolethAuthenticator:
    ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php"

    RETRY_ATTEMPTS = 5
    RETRY_DELAY = 1 # seconds
    CHUNK_SIZE = 1024**2

    ALLOWED_CONTENT_TYPES = [
        "application/pdf",
        "application/zip",
        "text/xml",
        "text/plain",
        "image/jpeg",
        "image/png",
    ]

    def __init__(self, cookie_path=None):
        self._cookie_path = cookie_path

        # Authentication and file/page download should not happen at the same time.
        # Authenticating counts as writing, file/page downloads as reading.
        self._lock = ReadWriteLock()

        # Only one self._authenticate() should be started, even if multiple self.get_page()s
        # notice they're logged in.
        # If self._event is not None, authenticating is currently in progress.
        self._event = None

        jar = aiohttp.CookieJar()
        if self._cookie_path is not None:
            try:
                jar.load(self._cookie_path)
            except FileNotFoundError:
                pass
        self._session = aiohttp.ClientSession(cookie_jar=jar)

    async def close(self):
        await self._session.close()

    async def _post(self, url, params=None, data=None):
        for t in range(self.RETRY_ATTEMPTS):
            try:
                async with self._session.post(url, params=params, data=data) as resp:
                    text = await resp.text()
                    return resp.url, text
            except aiohttp.client_exceptions.ServerDisconnectedError:
                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                await asyncio.sleep(self.RETRY_DELAY)

        logger.error(f"Could not POST {url} params:{params} data:{data}.")
        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")

    async def _get(self, url, params=None):
        for t in range(self.RETRY_ATTEMPTS):
            try:
                async with self._session.get(url, params=params) as resp:
                    text = await resp.text()
                    return resp.url, text
            except aiohttp.client_exceptions.ServerDisconnectedError:
                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                await asyncio.sleep(self.RETRY_DELAY)

        logger.error(f"Could not GET {url} params:{params}.")
        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")

    def _login_successful(self, soup):
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        relay_state = soup.find("input", {"name": "RelayState"})
        return saml_response is not None and relay_state is not None

    def _save_cookies(self):
        logger.info(f"Saving cookies to {self._cookie_path}")
        if self._cookie_path is not None:
            self._session.cookie_jar.save(self._cookie_path)

    # WARNING: Only use self._ensure_authenticated() to authenticate,
    # don't call self._authenticate() itself.
    async def _authenticate(self):
        async with self._lock.write():
            # Equivalent: Click on "Mit KIT-Account anmelden" button in
            # https://ilias.studium.kit.edu/login.php
            url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
            data = {
                "sendLogin": "1",
                "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
                "target": "/shib_login.php",
                "home_organization_selection": "Mit KIT-Account anmelden",
            }
            logger.debug("Begin authentication process with ILIAS")
            url, text = await self._post(url, data=data)
            soup = bs4.BeautifulSoup(text, "html.parser")

            # Attempt to login using credentials, if necessary
            while not self._login_successful(soup):
                form = soup.find("form", {"class": "form2", "method": "post"})
                action = form["action"]

                print("Please enter Shibboleth credentials.")
                username = getpass.getpass(prompt="Username: ")
                password = getpass.getpass(prompt="Password: ")

                # Equivalent: Enter credentials in
                # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
                url = "https://idp.scc.kit.edu" + action
                data = {
                    "_eventId_proceed": "",
                    "j_username": username,
                    "j_password": password,
                }
                logger.debug("Attempt to log in to Shibboleth using credentials")
                url, text = await self._post(url, data=data)
                soup = bs4.BeautifulSoup(text, "html.parser")

                if not self._login_successful(soup):
                    print("Incorrect credentials.")

            # Saving progress: Successfully authenticated with Shibboleth
            self._save_cookies()

            relay_state = soup.find("input", {"name": "RelayState"})["value"]
            saml_response = soup.find("input", {"name": "SAMLResponse"})["value"]

            # Equivalent: Being redirected via JS automatically
            # (or clicking "Continue" if you have JS disabled)
            url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
            data = {
                "RelayState": relay_state,
                "SAMLResponse": saml_response,
            }
            logger.debug("Redirect back to ILIAS with login information")
            url, text = await self._post(url, data=data)

            # Saving progress: Successfully authenticated with Ilias
            self._save_cookies()

    async def _ensure_authenticated(self):
        if self._event is None:
            self._event = asyncio.Event()
            logger.info("Not logged in, authentication required.")
            await self._authenticate()
            self._event.set()
            self._event = None
        else:
            await self._event.wait()

    def _is_logged_in(self, soup):
        userlog = soup.find("li", {"id": "userlog"})
        return userlog is not None

    async def get_webpage_refid(self, ref_id):
        return await self.get_webpage(f"fold_{ref_id}")

    async def get_webpage(self, object_id):
        params = {"target": object_id}

        while True:
            async with self._lock.read():
                logger.debug(f"Getting {self.ILIAS_GOTO} {params}")
                _, text = await self._get(self.ILIAS_GOTO, params=params)
                soup = bs4.BeautifulSoup(text, "html.parser")

            if self._is_logged_in(soup):
                return soup
            else:
                await self._ensure_authenticated()

    async def _download(self, url, params, to_path):
        for t in range(self.RETRY_ATTEMPTS):
            try:
                async with self._session.get(url, params=params) as resp:
                    if resp.content_type in self.ALLOWED_CONTENT_TYPES:
                        # Yay, we got the file (as long as it's a PDF)
                        await utils.stream_to_path(resp, to_path)
                        return True
                    elif resp.content_type == "text/html":
                        # Dangit, we're probably not logged in.
                        text = await resp.text()
                        soup = bs4.BeautifulSoup(text, "html.parser")
                        if self._is_logged_in(soup):
                            raise utils.UnknownFileTypeException(f"Attempting to download a web page (use get_webpage() instead).")
                        return False
                    else:
                        # What *did* we get?
                        raise utils.UnknownFileTypeException(f"Unknown file of type {resp.content_type}.")
            except aiohttp.client_exceptions.ServerDisconnectedError:
                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                await asyncio.sleep(self.RETRY_DELAY)

        logger.error(f"Could not download {url} params:{params}.")
        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")

    async def download_file(self, file_id, to_path):
        params = {"target": file_id}

        while True:
            async with self._lock.read():
                success = await self._download(self.ILIAS_GOTO, params, to_path)

            if success:
                return
            else:
                await self._ensure_authenticated()
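_ensure_authenticated() coordinates concurrent requests with an asyncio.Event: the first caller that notices the session is logged out performs the login, everyone else just waits for the event and then retries. A self-contained sketch of that pattern follows, without the ILIAS or ReadWriteLock specifics; the class and method names are illustrative, not part of this commit.

import asyncio

class OneAtATime:
    def __init__(self):
        self._event = None          # not None while a refresh is in progress

    async def refresh(self):
        await asyncio.sleep(0.1)    # stand-in for the real authentication round trip
        print("refreshed credentials")

    async def ensure_fresh(self):
        if self._event is None:
            self._event = asyncio.Event()
            await self.refresh()        # only the first caller does the work
            self._event.set()
            self._event = None
        else:
            await self._event.wait()    # everyone else waits for that caller

async def main():
    guard = OneAtATime()
    await asyncio.gather(*(guard.ensure_fresh() for _ in range(3)))

asyncio.run(main())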

View File

@@ -12,113 +12,113 @@ from .organizer import Organizer
from . import utils

__all__ = [
    "Norbert",
]

logger = logging.getLogger(__name__)


class Norbert:
    BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/"
    LINK_RE = re.compile(r"^progtut/.*/(.*\.zip)$")

    RETRY_ATTEMPTS = 5
    RETRY_DELAY = 1 # seconds

    def __init__(self, base_path):
        self.base_path = base_path

        self._session = aiohttp.ClientSession()

    async def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True):
        logging.info(f" Synchronizing to {to_dir} using the Norbert synchronizer.")

        sync_path = pathlib.Path(self.base_path, to_dir)
        orga = Organizer(self.base_path, sync_path)

        orga.clean_temp_dir()

        files = await self._crawl()
        await self._download(orga, files, transform, unzip)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    async def close(self):
        await self._session.close()

    async def _crawl(self):
        url = self.BASE_URL
        async with self._session.get(url) as resp:
            raw = await resp.read()
            # replace undecodeable characters with a placeholder
            text = raw.decode("utf-8", "replace")
        soup = bs4.BeautifulSoup(text, "html.parser")

        files = []

        for found in soup.find_all("a", href=self.LINK_RE):
            url = found["href"]
            full_url = self.BASE_URL + url

            filename = re.search(self.LINK_RE, url).group(1)
            path = pathlib.PurePath(filename)

            logger.debug(f"Found zip file {filename} at {full_url}")

            files.append((path, full_url))

        return files

    async def _download(self, orga, files, transform, unzip):
        for path, url in sorted(files):
            # Yes, we want the zip file contents
            if unzip(path):
                logger.debug(f"Downloading and unzipping {path}")
                zip_path = utils.rename(path, path.stem)

                # Download zip file
                temp_file = orga.temp_file()
                await self._download_zip(url, temp_file)

                # Search the zip file for files to extract
                temp_dir = orga.temp_dir()
                with zipfile.ZipFile(temp_file, "r") as zf:
                    for info in zf.infolist():
                        # Only interested in the files themselves, the directory
                        # structure is created automatically by orga.add_file()
                        if info.is_dir():
                            continue

                        file_path = zip_path / pathlib.PurePath(info.filename)
                        logger.debug(f"Found {info.filename} at path {file_path}")

                        new_path = transform(file_path)
                        if new_path is not None:
                            # Extract to temp file and add, the usual deal
                            temp_file = orga.temp_file()
                            extracted_path = zf.extract(info, temp_dir)
                            extracted_path = pathlib.Path(extracted_path)
                            orga.add_file(extracted_path, new_path)

            # No, we only want the zip file itself
            else:
                logger.debug(f"Only downloading {path}")

                new_path = transform(path)
                if new_path is not None:
                    temp_file = orga.temp_file()
                    await self._download_zip(url, temp_file)
                    orga.add_file(temp_file, new_path)

    async def _download_zip(self, url, to_path):
        for t in range(self.RETRY_ATTEMPTS):
            try:
                async with self._session.get(url) as resp:
                    await utils.stream_to_path(resp, to_path)
            except aiohttp.client_exceptions.ServerDisconnectedError:
                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                await asyncio.sleep(self.RETRY_DELAY)
            else:
                return
        else:
            logger.error(f"Could not download {url}")
            raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")

View File

@@ -6,135 +6,135 @@ import shutil
from . import utils

__all__ = [
    "Organizer",
]

logger = logging.getLogger(__name__)


class Organizer:
    def __init__(self, base_dir, sync_dir):
        """
        base_dir - the .tmp directory will be created here
        sync_dir - synced files will be moved here

        Both are expected to be concrete pathlib paths.
        """
        self._base_dir = base_dir
        self._sync_dir = sync_dir

        self._temp_dir = pathlib.Path(self._base_dir, ".tmp")
        self._temp_nr = 0

        # check if base/sync dir exist?

        self._added_files = set()

    def clean_temp_dir(self):
        if self._temp_dir.exists():
            shutil.rmtree(self._temp_dir)
        self._temp_dir.mkdir(exist_ok=True)
        logger.debug(f"Cleaned temp dir: {self._temp_dir}")

    def temp_dir(self):
        nr = self._temp_nr
        self._temp_nr += 1
        temp_dir = pathlib.Path(self._temp_dir, f"{nr:08}").resolve()
        logger.debug(f"Produced new temp dir: {temp_dir}")
        return temp_dir

    def temp_file(self):
        # generate the path to a new temp file in base_path/.tmp/
        # make sure no two paths are the same
        nr = self._temp_nr
        self._temp_nr += 1
        temp_file = pathlib.Path(self._temp_dir, f"{nr:08}.tmp").resolve()
        logger.debug(f"Produced new temp file: {temp_file}")
        return temp_file

    def add_file(self, from_path, to_path):
        if not from_path.exists():
            raise utils.FileNotFoundException(f"Could not add file at {from_path}")

        # check if sync_dir/to_path is inside sync_dir?
        to_path = pathlib.Path(self._sync_dir, to_path)

        if to_path.exists() and to_path.is_dir():
            if self._prompt_yes_no(f"Overwrite folder {to_path} with file?", default=False):
                shutil.rmtree(to_path)
            else:
                logger.warn(f"Could not add file {to_path}")
                return

        if to_path.exists():
            if filecmp.cmp(from_path, to_path, shallow=False):
                logger.info(f"Ignored {to_path}")

                # remember path for later reference
                self._added_files.add(to_path.resolve())
                logger.debug(f"Added file {to_path.resolve()}")

                # No further action needed, especially not overwriting symlinks...
                return
            else:
                logger.info(f"Different file at {to_path}")
        else:
            logger.info(f"New file at {to_path}")

        # copy the file from from_path to sync_dir/to_path
        # If the file being replaced was a symlink, the link itself is overwritten,
        # not the file the link points to.
        to_path.parent.mkdir(parents=True, exist_ok=True)
        from_path.replace(to_path)
        logger.debug(f"Moved {from_path} to {to_path}")

        # remember path for later reference, after the new file was written
        # This is necessary here because otherwise, resolve() would resolve the symlink too.
        self._added_files.add(to_path.resolve())
        logger.debug(f"Added file {to_path.resolve()}")

    def clean_sync_dir(self):
        self._clean_dir(self._sync_dir, remove_parent=False)
        logger.debug(f"Cleaned sync dir: {self._sync_dir}")

    def _clean_dir(self, path, remove_parent=True):
        for child in sorted(path.iterdir()):
            logger.debug(f"Looking at {child.resolve()}")
            if child.is_dir():
                self._clean_dir(child, remove_parent=True)
            elif child.resolve() not in self._added_files:
                if self._prompt_yes_no(f"Delete {child}?", default=False):
                    child.unlink()
                    logger.debug(f"Deleted {child}")

        if remove_parent:
            try:
                path.rmdir()
            except OSError: # directory not empty
                pass

    def _prompt_yes_no(self, question, default=None):
        if default is True:
            prompt = "[Y/n]"
        elif default is False:
            prompt = "[y/N]"
        else:
            prompt = "[y/n]"

        text = f"{question} {prompt} "
        WRONG_REPLY = "Please reply with 'yes'/'y' or 'no'/'n'."

        while True:
            response = input(text).strip().lower()
            if response in {"yes", "ye", "y"}:
                return True
            elif response in {"no", "n"}:
                return False
            elif response == "":
                if default is None:
                    print(WRONG_REPLY)
                else:
                    return default
            else:
                print(WRONG_REPLY)

# How to use:
#
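The "How to use" comment at the end of the file is cut off in this view. For orientation, a hedged sketch of how the synchronizers above drive an Organizer: paths and file contents are invented for illustration, and Organizer is assumed to be importable from the package shown in this diff.

import pathlib

base = pathlib.Path("/tmp/sync-example")                # hypothetical location
base.mkdir(parents=True, exist_ok=True)
orga = Organizer(base, base / "Course")

orga.clean_temp_dir()                                   # fresh .tmp directory under base

temp = orga.temp_file()                                 # unique path inside .tmp
temp.write_bytes(b"example content")                    # stand-in for a real download
orga.add_file(temp, pathlib.PurePath("sheet01.pdf"))    # moved to base/Course/sheet01.pdf

orga.clean_sync_dir()                                   # prompts before deleting stale files
orga.clean_temp_dir()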

View File

@@ -2,39 +2,39 @@ import os
import pathlib

__all__ = [
    "get_base_dir",
    "move",
    "rename",
    "stream_to_path",
    "OutOfTriesException",
    "UnknownFileTypeException",
    "FileNotFoundException",
]


def get_base_dir(script_file):
    return pathlib.Path(os.path.dirname(os.path.abspath(script_file)))

def move(path, from_folders, to_folders):
    l = len(from_folders)
    if path.parts[:l] == from_folders:
        return pathlib.PurePath(*to_folders, *path.parts[l:])

def rename(path, to_name):
    return pathlib.PurePath(*path.parts[:-1], to_name)

async def stream_to_path(resp, to_path, chunk_size=1024**2):
    with open(to_path, 'wb') as fd:
        while True:
            chunk = await resp.content.read(chunk_size)
            if not chunk:
                break
            fd.write(chunk)

class OutOfTriesException(Exception):
    pass

class UnknownFileTypeException(Exception):
    pass

class FileNotFoundException(Exception):
    pass
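move() and rename() are the building blocks for the transform callbacks the synchronizers accept: a transform receives a PurePath and returns the destination path, or None to skip the file. A small illustrative example follows; the folder names are invented, and move is assumed to be imported from the utilities module above.

import pathlib

def transform(path):
    # Remap "Blaetter/..." to "Uebungsblaetter/..." and skip everything else.
    moved = move(path, ("Blaetter",), ("Uebungsblaetter",))
    if moved is None:
        return None  # the synchronizers skip files whose transform returns None
    return moved

print(transform(pathlib.PurePath("Blaetter", "blatt01.pdf")))  # Uebungsblaetter/blatt01.pdf
print(transform(pathlib.PurePath("Notizen", "todo.txt")))      # None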