Mirror of https://github.com/Garmelon/PFERD.git
Synced 2023-12-21 10:23:01 +01:00
Switch from tabs to spaces
This commit is contained in:
parent c3e64da570
commit 5a1bf2188b
@@ -4,11 +4,11 @@ from .norbert import *
from .utils import *

__all__ = (
    ffm.__all__ +
    ilias.__all__ +
    norbert.__all__ +
    utils.__all__ +
    []
)

LOG_FORMAT = "[%(levelname)s] %(message)s"
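LOG_FORMAT above is only a format string; wiring it into the standard logging module is left to the calling script. A minimal sketch of how a consumer might do that, assuming this hunk is the package __init__ (it aggregates the submodule __all__ lists) and that the package is importable as PFERD; the INFO level is an arbitrary choice, not part of the diff:

import logging

from PFERD import LOG_FORMAT  # assumes the hunk above is PFERD/__init__.py

# Apply the package's log format to the root logger.
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logging.info("PFERD logging configured")  # prints: [INFO] PFERD logging configured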
96 PFERD/ffm.py
@@ -11,69 +11,69 @@ from .organizer import Organizer
from . import utils

__all__ = [
    "FfM",
]
logger = logging.getLogger(__name__)

class FfM:
    BASE_URL = "http://www.math.kit.edu/"
    LINK_RE = re.compile(r"^https?://www.math.kit.edu/.*/(.*\.pdf)$")

    RETRY_ATTEMPTS = 5
    RETRY_DELAY = 1 # seconds

    def __init__(self, base_path):
        self.base_path = base_path

        self._session = aiohttp.ClientSession()

    async def synchronize(self, urlpart, to_dir, transform=lambda x: x):
        logging.info(f" Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.")

        sync_path = pathlib.Path(self.base_path, to_dir)
        orga = Organizer(self.base_path, sync_path)

        orga.clean_temp_dir()

        await self._crawl(orga, urlpart, transform)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    async def close(self):
        await self._session.close()

    async def _crawl(self, orga, urlpart, transform):
        url = self.BASE_URL + urlpart
        async with self._session.get(url) as resp:
            text = await resp.text()
            soup = bs4.BeautifulSoup(text, "html.parser")

        for found in soup.find_all("a", href=self.LINK_RE):
            url = found["href"]
            filename = re.match(self.LINK_RE, url).group(1).replace("/", ".")
            logger.debug(f"Found file {filename} at {url}")

            old_path = pathlib.PurePath(filename)
            new_path = transform(old_path)
            if new_path is None:
                continue
            logger.debug(f"Transformed from {old_path} to {new_path}")

            temp_path = orga.temp_file()
            await self._download(url, temp_path)
            orga.add_file(temp_path, new_path)

    async def _download(self, url, to_path):
        for t in range(self.RETRY_ATTEMPTS):
            try:
                async with self._session.get(url) as resp:
                    await utils.stream_to_path(resp, to_path)
            except aiohttp.client_exceptions.ServerDisconnectedError:
                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                await asyncio.sleep(self.RETRY_DELAY)
            else:
                return
        else:
            logger.error(f"Could not download {url}")
            raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
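The FfM class exposes a small surface: construct it with a base path, await synchronize() once per course page, then close the session. A minimal usage sketch, assuming the package is importable as PFERD and that an asyncio event loop drives it; the urlpart and target directory are placeholders, not values from the diff:

import asyncio
import pathlib

from PFERD.ffm import FfM

async def main():
    ffm = FfM(pathlib.Path(__file__).parent)
    try:
        # urlpart and target directory are made-up examples.
        await ffm.synchronize("iana2/lehre/", "Mathe-Material",
                              transform=lambda path: path)
    finally:
        await ffm.close()

asyncio.run(main())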
150 PFERD/ilias.py
@@ -12,105 +12,105 @@ from .ilias_authenticators import ShibbolethAuthenticator
from . import utils

__all__ = [
    "ILIAS",
]
logger = logging.getLogger(__name__)

class ILIAS:
    FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)")
    DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)")

    def __init__(self, base_path, cookie_file):
        self.base_path = base_path

        self._auth = ShibbolethAuthenticator(base_path / cookie_file)

    async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
        logging.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.")

        sync_path = pathlib.Path(self.base_path, to_dir)
        orga = Organizer(self.base_path, sync_path)

        orga.clean_temp_dir()

        files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter)
        await self._download(orga, files, transform)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    async def close(self):
        await self._auth.close()

    async def _crawl(self, dir_path, dir_id, filter_):
        soup = await self._auth.get_webpage(dir_id)

        found_files = []

        files = self._find_files(soup)
        for (name, file_id) in files:
            path = dir_path / name
            found_files.append((path, file_id))
            logger.debug(f"Found file {path}")

        dirs = self._find_dirs(soup)
        for (name, ref_id) in dirs:
            path = dir_path / name
            logger.debug(f"Found dir {path}")
            if filter_(path):
                logger.info(f"Searching {path}")
                files = await self._crawl(path, ref_id, filter_)
                found_files.extend(files)
            else:
                logger.info(f"Not searching {path}")

        return found_files

    async def _download(self, orga, files, transform):
        for (path, file_id) in sorted(files):
            to_path = transform(path)
            if to_path is not None:
                temp_path = orga.temp_file()
                await self._auth.download_file(file_id, temp_path)
                orga.add_file(temp_path, to_path)

    def _find_files(self, soup):
        files = []
        file_names = set()

        found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.FILE_RE})
        for element in found:
            file_stem = element.string.strip().replace("/", ".")
            file_type = element.parent.parent.parent.find("div", {"class": "il_ItemProperties"}).find("span").string.strip()
            file_id = re.search(self.FILE_RE, element.get("href")).group(1)

            file_name = f"{file_stem}.{file_type}"
            if file_name in file_names:
                counter = 1
                while True:
                    file_name = f"{file_stem} (duplicate {counter}).{file_type}"
                    if file_name in file_names:
                        counter += 1
                    else:
                        break

            files.append((file_name, file_id))
            file_names.add(file_name)

        return files

    def _find_dirs(self, soup):
        dirs = []

        found = soup.find_all("div", {"class": "alert", "role": "alert"})
        if found:
            return []

        found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.DIR_RE})
        for element in found:
            dir_name = element.string.strip().replace("/", ".")
            ref_id = re.search(self.DIR_RE, element.get("href")).group(1)
            dir_id = f"fold_{ref_id}"
            dirs.append((dir_name, dir_id))

        return dirs
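A sketch of how the ILIAS synchronizer above might be driven, again assuming the PFERD package is importable and an asyncio loop is available; the ref_id, cookie file name, and filter predicate are placeholders for illustration only:

import asyncio
import pathlib

from PFERD.ilias import ILIAS

async def main():
    base = pathlib.Path(__file__).parent
    # "cookies.txt" and the ref_id 12345 are hypothetical values.
    ilias = ILIAS(base, "cookies.txt")
    try:
        await ilias.synchronize(
            "12345",
            "ILIAS-Material",
            # Skip directories whose name starts with "Tutorien".
            filter=lambda path: not path.name.startswith("Tutorien"),
        )
    finally:
        await ilias.close()

asyncio.run(main())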
@@ -19,211 +19,211 @@ from .read_write_lock import ReadWriteLock
from . import utils

__all__ = [
    "ShibbolethAuthenticator",
]
logger = logging.getLogger(__name__)

class ShibbolethAuthenticator:

    ILIAS_GOTO = "https://ilias.studium.kit.edu/goto.php"

    RETRY_ATTEMPTS = 5
    RETRY_DELAY = 1 # seconds
    CHUNK_SIZE = 1024**2

    ALLOWED_CONTENT_TYPES = [
        "application/pdf",
        "application/zip",
        "text/xml",
        "text/plain",
        "image/jpeg",
        "image/png",
    ]

    def __init__(self, cookie_path=None):
        self._cookie_path = cookie_path

        # Authentication and file/page download should not happen at the same time.
        # Authenticating counts as writing, file/page downloads as reading.
        self._lock = ReadWriteLock()

        # Only one self._authenticate() should be started, even if multiple self.get_page()s
        # notice they're logged in.
        # If self._event is not None, authenticating is currently in progress.
        self._event = None

        jar = aiohttp.CookieJar()
        if self._cookie_path is not None:
            try:
                jar.load(self._cookie_path)
            except FileNotFoundError:
                pass
        self._session = aiohttp.ClientSession(cookie_jar=jar)

    async def close(self):
        await self._session.close()

    async def _post(self, url, params=None, data=None):
        for t in range(self.RETRY_ATTEMPTS):
            try:
                async with self._session.post(url, params=params, data=data) as resp:
                    text = await resp.text()
                    return resp.url, text
            except aiohttp.client_exceptions.ServerDisconnectedError:
                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                await asyncio.sleep(self.RETRY_DELAY)

        logger.error(f"Could not POST {url} params:{params} data:{data}.")
        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")

    async def _get(self, url, params=None):
        for t in range(self.RETRY_ATTEMPTS):
            try:
                async with self._session.get(url, params=params) as resp:
                    text = await resp.text()
                    return resp.url, text
            except aiohttp.client_exceptions.ServerDisconnectedError:
                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                await asyncio.sleep(self.RETRY_DELAY)

        logger.error(f"Could not GET {url} params:{params}.")
        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")

    def _login_successful(self, soup):
        saml_response = soup.find("input", {"name": "SAMLResponse"})
        relay_state = soup.find("input", {"name": "RelayState"})
        return saml_response is not None and relay_state is not None

    def _save_cookies(self):
        logger.info(f"Saving cookies to {self._cookie_path}")
        if self._cookie_path is not None:
            self._session.cookie_jar.save(self._cookie_path)

    # WARNING: Only use self._ensure_authenticated() to authenticate,
    # don't call self._authenticate() itself.
    async def _authenticate(self):
        async with self._lock.write():
            # Equivalent: Click on "Mit KIT-Account anmelden" button in
            # https://ilias.studium.kit.edu/login.php
            url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
            data = {
                "sendLogin": "1",
                "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
                "target": "/shib_login.php",
                "home_organization_selection": "Mit KIT-Account anmelden",
            }
            logger.debug("Begin authentication process with ILIAS")
            url, text = await self._post(url, data=data)
            soup = bs4.BeautifulSoup(text, "html.parser")

            # Attempt to login using credentials, if necessary
            while not self._login_successful(soup):
                form = soup.find("form", {"class": "form2", "method": "post"})
                action = form["action"]

                print("Please enter Shibboleth credentials.")
                username = getpass.getpass(prompt="Username: ")
                password = getpass.getpass(prompt="Password: ")

                # Equivalent: Enter credentials in
                # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
                url = "https://idp.scc.kit.edu" + action
                data = {
                    "_eventId_proceed": "",
                    "j_username": username,
                    "j_password": password,
                }
                logger.debug("Attempt to log in to Shibboleth using credentials")
                url, text = await self._post(url, data=data)
                soup = bs4.BeautifulSoup(text, "html.parser")

                if not self._login_successful(soup):
                    print("Incorrect credentials.")

            # Saving progress: Successfully authenticated with Shibboleth
            self._save_cookies()

            relay_state = soup.find("input", {"name": "RelayState"})["value"]
            saml_response = soup.find("input", {"name": "SAMLResponse"})["value"]

            # Equivalent: Being redirected via JS automatically
            # (or clicking "Continue" if you have JS disabled)
            url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST"
            data = {
                "RelayState": relay_state,
                "SAMLResponse": saml_response,
            }
            logger.debug("Redirect back to ILIAS with login information")
            url, text = await self._post(url, data=data)

            # Saving progress: Successfully authenticated with Ilias
            self._save_cookies()

    async def _ensure_authenticated(self):
        if self._event is None:
            self._event = asyncio.Event()
            logger.info("Not logged in, authentication required.")
            await self._authenticate()
            self._event.set()
            self._event = None
        else:
            await self._event.wait()

    def _is_logged_in(self, soup):
        userlog = soup.find("li", {"id": "userlog"})
        return userlog is not None

    async def get_webpage_refid(self, ref_id):
        return await self.get_webpage(f"fold_{ref_id}")

    async def get_webpage(self, object_id):
        params = {"target": object_id}

        while True:
            async with self._lock.read():
                logger.debug(f"Getting {self.ILIAS_GOTO} {params}")
                _, text = await self._get(self.ILIAS_GOTO, params=params)
                soup = bs4.BeautifulSoup(text, "html.parser")

            if self._is_logged_in(soup):
                return soup
            else:
                await self._ensure_authenticated()

    async def _download(self, url, params, to_path):
        for t in range(self.RETRY_ATTEMPTS):
            try:
                async with self._session.get(url, params=params) as resp:
                    if resp.content_type in self.ALLOWED_CONTENT_TYPES:
                        # Yay, we got the file (as long as it's a PDF)
                        await utils.stream_to_path(resp, to_path)
                        return True
                    elif resp.content_type == "text/html":
                        # Dangit, we're probably not logged in.
                        text = await resp.text()
                        soup = bs4.BeautifulSoup(text, "html.parser")
                        if self._is_logged_in(soup):
                            raise utils.UnknownFileTypeException(f"Attempting to download a web page (use get_webpage() instead).")
                        return False
                    else:
                        # What *did* we get?
                        raise utils.UnknownFileTypeException(f"Unknown file of type {resp.content_type}.")

            except aiohttp.client_exceptions.ServerDisconnectedError:
                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                await asyncio.sleep(self.RETRY_DELAY)

        logger.error(f"Could not download {url} params:{params}.")
        raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")

    async def download_file(self, file_id, to_path):
        params = {"target": file_id}

        while True:
            async with self._lock.read():
                success = await self._download(self.ILIAS_GOTO, params, to_path)

            if success:
                return
            else:
                await self._ensure_authenticated()
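The authenticator above is normally driven by the ILIAS class, but it can be used on its own; the ReadWriteLock plus the shared asyncio.Event mean concurrent page and file requests reuse a single interactive login rather than each prompting for credentials. A usage sketch, assuming the module path PFERD.ilias_authenticators (as the import in ilias.py suggests); the cookie path and file id are placeholders:

import asyncio
import pathlib

from PFERD.ilias_authenticators import ShibbolethAuthenticator

async def main():
    # Cookie path and file id are hypothetical values.
    auth = ShibbolethAuthenticator(pathlib.Path("ilias_cookies.txt"))
    try:
        soup = await auth.get_webpage_refid(12345)  # prompts for credentials if needed
        print(soup.title)
        await auth.download_file("file_1234567_download", pathlib.Path("lecture.pdf"))
    finally:
        await auth.close()

asyncio.run(main())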
164 PFERD/norbert.py
@@ -12,113 +12,113 @@ from .organizer import Organizer
from . import utils

__all__ = [
    "Norbert",
]
logger = logging.getLogger(__name__)

class Norbert:
    BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/"
    LINK_RE = re.compile(r"^progtut/.*/(.*\.zip)$")

    RETRY_ATTEMPTS = 5
    RETRY_DELAY = 1 # seconds

    def __init__(self, base_path):
        self.base_path = base_path

        self._session = aiohttp.ClientSession()

    async def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True):
        logging.info(f" Synchronizing to {to_dir} using the Norbert synchronizer.")

        sync_path = pathlib.Path(self.base_path, to_dir)
        orga = Organizer(self.base_path, sync_path)

        orga.clean_temp_dir()

        files = await self._crawl()
        await self._download(orga, files, transform, unzip)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    async def close(self):
        await self._session.close()

    async def _crawl(self):
        url = self.BASE_URL
        async with self._session.get(url) as resp:
            raw = await resp.read()
            # replace undecodeable characters with a placeholder
            text = raw.decode("utf-8", "replace")
            soup = bs4.BeautifulSoup(text, "html.parser")

        files = []

        for found in soup.find_all("a", href=self.LINK_RE):
            url = found["href"]
            full_url = self.BASE_URL + url

            filename = re.search(self.LINK_RE, url).group(1)
            path = pathlib.PurePath(filename)

            logger.debug(f"Found zip file {filename} at {full_url}")

            files.append((path, full_url))

        return files

    async def _download(self, orga, files, transform, unzip):
        for path, url in sorted(files):
            # Yes, we want the zip file contents
            if unzip(path):
                logger.debug(f"Downloading and unzipping {path}")
                zip_path = utils.rename(path, path.stem)

                # Download zip file
                temp_file = orga.temp_file()
                await self._download_zip(url, temp_file)

                # Search the zip file for files to extract
                temp_dir = orga.temp_dir()
                with zipfile.ZipFile(temp_file, "r") as zf:
                    for info in zf.infolist():
                        # Only interested in the files themselves, the directory
                        # structure is created automatically by orga.add_file()
                        if info.is_dir():
                            continue

                        file_path = zip_path / pathlib.PurePath(info.filename)
                        logger.debug(f"Found {info.filename} at path {file_path}")

                        new_path = transform(file_path)
                        if new_path is not None:
                            # Extract to temp file and add, the usual deal
                            temp_file = orga.temp_file()
                            extracted_path = zf.extract(info, temp_dir)
                            extracted_path = pathlib.Path(extracted_path)
                            orga.add_file(extracted_path, new_path)

            # No, we only want the zip file itself
            else:
                logger.debug(f"Only downloading {path}")

                new_path = transform(path)
                if new_path is not None:
                    temp_file = orga.temp_file()
                    await self._download_zip(url, temp_file)
                    orga.add_file(temp_file, new_path)

    async def _download_zip(self, url, to_path):
        for t in range(self.RETRY_ATTEMPTS):
            try:
                async with self._session.get(url) as resp:
                    await utils.stream_to_path(resp, to_path)
            except aiohttp.client_exceptions.ServerDisconnectedError:
                logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
                await asyncio.sleep(self.RETRY_DELAY)
            else:
                return
        else:
            logger.error(f"Could not download {url}")
            raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
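Unlike the other synchronizers, Norbert decides per archive whether to unpack it or keep the raw zip, driven by the unzip predicate. A usage sketch, assuming the PFERD package is importable; the target directory and the predicate are illustrative only:

import asyncio
import pathlib

from PFERD.norbert import Norbert

async def main():
    norbert = Norbert(pathlib.Path(__file__).parent)
    try:
        # Unzip everything except one archive, which is kept as a plain .zip;
        # the directory name and the file name are made-up examples.
        await norbert.synchronize(
            "Progtut",
            unzip=lambda path: path.name != "solutions.zip",
        )
    finally:
        await norbert.close()

asyncio.run(main())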
@@ -6,135 +6,135 @@ import shutil
from . import utils

__all__ = [
    "Organizer",
]
logger = logging.getLogger(__name__)

class Organizer:
    def __init__(self, base_dir, sync_dir):
        """
        base_dir - the .tmp directory will be created here
        sync_dir - synced files will be moved here
        Both are expected to be concrete pathlib paths.
        """

        self._base_dir = base_dir
        self._sync_dir = sync_dir

        self._temp_dir = pathlib.Path(self._base_dir, ".tmp")
        self._temp_nr = 0

        # check if base/sync dir exist?

        self._added_files = set()

    def clean_temp_dir(self):
        if self._temp_dir.exists():
            shutil.rmtree(self._temp_dir)
        self._temp_dir.mkdir(exist_ok=True)
        logger.debug(f"Cleaned temp dir: {self._temp_dir}")

    def temp_dir(self):
        nr = self._temp_nr
        self._temp_nr += 1
        temp_dir = pathlib.Path(self._temp_dir, f"{nr:08}").resolve()
        logger.debug(f"Produced new temp dir: {temp_dir}")
        return temp_dir

    def temp_file(self):
        # generate the path to a new temp file in base_path/.tmp/
        # make sure no two paths are the same
        nr = self._temp_nr
        self._temp_nr += 1
        temp_file = pathlib.Path(self._temp_dir, f"{nr:08}.tmp").resolve()
        logger.debug(f"Produced new temp file: {temp_file}")
        return temp_file

    def add_file(self, from_path, to_path):
        if not from_path.exists():
            raise utils.FileNotFoundException(f"Could not add file at {from_path}")

        # check if sync_dir/to_path is inside sync_dir?
        to_path = pathlib.Path(self._sync_dir, to_path)

        if to_path.exists() and to_path.is_dir():
            if self._prompt_yes_no(f"Overwrite folder {to_path} with file?", default=False):
                shutil.rmtree(to_path)
            else:
                logger.warn(f"Could not add file {to_path}")
                return

        if to_path.exists():
            if filecmp.cmp(from_path, to_path, shallow=False):
                logger.info(f"Ignored {to_path}")

                # remember path for later reference
                self._added_files.add(to_path.resolve())
                logger.debug(f"Added file {to_path.resolve()}")

                # No further action needed, especially not overwriting symlinks...
                return
            else:
                logger.info(f"Different file at {to_path}")
        else:
            logger.info(f"New file at {to_path}")

        # copy the file from from_path to sync_dir/to_path
        # If the file being replaced was a symlink, the link itself is overwritten,
        # not the file the link points to.
        to_path.parent.mkdir(parents=True, exist_ok=True)
        from_path.replace(to_path)
        logger.debug(f"Moved {from_path} to {to_path}")

        # remember path for later reference, after the new file was written
        # This is necessary here because otherwise, resolve() would resolve the symlink too.
        self._added_files.add(to_path.resolve())
        logger.debug(f"Added file {to_path.resolve()}")

    def clean_sync_dir(self):
        self._clean_dir(self._sync_dir, remove_parent=False)
        logger.debug(f"Cleaned sync dir: {self._sync_dir}")

    def _clean_dir(self, path, remove_parent=True):
        for child in sorted(path.iterdir()):
            logger.debug(f"Looking at {child.resolve()}")
            if child.is_dir():
                self._clean_dir(child, remove_parent=True)
            elif child.resolve() not in self._added_files:
                if self._prompt_yes_no(f"Delete {child}?", default=False):
                    child.unlink()
                    logger.debug(f"Deleted {child}")

        if remove_parent:
            try:
                path.rmdir()
            except OSError: # directory not empty
                pass

    def _prompt_yes_no(self, question, default=None):
        if default is True:
            prompt = "[Y/n]"
        elif default is False:
            prompt = "[y/N]"
        else:
            prompt = "[y/n]"

        text = f"{question} {prompt} "
        WRONG_REPLY = "Please reply with 'yes'/'y' or 'no'/'n'."

        while True:
            response = input(text).strip().lower()
            if response in {"yes", "ye", "y"}:
                return True
            elif response in {"no", "n"}:
                return False
            elif response == "":
                if default is None:
                    print(WRONG_REPLY)
                else:
                    return default
            else:
                print(WRONG_REPLY)

# How to use:
#
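All three synchronizers drive the Organizer above through the same sequence: clean the temp dir, stage each download as a temp file, add_file() it under its final name, then clean both directories so that anything not re-added this run is offered for interactive deletion. A condensed sketch of that contract, assuming the module path PFERD.organizer; the paths and file content are made up and not part of the diff:

import pathlib

from PFERD.organizer import Organizer

base = pathlib.Path(__file__).parent
orga = Organizer(base, base / "Synced")

orga.clean_temp_dir()

# A synchronizer would download into temp_path instead of writing directly.
temp_path = orga.temp_file()
temp_path.write_text("downloaded content")
orga.add_file(temp_path, pathlib.PurePath("notes", "example.txt"))

# Files under Synced/ that were not added this run are offered for deletion.
orga.clean_sync_dir()
orga.clean_temp_dir()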
@@ -2,39 +2,39 @@ import os
import pathlib

__all__ = [
    "get_base_dir",
    "move",
    "rename",
    "stream_to_path",
    "OutOfTriesException",
    "UnknownFileTypeException",
    "FileNotFoundException",
]

def get_base_dir(script_file):
    return pathlib.Path(os.path.dirname(os.path.abspath(script_file)))

def move(path, from_folders, to_folders):
    l = len(from_folders)
    if path.parts[:l] == from_folders:
        return pathlib.PurePath(*to_folders, *path.parts[l:])

def rename(path, to_name):
    return pathlib.PurePath(*path.parts[:-1], to_name)

async def stream_to_path(resp, to_path, chunk_size=1024**2):
    with open(to_path, 'wb') as fd:
        while True:
            chunk = await resp.content.read(chunk_size)
            if not chunk:
                break
            fd.write(chunk)

class OutOfTriesException(Exception):
    pass

class UnknownFileTypeException(Exception):
    pass

class FileNotFoundException(Exception):
    pass
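The move and rename helpers above operate purely on path parts, which makes them convenient building blocks for transform functions. A short sketch of their behaviour, assuming the module path PFERD.utils; the example paths are invented:

import pathlib

from PFERD.utils import move, rename

p = pathlib.PurePath("Vorlesung", "Woche 1", "blatt01.pdf")

# Re-root the first folder: Vorlesung/... -> Uni/Analysis/...
print(move(p, ("Vorlesung",), ("Uni", "Analysis")))  # Uni/Analysis/Woche 1/blatt01.pdf

# Replace only the final component.
print(rename(p, "Blatt 01.pdf"))                     # Vorlesung/Woche 1/Blatt 01.pdf

# move() returns None when the prefix does not match, so a transform can
# fall through to other rules.
print(move(p, ("Uebung",), ("Uni",)))                # None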