Sync files from ILIAS

Joscha, 2018-11-26 13:39:06 +00:00
commit 34da5d4d19 (parent 529c4a7dda)
5 changed files with 124 additions and 12 deletions

PFERD/__init__.py

@@ -1,12 +1,10 @@
 from .ffm import *
-from .ilias_authenticators import *
-from .organizer import *
+from .ilias import *
 from .utils import *

 __all__ = (
     ffm.__all__ +
-    ilias_authenticators.__all__ +
-    organizer.__all__ +
+    ilias.__all__ +
     utils.__all__ +
     []
 )

PFERD/ilias.py (new file, 103 lines)

@@ -0,0 +1,103 @@
# ILIAS

import aiohttp
import asyncio
import bs4
import logging
import pathlib
import re

from .organizer import Organizer
from .ilias_authenticators import ShibbolethAuthenticator
from . import utils

__all__ = [
    "ILIAS",
]
logger = logging.getLogger(__name__)

class ILIAS:
    FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)")
    DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)")

    def __init__(self, base_path, cookie_file):
        self.base_path = base_path

        self._auth = ShibbolethAuthenticator(base_path / cookie_file)

    async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
        logging.info(f"Synchronizing {ref_id} to {to_dir} using the ILIAS synchronizer.")

        sync_path = pathlib.Path(self.base_path, to_dir)
        orga = Organizer(self.base_path, sync_path)

        orga.clean_temp_dir()

        files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter)
        await self._download(orga, files, transform)

        orga.clean_sync_dir()
        orga.clean_temp_dir()

    async def close(self):
        await self._auth.close()

    async def _crawl(self, dir_path, dir_id, filter_):
        soup = await self._auth.get_webpage(dir_id)

        found_files = []

        files = self._find_files(soup)
        for (name, file_id) in files:
            path = dir_path / name
            found_files.append((path, file_id))
            logger.debug(f"Found file {path}")

        dirs = self._find_dirs(soup)
        for (name, ref_id) in dirs:
            path = dir_path / name
            logger.debug(f"Found dir {path}")
            if filter_(path):
                logger.info(f"Searching {path}")
                files = await self._crawl(path, ref_id, filter_)
                found_files.extend(files)
            else:
                logger.info(f"Not searching {path}")

        return found_files

    async def _download(self, orga, files, transform):
        for (path, file_id) in files:
            to_path = transform(path)
            if to_path is not None:
                temp_path = orga.temp_file()
                await self._auth.download_file(file_id, temp_path)
                orga.add_file(temp_path, to_path)

    def _find_files(self, soup):
        files = []

        found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.FILE_RE})
        for element in found:
            file_stem = element.string
            file_id = re.search(self.FILE_RE, element.get("href")).group(1)

            # find out file type
            file_type = element.parent.parent.parent.find("div", {"class": "il_ItemProperties"}).find("span").string.strip()

            file_name = f"{file_stem}.{file_type}"
            files.append((file_name, file_id))

        return files

    def _find_dirs(self, soup):
        dirs = []

        found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.DIR_RE})
        for element in found:
            dir_name = element.string
            ref_id = re.search(self.DIR_RE, element.get("href")).group(1)
            dir_id = f"fold_{ref_id}"
            dirs.append((dir_name, dir_id))

        return dirs
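For orientation (not part of the commit): the two class-level regexes drive the whole crawl. FILE_RE recognizes ILIAS download links and captures the file id that download_file needs, while DIR_RE recognizes folder links and captures the numeric ref id. A quick sanity check against hrefs of the shape the patterns expect; the hostname and file id below are made up, only 874938 comes from the example script in this commit.

import re

# Patterns as defined in PFERD/ilias.py above.
FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)")
DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)")

# Hypothetical hrefs in the shape the patterns expect.
file_href = "https://ilias.example.org/goto.php?target=file_12345_download&client_id=x"
dir_href = "https://ilias.example.org/ilias.php?ref_id=874938&cmd=view"

print(FILE_RE.search(file_href).group(1))  # file_12345_download
print(DIR_RE.search(dir_href).group(1))    # 874938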

PFERD/ilias_authenticators.py

@@ -86,7 +86,7 @@ class ShibbolethAuthenticator:
         return saml_response is not None and relay_state is not None

     def _save_cookies(self):
-        logger.info(f"Saving cookies to {self._cookie_path!r}")
+        logger.info(f"Saving cookies to {self._cookie_path}")
         if self._cookie_path is not None:
             self._session.cookie_jar.save(self._cookie_path)

@@ -172,7 +172,7 @@ class ShibbolethAuthenticator:
         while True:
             async with self._lock.read():
-                logger.debug(f"Getting {url} {params}")
+                logger.debug(f"Getting {self.ILIAS_GOTO} {params}")
                 _, text = await self._get(self.ILIAS_GOTO, params=params)
                 soup = bs4.BeautifulSoup(text, "html.parser")

PFERD/utils.py

@@ -1,4 +1,5 @@
 import os
+import pathlib

 __all__ = [
     "get_base_dir",

@@ -9,7 +10,7 @@ __all__ = [
 ]

 def get_base_dir(script_file):
-    return os.path.dirname(os.path.abspath(script_file))
+    return pathlib.Path(os.path.dirname(os.path.abspath(script_file)))

 async def stream_to_path(resp, to_path, chunk_size=1024**2):
     with open(to_path, 'wb') as fd:
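This return-type change matters beyond nicer logging: the new ILIAS class joins paths with the / operator (base_path / cookie_file), which works on a pathlib.Path but raises a TypeError on the plain string the old get_base_dir returned. A minimal sketch, assuming only the code in this hunk:

import os
import pathlib

def get_base_dir(script_file):
    # New behaviour from the hunk above: returns a Path, not a str.
    return pathlib.Path(os.path.dirname(os.path.abspath(script_file)))

base_dir = get_base_dir(__file__)
cookie_jar = base_dir / "cookie_jar"  # Path / str works; str / str would raise TypeError
print(cookie_jar)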

example script (filename not shown in this view)

@@ -3,7 +3,7 @@ import asyncio
 import logging
 import pathlib

-logging.basicConfig(level=logging.INFO, format=PFERD.LOG_FORMAT)
+logging.basicConfig(level=logging.DEBUG, format=PFERD.LOG_FORMAT)

 base_dir = PFERD.get_base_dir(__file__)

@@ -19,11 +19,21 @@ def ana1(old_path):
     return old_path

+def la1_filter(path):
+    if path.match("Tutorien/*"):
+        return False
+
+    return True
+
 async def main():
-    ffm = PFERD.FfM(base_dir)
-    await ffm.synchronize("iana2/lehre/hm1info2018w/de", "HM1", transform=hm1)
-    await ffm.synchronize("iana1/lehre/ana12018w/de", "Ana1", transform=ana1)
-    await ffm.close()
+    #ffm = PFERD.FfM(base_dir)
+    #await ffm.synchronize("iana2/lehre/hm1info2018w", "HM1", transform=hm1)
+    #await ffm.synchronize("iana1/lehre/ana12018w", "Ana1", transform=ana1)
+    #await ffm.close()
+
+    ilias = PFERD.ILIAS(base_dir, "cookie_jar")
+    await ilias.synchronize("874938", "LA1", filter=la1_filter)
+    await ilias.close()

 if __name__ == "__main__":
     asyncio.run(main())
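For reference: per ILIAS._download above, a transform receives each crawled PurePath and returns the path to store the file under, or None to skip it, while a filter (like la1_filter) decides whether a directory is descended into at all. The hm1 and ana1 transforms are defined outside this diff; a hypothetical transform in the same style might look like this:

import pathlib

# Hypothetical transform (not part of the commit); the folder name is made up.
def la1_transform(old_path):
    # Skip files in a hypothetical "Altklausuren" folder entirely ...
    if old_path.match("Altklausuren/*"):
        return None
    # ... and keep everything else where the crawler found it.
    return old_path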