From 34da5d4d19c2af3428e40120b36f5522e64cfa5e Mon Sep 17 00:00:00 2001
From: Joscha
Date: Mon, 26 Nov 2018 13:39:06 +0000
Subject: [PATCH] Sync files from ILIAS

---
 PFERD/__init__.py             |   6 +-
 PFERD/ilias.py                | 103 ++++++++++++++++++++++++++++++++++
 PFERD/ilias_authenticators.py |   4 +-
 PFERD/utils.py                |   3 +-
 example_config.py             |  20 +++++--
 5 files changed, 124 insertions(+), 12 deletions(-)
 create mode 100644 PFERD/ilias.py

diff --git a/PFERD/__init__.py b/PFERD/__init__.py
index b580005..978aed7 100644
--- a/PFERD/__init__.py
+++ b/PFERD/__init__.py
@@ -1,12 +1,10 @@
 from .ffm import *
-from .ilias_authenticators import *
-from .organizer import *
+from .ilias import *
 from .utils import *

 __all__ = (
     ffm.__all__ +
-    ilias_authenticators.__all__ +
-    organizer.__all__ +
+    ilias.__all__ +
     utils.__all__ +
     []
 )
diff --git a/PFERD/ilias.py b/PFERD/ilias.py
new file mode 100644
index 0000000..9885826
--- /dev/null
+++ b/PFERD/ilias.py
@@ -0,0 +1,103 @@
+# ILIAS
+
+import aiohttp
+import asyncio
+import bs4
+import logging
+import pathlib
+import re
+
+from .organizer import Organizer
+from .ilias_authenticators import ShibbolethAuthenticator
+from . import utils
+
+__all__ = [
+    "ILIAS",
+]
+logger = logging.getLogger(__name__)
+
+class ILIAS:
+    FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)")
+    DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)")
+
+    def __init__(self, base_path, cookie_file):
+        self.base_path = base_path
+
+        self._auth = ShibbolethAuthenticator(base_path / cookie_file)
+
+    async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
+        logger.info(f"Synchronizing {ref_id} to {to_dir} using the ILIAS synchronizer.")
+
+        sync_path = pathlib.Path(self.base_path, to_dir)
+        orga = Organizer(self.base_path, sync_path)
+
+        orga.clean_temp_dir()
+
+        files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter)
+        await self._download(orga, files, transform)
+
+        orga.clean_sync_dir()
+        orga.clean_temp_dir()
+
+    async def close(self):
+        await self._auth.close()
+
+    async def _crawl(self, dir_path, dir_id, filter_):
+        soup = await self._auth.get_webpage(dir_id)
+
+        found_files = []
+
+        files = self._find_files(soup)
+        for (name, file_id) in files:
+            path = dir_path / name
+            found_files.append((path, file_id))
+            logger.debug(f"Found file {path}")
+
+        dirs = self._find_dirs(soup)
+        for (name, ref_id) in dirs:
+            path = dir_path / name
+            logger.debug(f"Found dir {path}")
+            if filter_(path):
+                logger.info(f"Searching {path}")
+                files = await self._crawl(path, ref_id, filter_)
+                found_files.extend(files)
+            else:
+                logger.info(f"Not searching {path}")
+
+        return found_files
+
+    async def _download(self, orga, files, transform):
+        for (path, file_id) in files:
+            to_path = transform(path)
+            if to_path is not None:
+                temp_path = orga.temp_file()
+                await self._auth.download_file(file_id, temp_path)
+                orga.add_file(temp_path, to_path)
+
+    def _find_files(self, soup):
+        files = []
+
+        found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.FILE_RE})
+        for element in found:
+            file_stem = element.string
+            file_id = re.search(self.FILE_RE, element.get("href")).group(1)
+
+            # find out file type
+            file_type = element.parent.parent.parent.find("div", {"class": "il_ItemProperties"}).find("span").string.strip()
+
+            file_name = f"{file_stem}.{file_type}"
+            files.append((file_name, file_id))
+
+        return files
+
+    def _find_dirs(self, soup):
+        dirs = []
+
+        found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.DIR_RE})
+        for element in found:
+            dir_name = element.string
+            ref_id = re.search(self.DIR_RE, element.get("href")).group(1)
+            dir_id = f"fold_{ref_id}"
+            dirs.append((dir_name, dir_id))
+
+        return dirs
diff --git a/PFERD/ilias_authenticators.py b/PFERD/ilias_authenticators.py
index 120ef96..3588640 100644
--- a/PFERD/ilias_authenticators.py
+++ b/PFERD/ilias_authenticators.py
@@ -86,7 +86,7 @@ class ShibbolethAuthenticator:
         return saml_response is not None and relay_state is not None

     def _save_cookies(self):
-        logger.info(f"Saving cookies to {self._cookie_path!r}")
+        logger.info(f"Saving cookies to {self._cookie_path}")
         if self._cookie_path is not None:
             self._session.cookie_jar.save(self._cookie_path)

@@ -172,7 +172,7 @@ class ShibbolethAuthenticator:

         while True:
             async with self._lock.read():
-                logger.debug(f"Getting {url} {params}")
+                logger.debug(f"Getting {self.ILIAS_GOTO} {params}")
                 _, text = await self._get(self.ILIAS_GOTO, params=params)
                 soup = bs4.BeautifulSoup(text, "html.parser")

diff --git a/PFERD/utils.py b/PFERD/utils.py
index bd82468..f125991 100644
--- a/PFERD/utils.py
+++ b/PFERD/utils.py
@@ -1,4 +1,5 @@
 import os
+import pathlib

 __all__ = [
     "get_base_dir",
@@ -9,7 +10,7 @@ __all__ = [
 ]

 def get_base_dir(script_file):
-    return os.path.dirname(os.path.abspath(script_file))
+    return pathlib.Path(os.path.dirname(os.path.abspath(script_file)))

 async def stream_to_path(resp, to_path, chunk_size=1024**2):
     with open(to_path, 'wb') as fd:
diff --git a/example_config.py b/example_config.py
index 02c8bb6..01c5da7 100644
--- a/example_config.py
+++ b/example_config.py
@@ -3,7 +3,7 @@ import asyncio
 import logging
 import pathlib

-logging.basicConfig(level=logging.INFO, format=PFERD.LOG_FORMAT)
+logging.basicConfig(level=logging.DEBUG, format=PFERD.LOG_FORMAT)

 base_dir = PFERD.get_base_dir(__file__)

@@ -19,11 +19,21 @@ def ana1(old_path):

     return old_path

+def la1_filter(path):
+    if path.match("Tutorien/*"):
+        return False
+
+    return True
+
 async def main():
-    ffm = PFERD.FfM(base_dir)
-    await ffm.synchronize("iana2/lehre/hm1info2018w/de", "HM1", transform=hm1)
-    await ffm.synchronize("iana1/lehre/ana12018w/de", "Ana1", transform=ana1)
-    await ffm.close()
+    #ffm = PFERD.FfM(base_dir)
+    #await ffm.synchronize("iana2/lehre/hm1info2018w", "HM1", transform=hm1)
+    #await ffm.synchronize("iana1/lehre/ana12018w", "Ana1", transform=ana1)
+    #await ffm.close()
+
+    ilias = PFERD.ILIAS(base_dir, "cookie_jar")
+    await ilias.synchronize("874938", "LA1", filter=la1_filter)
+    await ilias.close()

 if __name__ == "__main__":
     asyncio.run(main())
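
Usage sketch (editor's note, not part of the patch): the new ILIAS.synchronize coroutine takes a folder ref id, a target directory relative to base_dir, and two optional callbacks. transform maps each crawled file path to its local path, and returning None drops the file; filter is called once per directory and returning False prunes that whole subtree from the crawl. A minimal driver, assuming the module layout from this patch; the ref id "874938" and the "cookie_jar" name come from example_config.py above, while skip_tutorials and flatten are hypothetical callbacks and the INFO log level is an arbitrary choice:

    import asyncio
    import logging
    import pathlib

    import PFERD

    logging.basicConfig(level=logging.INFO, format=PFERD.LOG_FORMAT)

    base_dir = PFERD.get_base_dir(__file__)

    def skip_tutorials(path):
        # filter callback (hypothetical): directories for which this
        # returns False are not crawled, so their files never appear.
        return not path.match("Tutorien/*")

    def flatten(path):
        # transform callback (hypothetical): store every file directly
        # under the sync dir; returning None here would skip the file.
        return pathlib.PurePath(path.name)

    async def main():
        # "cookie_jar" is resolved relative to base_dir by ILIAS.__init__
        # and persists the Shibboleth session cookies between runs.
        ilias = PFERD.ILIAS(base_dir, "cookie_jar")
        await ilias.synchronize("874938", "LA1",
                                transform=flatten, filter=skip_tutorials)
        await ilias.close()

    if __name__ == "__main__":
        asyncio.run(main())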