From 068fe77dcf250766b227362b89ce0a61a6f786c1 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Mon, 26 Nov 2018 17:00:17 +0000
Subject: [PATCH] Clean up minor things

- improve logging messages
- allow more download file formats
- strip file names

---
 PFERD/ffm.py                  |  2 +-
 PFERD/ilias.py                | 12 +++++-------
 PFERD/ilias_authenticators.py |  2 +-
 PFERD/organizer.py            |  2 +-
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/PFERD/ffm.py b/PFERD/ffm.py
index 1122b72..b20b414 100644
--- a/PFERD/ffm.py
+++ b/PFERD/ffm.py
@@ -28,7 +28,7 @@ class FfM:
         self._session = aiohttp.ClientSession()
 
     async def synchronize(self, urlpart, to_dir, transform=lambda x: x):
-        logging.info(f"Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.")
+        logging.info(f" Synchronizing {urlpart} to {to_dir} using the FfM synchronizer.")
 
         sync_path = pathlib.Path(self.base_path, to_dir)
         orga = Organizer(self.base_path, sync_path)
diff --git a/PFERD/ilias.py b/PFERD/ilias.py
index 9885826..438e6bd 100644
--- a/PFERD/ilias.py
+++ b/PFERD/ilias.py
@@ -26,7 +26,7 @@ class ILIAS:
         self._auth = ShibbolethAuthenticator(base_path / cookie_file)
 
     async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
-        logging.info(f"Synchronizing {ref_id} to {to_dir} using the ILIAS synchronizer.")
+        logging.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.")
 
         sync_path = pathlib.Path(self.base_path, to_dir)
         orga = Organizer(self.base_path, sync_path)
@@ -67,7 +67,7 @@
         return found_files
 
     async def _download(self, orga, files, transform):
-        for (path, file_id) in files:
+        for (path, file_id) in sorted(files):
             to_path = transform(path)
             if to_path is not None:
                 temp_path = orga.temp_file()
@@ -79,11 +79,9 @@
         found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.FILE_RE})
 
         for element in found:
-            file_stem = element.string
-            file_id = re.search(self.FILE_RE, element.get("href")).group(1)
-
-            # find out file type
+            file_stem = element.string.strip()
             file_type = element.parent.parent.parent.find("div", {"class": "il_ItemProperties"}).find("span").string.strip()
+            file_id = re.search(self.FILE_RE, element.get("href")).group(1)
             file_name = f"{file_stem}.{file_type}"
             files.append((file_name, file_id))
 
@@ -95,7 +93,7 @@
         found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.DIR_RE})
 
         for element in found:
-            dir_name = element.string
+            dir_name = element.string.strip()
             ref_id = re.search(self.DIR_RE, element.get("href")).group(1)
             dir_id = f"fold_{ref_id}"
             dirs.append((dir_name, dir_id))
diff --git a/PFERD/ilias_authenticators.py b/PFERD/ilias_authenticators.py
index 3588640..ad10374 100644
--- a/PFERD/ilias_authenticators.py
+++ b/PFERD/ilias_authenticators.py
@@ -185,7 +185,7 @@ class ShibbolethAuthenticator:
         for t in range(self.RETRY_ATTEMPTS):
             try:
                 async with self._session.get(url, params=params) as resp:
-                    if resp.content_type == "application/pdf":
+                    if resp.content_type in ["application/pdf", "application/zip", "text/xml"]:
                         # Yay, we got the file (as long as it's a PDF)
                         await utils.stream_to_path(resp, to_path)
                         return True
diff --git a/PFERD/organizer.py b/PFERD/organizer.py
index 0a0f2b5..55ecf6f 100644
--- a/PFERD/organizer.py
+++ b/PFERD/organizer.py
@@ -55,7 +55,7 @@ class Organizer:
 
         if to_path.exists():
            if filecmp.cmp(from_path, to_path, shallow=False):
-                logger.info(f"Done nothing at {to_path}")
+                logger.info(f"Ignored {to_path}")
                 # No further action needed, especially not overwriting symlinks...
                 return
             else: