pferd/PFERD/ilias.py

# ILIAS synchronizer: crawls an ILIAS folder tree and downloads the files it finds.

import aiohttp
import asyncio
import bs4
import logging
import pathlib
import re

from .organizer import Organizer
from .ilias_authenticators import ShibbolethAuthenticator
from . import utils

# Public API of this module: only the synchronizer class is exported.
__all__ = ["ILIAS"]

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

class ILIAS:
	"""Synchronizer that copies the files of an ILIAS folder tree to disk.

	A folder page is fetched through the authenticator, scanned for file and
	sub-folder links, and the sub-folders are crawled recursively. The files
	found are then downloaded one by one and handed to an ``Organizer``.
	"""

	# Matches the download link of a file item. Group 1 is the file id
	# (e.g. "file_123_download") that is later passed to download_file().
	FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)")
	# Matches the link of a folder item. Group 1 is the numeric ref_id.
	DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)")

	def __init__(self, base_path, cookie_file):
		"""Create a synchronizer rooted at ``base_path``.

		Args:
			base_path: Base directory; must support the ``/`` operator
				(presumably a ``pathlib.Path`` -- confirm against callers).
			cookie_file: Path of the cookie file relative to ``base_path``,
				passed on to the Shibboleth authenticator.
		"""
		self.base_path = base_path

		self._auth = ShibbolethAuthenticator(base_path / cookie_file)

	async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):
		"""Crawl the ILIAS folder ``ref_id`` and download its files into ``to_dir``.

		Args:
			ref_id: Numeric ILIAS reference id of the folder to start from.
			to_dir: Target directory, relative to ``base_path``.
			transform: Called with the relative path of every file found.
				Returns the path to store the file under, or ``None`` to
				skip the file. Defaults to the identity function.
			filter: Called with the relative path of every sub-folder found.
				The folder is only crawled if the result is truthy. (The
				name shadows the builtin; it is kept for compatibility
				with existing keyword callers.)
		"""
		# Use the module logger rather than the root-level logging.info(), so
		# this message follows the same configuration as all others here.
		logger.info(f"    Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.")

		sync_path = pathlib.Path(self.base_path, to_dir)
		orga = Organizer(self.base_path, sync_path)

		# Presumably removes leftovers of an earlier, interrupted run --
		# Organizer is defined elsewhere; confirm there.
		orga.clean_temp_dir()

		files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter)
		await self._download(orga, files, transform)

		orga.clean_sync_dir()
		orga.clean_temp_dir()

	async def close(self):
		"""Close the underlying authenticator."""
		await self._auth.close()

	async def _crawl(self, dir_path, dir_id, filter_):
		"""Recursively collect the files below the folder ``dir_id``.

		Args:
			dir_path: Relative path of the folder being crawled.
			dir_id: Folder id of the form ``fold_<ref_id>``.
			filter_: Predicate on a sub-folder's relative path; sub-folders
				for which it is falsy are not entered.

		Returns:
			A list of ``(relative_path, file_id)`` tuples, the files of this
			folder first, followed by those of the crawled sub-folders.
		"""
		soup = await self._auth.get_webpage(dir_id)

		found_files = []

		for (name, file_id) in self._find_files(soup):
			path = dir_path / name
			found_files.append((path, file_id))
			logger.debug(f"Found file {path}")

		for (name, sub_dir_id) in self._find_dirs(soup):
			path = dir_path / name
			logger.debug(f"Found dir {path}")
			if filter_(path):
				logger.info(f"Searching {path}")
				found_files.extend(await self._crawl(path, sub_dir_id, filter_))
			else:
				logger.info(f"Not searching {path}")

		return found_files

	async def _download(self, orga, files, transform):
		"""Download ``files`` sequentially, in sorted order, via ``orga``.

		Args:
			orga: Organizer providing ``temp_file()`` and ``add_file()``.
			files: Iterable of ``(relative_path, file_id)`` tuples.
			transform: Maps a relative path to its target path, or to
				``None`` if the file should not be downloaded.
		"""
		for (path, file_id) in sorted(files):
			to_path = transform(path)
			if to_path is None:
				# The caller chose to skip this file.
				continue

			# Download to a temporary location first, then let the organizer
			# file it under its final name.
			temp_path = orga.temp_file()
			await self._auth.download_file(file_id, temp_path)
			orga.add_file(temp_path, to_path)

	def _find_files(self, soup):
		"""Extract the file items listed on a folder page.

		Args:
			soup: Parsed page offering ``find_all()`` (presumably a
				``bs4.BeautifulSoup`` object -- confirm in the authenticator).

		Returns:
			A list of ``(file_name, file_id)`` tuples, where ``file_name`` is
			``"<title>.<type>"`` and ``file_id`` is group 1 of ``FILE_RE``.
		"""
		files = []

		found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.FILE_RE})
		for element in found:
			# NOTE(review): .string is None if the link contains nested
			# markup; such a page would raise AttributeError here.
			file_stem = element.string.strip()
			# The property list is looked up three levels above the title
			# link; the text of its first <span> is used as the extension.
			properties = element.parent.parent.parent.find("div", {"class": "il_ItemProperties"})
			file_type = properties.find("span").string.strip()
			file_id = self.FILE_RE.search(element.get("href")).group(1)

			files.append((f"{file_stem}.{file_type}", file_id))

		return files

	def _find_dirs(self, soup):
		"""Extract the sub-folder items listed on a folder page.

		Args:
			soup: Parsed page offering ``find_all()`` (presumably a
				``bs4.BeautifulSoup`` object -- confirm in the authenticator).

		Returns:
			A list of ``(dir_name, dir_id)`` tuples with ``dir_id`` of the
			form ``fold_<ref_id>``. The list is empty if the page contains
			an alert box.
		"""
		# A page showing an alert box is treated as having no sub-folders
		# (presumably an error or access-denied page -- confirm on ILIAS).
		if soup.find_all("div", {"class": "alert", "role": "alert"}):
			return []

		dirs = []

		found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.DIR_RE})
		for element in found:
			dir_name = element.string.strip()
			ref_id = self.DIR_RE.search(element.get("href")).group(1)
			dirs.append((dir_name, f"fold_{ref_id}"))

		return dirs
Sync files from ILIAS 2018-11-26 14:39:06 +01:00			`# ILIAS`

			`import aiohttp`
			`import asyncio`
			`import bs4`
			`import logging`
			`import pathlib`
			`import re`

			`from .organizer import Organizer`
			`from .ilias_authenticators import ShibbolethAuthenticator`
			`from . import utils`

			`__all__ = [`
			`"ILIAS",`
			`]`
			`logger = logging.getLogger(__name__)`

			`class ILIAS:`
			`FILE_RE = re.compile(r"goto\.php\?target=(file_\d+_download)")`
			`DIR_RE = re.compile(r"ilias\.php\?ref_id=(\d+)")`

			`def __init__(self, base_path, cookie_file):`
			`self.base_path = base_path`

			`self._auth = ShibbolethAuthenticator(base_path / cookie_file)`

			`async def synchronize(self, ref_id, to_dir, transform=lambda x: x, filter=lambda x: True):`
Clean up minor things - improve logging messages - allow more download file formats - strip file names 2018-11-26 18:00:17 +01:00			`logging.info(f" Synchronizing ref_id {ref_id} to {to_dir} using the ILIAS synchronizer.")`
Sync files from ILIAS 2018-11-26 14:39:06 +01:00
			`sync_path = pathlib.Path(self.base_path, to_dir)`
			`orga = Organizer(self.base_path, sync_path)`

			`orga.clean_temp_dir()`

			`files = await self._crawl(pathlib.PurePath(), f"fold_{ref_id}", filter)`
			`await self._download(orga, files, transform)`

			`orga.clean_sync_dir()`
			`orga.clean_temp_dir()`

			`async def close(self):`
			`await self._auth.close()`

			`async def _crawl(self, dir_path, dir_id, filter_):`
			`soup = await self._auth.get_webpage(dir_id)`

			`found_files = []`

			`files = self._find_files(soup)`
			`for (name, file_id) in files:`
			`path = dir_path / name`
			`found_files.append((path, file_id))`
			`logger.debug(f"Found file {path}")`

			`dirs = self._find_dirs(soup)`
			`for (name, ref_id) in dirs:`
			`path = dir_path / name`
			`logger.debug(f"Found dir {path}")`
			`if filter_(path):`
			`logger.info(f"Searching {path}")`
			`files = await self._crawl(path, ref_id, filter_)`
			`found_files.extend(files)`
			`else:`
			`logger.info(f"Not searching {path}")`

			`return found_files`

			`async def _download(self, orga, files, transform):`
Clean up minor things - improve logging messages - allow more download file formats - strip file names 2018-11-26 18:00:17 +01:00			`for (path, file_id) in sorted(files):`
Sync files from ILIAS 2018-11-26 14:39:06 +01:00			`to_path = transform(path)`
			`if to_path is not None:`
			`temp_path = orga.temp_file()`
			`await self._auth.download_file(file_id, temp_path)`
			`orga.add_file(temp_path, to_path)`

			`def _find_files(self, soup):`
			`files = []`

			`found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.FILE_RE})`
			`for element in found:`
Clean up minor things - improve logging messages - allow more download file formats - strip file names 2018-11-26 18:00:17 +01:00			`file_stem = element.string.strip()`
Sync files from ILIAS 2018-11-26 14:39:06 +01:00			`file_type = element.parent.parent.parent.find("div", {"class": "il_ItemProperties"}).find("span").string.strip()`
Clean up minor things - improve logging messages - allow more download file formats - strip file names 2018-11-26 18:00:17 +01:00			`file_id = re.search(self.FILE_RE, element.get("href")).group(1)`
Sync files from ILIAS 2018-11-26 14:39:06 +01:00
			`file_name = f"{file_stem}.{file_type}"`
			`files.append((file_name, file_id))`

			`return files`

			`def _find_dirs(self, soup):`
			`dirs = []`

Fix tut crawling 2018-11-27 11:28:39 +01:00			`found = soup.find_all("div", {"class": "alert", "role": "alert"})`
			`if found:`
			`return []`

Sync files from ILIAS 2018-11-26 14:39:06 +01:00			`found = soup.find_all("a", {"class": "il_ContainerItemTitle", "href": self.DIR_RE})`
			`for element in found:`
Clean up minor things - improve logging messages - allow more download file formats - strip file names 2018-11-26 18:00:17 +01:00			`dir_name = element.string.strip()`
Sync files from ILIAS 2018-11-26 14:39:06 +01:00			`ref_id = re.search(self.DIR_RE, element.get("href")).group(1)`
			`dir_id = f"fold_{ref_id}"`
			`dirs.append((dir_name, dir_id))`

			`return dirs`