	Add norbert synchronizer
PFERD/__init__.py
@@ -1,10 +1,12 @@
 from .ffm import *
 from .ilias import *
+from .norbert import *
 from .utils import *
 
 __all__ = (
 	ffm.__all__ +
 	ilias.__all__ +
+	norbert.__all__ +
 	utils.__all__ +
 	[]
 )
PFERD/norbert.py (new file, 124 lines)
@@ -0,0 +1,124 @@
+# Norberts Prog-Tuts
+
+import aiohttp
+import asyncio
+import bs4
+import logging
+import pathlib
+import re
+import zipfile
+
+from .organizer import Organizer
+from . import utils
+
+__all__ = [
+	"Norbert",
+]
+logger = logging.getLogger(__name__)
+
+class Norbert:
+	BASE_URL = "https://studwww.informatik.kit.edu/~s_blueml/"
+	LINK_RE = re.compile(r"^progtut/.*/(.*\.zip)$")
+
+	RETRY_ATTEMPTS = 5
+	RETRY_DELAY = 1 # seconds
+
+	def __init__(self, base_path):
+		self.base_path = base_path
+
+		self._session = aiohttp.ClientSession()
+
+	async def synchronize(self, to_dir, transform=lambda x: x, unzip=lambda _: True):
+		logger.info(f"    Synchronizing to {to_dir} using the Norbert synchronizer.")
+
+		sync_path = pathlib.Path(self.base_path, to_dir)
+		orga = Organizer(self.base_path, sync_path)
+
+		orga.clean_temp_dir()
+
+		files = await self._crawl()
+		await self._download(orga, files, transform, unzip)
+
+		orga.clean_sync_dir()
+		orga.clean_temp_dir()
+
+	async def close(self):
+		await self._session.close()
+
+	async def _crawl(self):
+		url = self.BASE_URL
+		async with self._session.get(url) as resp:
+			raw = await resp.read()
+			# replace undecodable characters with a placeholder
+			text = raw.decode("utf-8", "replace")
+		soup = bs4.BeautifulSoup(text, "html.parser")
+
+		files = []
+
+		for found in soup.find_all("a", href=self.LINK_RE):
+			url = found["href"]
+			full_url = self.BASE_URL + url
+
+			filename = re.search(self.LINK_RE, url).group(1)
+			path = pathlib.PurePath(filename)
+
+			logger.debug(f"Found zip file {filename} at {full_url}")
+
+			files.append((path, full_url))
+
+		return files
+
+	async def _download(self, orga, files, transform, unzip):
+		for path, url in files:
+			# Yes, we want the zip file contents
+			if unzip(path):
+				logger.debug(f"Downloading and unzipping {path}")
+				zip_path = utils.rename(path, path.stem)
+
+				# Download zip file
+				temp_file = orga.temp_file()
+				await self._download_zip(url, temp_file)
+
+				# Search the zip file for files to extract
+				temp_dir = orga.temp_dir()
+				with zipfile.ZipFile(temp_file, "r") as zf:
+					for info in zf.infolist():
+						# Only interested in the files themselves, the directory
+						# structure is created automatically by orga.add_file()
+						if info.is_dir():
+							continue
+
+						file_path = zip_path / pathlib.PurePath(info.filename)
+						logger.debug(f"Found {info.filename} at path {file_path}")
+
+						new_path = transform(file_path)
+						if new_path is not None:
+							# Extract to temp file and add, the usual deal
+							temp_file = orga.temp_file()
+							extracted_path = zf.extract(info, temp_dir)
+							extracted_path = pathlib.Path(extracted_path)
+							orga.add_file(extracted_path, new_path)
+
+			# No, we only want the zip file itself
+			else:
+				logger.debug(f"Only downloading {path}")
+
+				new_path = transform(path)
+				if new_path is not None:
+					temp_file = orga.temp_file()
+					await self._download_zip(url, temp_file)
+					orga.add_file(temp_file, new_path)
+
+	async def _download_zip(self, url, to_path):
+		for t in range(self.RETRY_ATTEMPTS):
+			try:
+				async with self._session.get(url) as resp:
+					await utils.stream_to_path(resp, to_path)
+			except aiohttp.client_exceptions.ServerDisconnectedError:
+				logger.debug(f"Try {t+1} out of {self.RETRY_ATTEMPTS} failed, retrying in {self.RETRY_DELAY} s")
+				await asyncio.sleep(self.RETRY_DELAY)
+			else:
+				return
+		else:
+			logger.error(f"Could not download {url}")
+			raise utils.OutOfTriesException(f"Try {self.RETRY_ATTEMPTS} out of {self.RETRY_ATTEMPTS} failed.")
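Note: the two hooks in synchronize's signature are the caller's only customization points. unzip is asked once per discovered zip whether its contents should be extracted, and transform maps each resulting PurePath to a target path, with None meaning "skip this file". A minimal sketch of custom hooks (the names and rules below are illustrative, not part of the commit):

import pathlib

def flatten_pdfs(path):
	# Keep PDFs (and zips left unextracted), flattened into one folder;
	# anything else is skipped by returning None.
	if path.suffix in (".pdf", ".zip"):
		return pathlib.PurePath("progtut", path.name)
	return None

def skip_solutions(path):
	# Extract everything except archives with "solution" in the name,
	# which are kept as plain zip downloads.
	return "solution" not in path.stem

# inside an async context:
#   await norbert.synchronize("Prog-Tut", transform=flatten_pdfs, unzip=skip_solutions)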
PFERD/organizer.py
@@ -34,6 +34,13 @@ class Organizer:
 		self._temp_dir.mkdir(exist_ok=True)
 		logger.debug(f"Cleaned temp dir: {self._temp_dir}")
 
+	def temp_dir(self):
+		nr = self._temp_nr
+		self._temp_nr += 1
+		temp_dir = pathlib.Path(self._temp_dir, f"{nr:08}").resolve()
+		logger.debug(f"Produced new temp dir: {temp_dir}")
+		return temp_dir
+
 	def temp_file(self):
 		# generate the path to a new temp file in base_path/.tmp/
 		# make sure no two paths are the same
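Note: temp_dir() shares the counter that temp_file() already uses, so every temp path handed out during a run is unique; Norbert._download relies on this to give each zip its own extraction target. Roughly, assuming the counter starts at zero and _temp_dir is <base>/.tmp:

orga = Organizer(base_path, sync_path)
orga.clean_temp_dir()
first = orga.temp_dir()   # <base>/.tmp/00000000 (path only; zipfile creates it on extract)
second = orga.temp_dir()  # <base>/.tmp/00000001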
@@ -50,6 +57,12 @@ class Organizer:
 		# check if sync_dir/to_path is inside sync_dir?
 		to_path = pathlib.Path(self._sync_dir, to_path)
 
+		if to_path.exists() and to_path.is_dir():
+			if self._prompt_yes_no(f"Overwrite folder {to_path} with file?", default=False):
+				shutil.rmtree(to_path)
+			else:
+				logger.warning(f"Could not add file {to_path}")
+				return
+
 		if to_path.exists():
 			if filecmp.cmp(from_path, to_path, shallow=False):
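Note: the new guard in add_file calls Organizer._prompt_yes_no, which is outside this diff. A minimal sketch of the semantics the guard assumes (y/n answers, with the default returned on empty input):

def _prompt_yes_no(self, question, default=None):
	# Sketch only -- the real helper lives elsewhere in Organizer.
	suffix = {True: " [Y/n] ", False: " [y/N] ", None: " [y/n] "}[default]
	while True:
		answer = input(question + suffix).strip().lower()
		if answer in ("y", "yes"):
			return True
		if answer in ("n", "no"):
			return False
		if answer == "" and default is not None:
			return default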
@@ -147,6 +147,7 @@ async def main(args):
 
 	ffm = PFERD.FfM(base_dir)
 	ilias = PFERD.ILIAS(base_dir, "cookie_jar")
+	norbert = PFERD.Norbert(base_dir)
 
 	if not args or "gbi" in args:
 		await ilias.synchronize("855240", "GBI", transform=gbi_transform, filter=gbi_filter)
@@ -156,9 +157,12 @@ async def main(args):
 		await ilias.synchronize("874938", "LA1", transform=la1_transform, filter=la1_filter)
 	if not args or "prog" in args:
 		await ilias.synchronize("851237", "Prog", transform=prog_transform, filter=prog_filter)
+	if not args or "norbert" in args:
+		await norbert.synchronize("Prog-Tut")
 
 	await ffm.close()
 	await ilias.close()
+	await norbert.close()
 
 if __name__ == "__main__":
 	args = sys.argv[1:]
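Note: with this wiring, passing norbert as a command-line argument runs only the new synchronizer. Equivalently, a minimal standalone driver (a sketch; the base path is an arbitrary example):

import asyncio
import pathlib

import PFERD

async def run():
	# Construct inside the coroutine so the aiohttp session binds to a running loop.
	norbert = PFERD.Norbert(pathlib.Path("sync"))
	try:
		await norbert.synchronize("Prog-Tut")
	finally:
		await norbert.close()

asyncio.run(run())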
Author: Joscha