From 8198c9ecaac158e4ca0e8894839a7d7ba7404a17 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 30 May 2020 15:53:31 +0200 Subject: [PATCH] Reorder methods a bit --- PFERD/ilias/__init__.py | 3 +- PFERD/ilias/crawler.py | 69 +++++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/PFERD/ilias/__init__.py b/PFERD/ilias/__init__.py index e9a4a96..0a5f08b 100644 --- a/PFERD/ilias/__init__.py +++ b/PFERD/ilias/__init__.py @@ -3,7 +3,8 @@ Synchronizing files from ILIAS instances (https://www.ilias.de/). """ from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator -from .crawler import IliasCrawler, IliasDirectoryFilter, IliasElementType +from .crawler import (IliasCrawler, IliasCrawlerEntry, IliasDirectoryFilter, + IliasElementType) from .downloader import (IliasDownloader, IliasDownloadInfo, IliasDownloadStrategy, download_everything, download_modified_or_new) diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index 934665e..eb7e812 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -28,7 +28,7 @@ PRETTY = PrettyLogger(LOGGER) class IliasElementType(Enum): """ - The type of an ilias directory. + The type of an ilias element. """ REGULAR_FOLDER = "REGULAR_FOLDER" VIDEO_FOLDER = "VIDEO_FOLDER" @@ -43,6 +43,7 @@ IliasDirectoryFilter = Callable[[Path, IliasElementType], bool] class IliasCrawlerEntry: + # pylint: disable=too-few-public-methods """ An ILIAS crawler entry used internally to find, catalogue and recursively crawl elements. """ @@ -97,12 +98,6 @@ class IliasCrawler: self._authenticator = authenticator self.dir_filter = dir_filter - def _abs_url_from_link(self, link_tag: bs4.Tag) -> str: - """ - Create an absolute url from an <a> tag.
- """ - return urljoin(self._base_url, link_tag.get("href")) - @staticmethod def _url_set_query_param(url: str, param: str, value: str) -> str: """ @@ -138,7 +133,7 @@ class IliasCrawler: # And treat it as a folder entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), root_url) - return self._entries_to_download_infos(entries) + return self._iterate_entries_to_download_infos(entries) def _is_course_id_valid(self, root_url: str, course_id: str) -> bool: response: requests.Response = self._session.get(root_url) @@ -154,9 +149,9 @@ class IliasCrawler: entries: List[IliasCrawlerEntry] = self._crawl_folder( Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI" ) - return self._entries_to_download_infos(entries) + return self._iterate_entries_to_download_infos(entries) - def _entries_to_download_infos( + def _iterate_entries_to_download_infos( self, entries: List[IliasCrawlerEntry] ) -> List[IliasDownloadInfo]: @@ -201,6 +196,36 @@ class IliasCrawler: return result + def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]: + """ + Crawl all files in a folder-like element. + """ + soup = self._get_page(url, {}) + + result: List[IliasCrawlerEntry] = [] + + # Fetch all links and throw them to the general interpreter + links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle") + for link in links: + abs_url = self._abs_url_from_link(link) + element_path = Path(folder_path, link.getText().strip()) + element_type = self._find_type_from_link(element_path, link, abs_url) + + if element_type == IliasElementType.REGULAR_FILE: + result += self._crawl_file(folder_path, link, abs_url) + elif element_type is not None: + result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)] + else: + PRETTY.warning(f"Found element without a type at {str(element_path)!r}") + + return result + + def _abs_url_from_link(self, link_tag: bs4.Tag) -> str: + """ + Create an absolute url from an <a> tag.
+ """ + return urljoin(self._base_url, link_tag.get("href")) + @staticmethod def _find_type_from_link( path: Path, @@ -515,30 +540,6 @@ class IliasCrawler: return results - def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]: - """ - Crawl all files in a folder-like element. - """ - soup = self._get_page(url, {}) - - result: List[IliasCrawlerEntry] = [] - - # Fetch all links and throw them to the general interpreter - links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle") - for link in links: - abs_url = self._abs_url_from_link(link) - element_path = Path(folder_path, link.getText().strip()) - element_type = self._find_type_from_link(element_path, link, abs_url) - - if element_type == IliasElementType.REGULAR_FILE: - result += self._crawl_file(folder_path, link, abs_url) - elif element_type is not None: - result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)] - else: - PRETTY.warning(f"Found element without a type at {str(element_path)!r}") - - return result - def _get_page(self, url: str, params: Dict[str, Any]) -> bs4.BeautifulSoup: """ Fetches a page from ILIAS, authenticating when needed.