Mirror of https://github.com/Garmelon/PFERD.git, synced 2025-10-31 04:42:42 +01:00
	Detect unexpected root page redirects and abort operation
@@ -26,6 +26,7 @@ ambiguous situations.
 - Crawling of courses with the timeline view as the default tab
 - Crawling of file and custom opencast cards
 - Crawling of button cards without descriptions
+- Abort crawling when encountering an unexpected ilias root page redirect
 
 ### Added
 - `no-delete-prompt-override` conflict resolution strategy
@@ -79,6 +79,16 @@ class IliasPage:
         self._page_type = source_element.type if source_element else None
         self._source_name = source_element.name if source_element else ""
 
+    @staticmethod
+    def is_root_page(soup: BeautifulSoup) -> bool:
+        permalink = soup.find(id="current_perma_link")
+        if permalink is None:
+            return False
+        value = permalink.attrs.get("value")
+        if value is None:
+            return False
+        return "goto.php?target=root_" in value
+
     def get_child_elements(self) -> List[IliasPageElement]:
         """
         Return all child page elements you can find here.
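For reference, the new check keys off the hidden current_perma_link element that ILIAS embeds in its pages: on the root page its value contains goto.php?target=root_. A minimal standalone sketch of the same logic (the HTML snippets below are illustrative assumptions, not captured from a real ILIAS instance):

from bs4 import BeautifulSoup

# Hypothetical markup mimicking the permalink field on a root page and a course page.
ROOT_HTML = '<input id="current_perma_link" value="https://ilias.example/goto.php?target=root_1">'
COURSE_HTML = '<input id="current_perma_link" value="https://ilias.example/goto.php?target=crs_42">'

def is_root_page(soup: BeautifulSoup) -> bool:
    # Mirrors IliasPage.is_root_page: a missing element or a missing
    # value attribute means this is not the root page.
    permalink = soup.find(id="current_perma_link")
    if permalink is None:
        return False
    value = permalink.attrs.get("value")
    if value is None:
        return False
    return "goto.php?target=root_" in value

print(is_root_page(BeautifulSoup(ROOT_HTML, "html.parser")))    # True
print(is_root_page(BeautifulSoup(COURSE_HTML, "html.parser")))  # False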
@@ -239,7 +239,7 @@ instance's greatest bottleneck.
 
                 # Duplicated code, but the root page is special - we want to avoid fetching it twice!
                 while next_stage_url:
-                    soup = await self._get_page(next_stage_url)
+                    soup = await self._get_page(next_stage_url, root_page_allowed=True)
 
                     if current_parent is None and expected_id is not None:
                         perma_link_element: Tag = soup.find(id="current_perma_link")
@@ -739,12 +739,12 @@ instance's greatest bottleneck.
             sink.file.write(content.encode("utf-8"))
             sink.done()
 
-    async def _get_page(self, url: str) -> BeautifulSoup:
+    async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
         auth_id = await self._current_auth_id()
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
-                return soup
+                return self._verify_page(soup, url, root_page_allowed)
 
         # We weren't authenticated, so try to do that
         await self.authenticate(auth_id)
@@ -753,9 +753,21 @@ instance's greatest bottleneck.
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
-                return soup
+                return self._verify_page(soup, url, root_page_allowed)
         raise CrawlError("get_page failed even after authenticating")
 
+    def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
+        if IliasPage.is_root_page(soup) and not root_page_allowed:
+            raise CrawlError(
+                "Unexpectedly encountered ILIAS root page. "
+                "This usually happens because the ILIAS instance is broken. "
+                "If so, wait a day or two and try again. "
+                "It could also happen because a crawled element links to the ILIAS root page. "
+                "If so, use a transform with a ! as target to ignore the particular element. "
+                f"The redirect came from {url}"
+            )
+        return soup
+
     async def _post_authenticated(
         self,
         url: str,
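The error message above points users to PFERD's transform rules as an escape hatch when a legitimately crawled element links back to the root page. As a rough sketch (section name, target id, and path are hypothetical, and the syntax follows PFERD's config format as best I know it), such an element could be ignored with a ! target:

[crawl:my-ilias-course]
type = kit-ilias-web
target = 1234567
transform =
  Course Name/Link to ILIAS Start Page --> !

A ! on the right-hand side of a rule tells PFERD to drop whatever the left-hand side matched, so the offending link is never followed.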
I-Al-Istannen