Detect unexpected root page redirects and abort operation
Commit: d204dac8ce
Parent: 443f7fe839

In short: the ILIAS crawler now detects when a request is unexpectedly redirected to the ILIAS root page and aborts with a descriptive error instead of silently crawling the wrong page.

@@ -26,6 +26,7 @@ ambiguous situations.
 - Crawling of courses with the timeline view as the default tab
 - Crawling of file and custom opencast cards
 - Crawling of button cards without descriptions
+- Abort crawling when encountering an unexpected ilias root page redirect
 
 ### Added
 - `no-delete-prompt-override` conflict resolution strategy

@@ -79,6 +79,16 @@ class IliasPage:
         self._page_type = source_element.type if source_element else None
         self._source_name = source_element.name if source_element else ""
 
+    @staticmethod
+    def is_root_page(soup: BeautifulSoup) -> bool:
+        permalink = soup.find(id="current_perma_link")
+        if permalink is None:
+            return False
+        value = permalink.attrs.get("value")
+        if value is None:
+            return False
+        return "goto.php?target=root_" in value
+
     def get_child_elements(self) -> List[IliasPageElement]:
         """
         Return all child page elements you can find here.
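
For context: ILIAS pages carry a hidden `current_perma_link` input whose value is a `goto.php` permalink, and on the root page the permalink target starts with `root_`. A minimal sketch of how the new check behaves, using made-up HTML snippets (the import path assumes PFERD 3's module layout):

    from bs4 import BeautifulSoup

    from PFERD.crawl.ilias.kit_ilias_html import IliasPage  # assumed module path

    # Illustrative permalink inputs; real values depend on the ILIAS instance.
    root_html = '<input id="current_perma_link" value="https://ilias.example.com/goto.php?target=root_1">'
    crs_html = '<input id="current_perma_link" value="https://ilias.example.com/goto.php?target=crs_12345">'

    assert IliasPage.is_root_page(BeautifulSoup(root_html, "html.parser"))
    assert not IliasPage.is_root_page(BeautifulSoup(crs_html, "html.parser"))
    assert not IliasPage.is_root_page(BeautifulSoup("<div></div>", "html.parser"))  # no permalink at all
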
@@ -239,7 +239,7 @@ instance's greatest bottleneck.
 
         # Duplicated code, but the root page is special - we want to avoid fetching it twice!
         while next_stage_url:
-            soup = await self._get_page(next_stage_url)
+            soup = await self._get_page(next_stage_url, root_page_allowed=True)
 
             if current_parent is None and expected_id is not None:
                 perma_link_element: Tag = soup.find(id="current_perma_link")

@@ -739,12 +739,12 @@ instance's greatest bottleneck.
             sink.file.write(content.encode("utf-8"))
             sink.done()
 
-    async def _get_page(self, url: str) -> BeautifulSoup:
+    async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
         auth_id = await self._current_auth_id()
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
-                return soup
+                return self._verify_page(soup, url, root_page_allowed)
 
         # We weren't authenticated, so try to do that
         await self.authenticate(auth_id)
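
Since `root_page_allowed` defaults to `False`, every existing call site keeps the strict behavior, and only the initial crawl (the hunk further above) opts in. Hypothetical call sites to illustrate the two modes:

    # Sketch of call sites inside the crawler; `self` is the crawler instance
    # and `element_url` is a hypothetical name.
    soup = await self._get_page(element_url)  # default: a root page redirect raises CrawlError
    soup = await self._get_page(next_stage_url, root_page_allowed=True)  # initial crawl: root page expected
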
@@ -753,9 +753,21 @@ instance's greatest bottleneck.
         async with self.session.get(url) as request:
             soup = soupify(await request.read())
             if self._is_logged_in(soup):
-                return soup
+                return self._verify_page(soup, url, root_page_allowed)
         raise CrawlError("get_page failed even after authenticating")
 
+    def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
+        if IliasPage.is_root_page(soup) and not root_page_allowed:
+            raise CrawlError(
+                "Unexpectedly encountered ILIAS root page. "
+                "This usually happens because the ILIAS instance is broken. "
+                "If so, wait a day or two and try again. "
+                "It could also happen because a crawled element links to the ILIAS root page. "
+                "If so, use a transform with a ! as target to ignore the particular element. "
+                f"The redirect came from {url}"
+            )
+        return soup
+
     async def _post_authenticated(
         self,
         url: str,
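
The error message points at transforms with `!` as the target, which make PFERD ignore the matched element. As a sketch, such a rule in the crawler's config section might look like this (section name, target, and path are placeholders; see PFERD's CONFIG.md for the authoritative rule syntax):

    [crawl:my-course]
    type = kit-ilias-web
    target = 1234567
    transform =
      My Course/Link To ILIAS Start Page --> !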