Mirror of https://github.com/Garmelon/PFERD.git (synced 2023-12-21 10:23:01 +01:00)
Add some repeat annotations to the ILIAS crawler

parent 9ec0d3e16a
commit 1c226c31aa
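The diff below decorates several crawler methods with @arepeat(3) (and @anoncritical). The decorators themselves are not part of this commit, so the following is only a minimal sketch of what an arepeat-style retry decorator for async methods could look like; the name's behaviour, retry policy, and exception handling here are assumptions, not PFERD's actual implementation.

# Hypothetical sketch of an `arepeat`-style decorator (assumed behaviour:
# re-invoke the coroutine up to `attempts` times, then re-raise the last error).
import functools
from typing import Any, Awaitable, Callable, Optional, TypeVar

T = TypeVar("T")


def arepeat(attempts: int) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
    def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
        @functools.wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> T:
            last_exc: Optional[BaseException] = None
            for _ in range(attempts):
                try:
                    return await func(*args, **kwargs)
                except Exception as exc:  # the real decorator may be more selective
                    last_exc = exc
            assert last_exc is not None
            raise last_exc

        return wrapper

    return decorator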
@@ -550,6 +550,7 @@ class IliasCrawler(HttpCrawler):
     async def _crawl_desktop(self) -> None:
         await self._crawl_url(self._base_url)

+    @arepeat(3)
     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
         tasks = []
@@ -567,8 +568,11 @@ class IliasCrawler(HttpCrawler):
             page = IliasPage(soup, url, None)
             for child in page.get_child_elements():
                 tasks.append(self._handle_ilias_element(PurePath("."), child))

         await asyncio.gather(*tasks)

+    @arepeat(3)
+    @anoncritical
     async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
         tasks = []
         async with self.crawl_bar(path):
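The @anoncritical decorator stacked with @arepeat(3) suggests that a failure while handling a single page should be reported rather than abort the whole crawl. Its definition is also outside this diff; a rough sketch under that assumption (reporting via print() only for illustration):

# Hypothetical sketch of an `anoncritical`-style decorator: catch and report
# exceptions so one failing element does not take down the surrounding
# asyncio.gather(). PFERD's real decorator likely reports through the
# crawler's own error handling instead.
import functools
from typing import Any, Awaitable, Callable


def anoncritical(func: Callable[..., Awaitable[None]]) -> Callable[..., Awaitable[None]]:
    @functools.wraps(func)
    async def wrapper(*args: Any, **kwargs: Any) -> None:
        try:
            await func(*args, **kwargs)
        except Exception as exc:
            print(f"Error in {func.__name__}: {exc}")

    return wrapper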
@@ -580,6 +584,7 @@ class IliasCrawler(HttpCrawler):

         await asyncio.gather(*tasks)

+    @anoncritical
     async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
         element_path = PurePath(parent_path, element.name)
@@ -601,6 +606,7 @@ class IliasCrawler(HttpCrawler):
             # TODO: Proper exception
             raise RuntimeError(f"Unknown type: {element.type!r}")

+    @arepeat(3)
     async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None:
         # Videos will NOT be redownloaded - their content doesn't really change and they are chunky
         dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER)
@@ -621,6 +627,7 @@ class IliasCrawler(HttpCrawler):

             sink.done()

+    @arepeat(3)
     async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
         if not dl:
@@ -638,19 +645,18 @@ class IliasCrawler(HttpCrawler):
             sink.done()

     async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup:
+        # This function will retry itself a few times if it is not logged in - it won't handle
+        # connection errors
         if retries_left < 0:
             # TODO: Proper exception
             raise RuntimeError("Get page failed too often")
         print(url)
-        try:
-            async with self.session.get(url) as request:
-                soup = soupify(await request.read())
-                if self._is_logged_in(soup):
-                    return soup
-
-            await self._shibboleth_login.login(self.session)
-        except Exception:
-            return await self._get_page(url, retries_left - 1)
+        async with self.session.get(url) as request:
+            soup = soupify(await request.read())
+            if self._is_logged_in(soup):
+                return soup
+
+        await self._shibboleth_login.login(self.session)

         return await self._get_page(url, retries_left - 1)
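The reworked _get_page keeps only the bounded "retry until logged in" logic: it re-authenticates via Shibboleth and calls itself with retries_left - 1 until a logged-in page comes back or the counter runs out, while connection errors are now left to the @arepeat(3) decorators on the callers. A small, self-contained sketch of that recursion pattern follows; fetch(), login(), and the "login" check are illustrative stand-ins, not PFERD's API:

# Standalone illustration of bounded recursion with a retry counter, in the
# spirit of _get_page. All helpers here are fakes for demonstration only.
import asyncio


async def fetch(url: str) -> str:
    return "login page"  # pretend the session has expired every time


async def login() -> None:
    print("re-authenticating ...")


async def get_page(url: str, retries_left: int = 3) -> str:
    if retries_left < 0:
        raise RuntimeError("Get page failed too often")
    page = await fetch(url)
    if "login" not in page:  # stand-in for a real _is_logged_in() check
        return page
    await login()
    return await get_page(url, retries_left - 1)


if __name__ == "__main__":
    try:
        asyncio.run(get_page("https://example.com/ilias"))
    except RuntimeError as exc:
        print(exc)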