Adjust to new crawl / download names

I-Al-Istannen 2021-05-22 23:18:05 +02:00
parent e724ff7c93
commit 953a1bba93


@@ -116,7 +116,7 @@ class KitIliasWebCrawler(HttpCrawler):
         self._link_file_redirect_delay = section.link_file_redirect_delay()
         self._link_file_use_plaintext = section.link_file_use_plaintext()
 
-    async def crawl(self) -> None:
+    async def _run(self) -> None:
         if isinstance(self._target, int):
             log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
             await self._crawl_course(self._target)
@@ -127,11 +127,6 @@ class KitIliasWebCrawler(HttpCrawler):
             log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}")
             await self._crawl_url(self._target)
 
-        if self.error_free:
-            await self.cleanup()
-        else:
-            log.explain_topic("Skipping file cleanup as errors occurred earlier")
-
     async def _crawl_course(self, course_id: int) -> None:
         # Start crawling at the given course
         root_url = url_set_query_param(
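
Taken together, the two hunks above move the public entry point into the base class: the subclass now only implements `_run`, and the error_free/cleanup bookkeeping it used to do presumably lives in a shared wrapper. A minimal sketch of that assumed split (only `_run`, `error_free` and `cleanup` appear in this diff; `run` is a guess):

    # Sketch of the assumed template-method split, not the actual base class.
    class Crawler:
        error_free: bool = True

        async def run(self) -> None:
            # Public entry point: run the subclass hook, then the cleanup
            # guard that this commit removes from KitIliasWebCrawler.
            await self._run()
            if self.error_free:
                await self.cleanup()

        async def _run(self) -> None:
            # Overridden by subclasses such as KitIliasWebCrawler.
            raise NotImplementedError

        async def cleanup(self) -> None:
            ...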
@@ -144,10 +139,14 @@ class KitIliasWebCrawler(HttpCrawler):
         await self._crawl_url(self._base_url)
 
     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
+        cl = await self.crawl(PurePath("."))
+        if not cl:
+            return
+
         tasks = []
 
         # TODO: Retry this when the crawl and download bar are reworked
-        async with self.crawl_bar(PurePath("Root element")):
+        async with cl:
             soup = await self._get_page(url)
 
             if expected_id is not None:
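
The `cl = await self.crawl(...)` / `async with cl:` pattern introduced here implies that `crawl` now returns an optional async context manager owning the progress bar for a path, with None meaning "excluded, skip quietly". A rough reconstruction of that assumed shape (all names below are guesses, not taken from this diff):

    from pathlib import PurePath
    from typing import Optional

    class CrawlToken:
        # Stands in for the old crawl_bar context manager.
        def __init__(self, path: PurePath) -> None:
            self._path = path

        async def __aenter__(self) -> "CrawlToken":
            # would start the crawl progress bar for self._path
            return self

        async def __aexit__(self, exc_type, exc, tb) -> None:
            # would stop the bar and record any error for error_free tracking
            pass

    async def crawl(path: PurePath) -> Optional[CrawlToken]:
        # Returns None when config/transform rules exclude the path,
        # which is why every caller checks `if not cl: return`.
        return CrawlToken(path)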
@@ -165,14 +164,12 @@ class KitIliasWebCrawler(HttpCrawler):
         await asyncio.gather(*tasks)
 
     async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
-        # We might not want to crawl this directory-ish page.
-        # This is not in #handle_element, as the download methods check it themselves and therefore
-        # would perform this check twice - messing with the explain output
-        if not self.should_crawl(path):
+        cl = await self.crawl(path)
+        if not cl:
             return
 
         tasks = []
-        async with self.crawl_bar(path):
+        async with cl:
             soup = await self._get_page(url)
             page = IliasPage(soup, url, parent)
 
@@ -189,7 +186,9 @@ class KitIliasWebCrawler(HttpCrawler):
         if element.type == IliasElementType.FILE:
             await self._download_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
-            log.explain_topic(f"Skipping forum at {escape(str(element_path))}")
+            log.explain_topic(f"Decision: Crawl {escape(str(element_path))}")
+            log.explain("Is a forum")
+            log.explain("Answer: No")
         elif element.type == IliasElementType.LINK:
             await self._download_link(element, element_path)
         elif element.type == IliasElementType.VIDEO:
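
With explain logging enabled, the new branch should render the skip decision roughly like this (the exact layout of explain output is an assumption; `<element_path>` is a placeholder, not a real path):

    Decision: Crawl <element_path>
      Is a forum
      Answer: No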
@@ -208,13 +207,12 @@ class KitIliasWebCrawler(HttpCrawler):
         if not dl:
             return
 
-        async with self.download_bar(element_path):
+        async with dl as (bar, sink):
             export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
             async with self.session.get(export_url) as response:
                 html_page: BeautifulSoup = soupify(await response.read())
                 real_url: str = html_page.select_one("a").get("href").strip()
 
-        async with dl as sink:
             content = link_template_plain if self._link_file_use_plaintext else link_template_rich
             content = content.replace("{{link}}", real_url)
             content = content.replace("{{name}}", element.name)
@@ -229,11 +227,10 @@ class KitIliasWebCrawler(HttpCrawler):
         if not dl:
             return
 
-        async with self.download_bar(element_path) as bar:
+        async with dl as (bar, sink):
             page = IliasPage(await self._get_page(element.url), element.url, element)
             real_element = page.get_child_elements()[0]
 
-        async with dl as sink:
             await self._stream_from_url(real_element.url, sink, bar)
 
     async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
@@ -241,7 +238,7 @@ class KitIliasWebCrawler(HttpCrawler):
         if not dl:
             return
 
-        async with self.download_bar(element_path) as bar, dl as sink:
+        async with dl as (bar, sink):
             await self._stream_from_url(element.url, sink, bar)
 
     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
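
For reference, a `_stream_from_url` built on aiohttp and this bar/sink pair typically looks like the sketch below (an illustration under assumed ProgressBar/FileSink methods, not the commit's actual body):

    # Method of the crawler, shown standalone; annotations quoted so the
    # sketch stands alone without the real FileSink/ProgressBar imports.
    async def _stream_from_url(self, url: str, sink: "FileSink", bar: "ProgressBar") -> None:
        async with self.session.get(url, allow_redirects=True) as response:
            # Content length may be absent; set the bar total only if known.
            if response.content_length is not None:
                bar.set_total(response.content_length)

            # Stream in chunks so large files never sit fully in memory.
            async for chunk in response.content.iter_chunked(1024 * 1024):
                sink.file.write(chunk)
                bar.advance(len(chunk))

            # Mark the download as successfully completed.
            sink.done()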