Mirror of https://github.com/Garmelon/PFERD.git, synced 2023-12-21 10:23:01 +01:00
Adjust to new crawl / download names

commit 953a1bba93 (parent e724ff7c93)
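In short: the abstract entry point is renamed from crawl() to _run(), freeing the crawl() name for a new token-returning helper, and the download paths drop the separate download_bar() wrapper in favour of "async with dl as (bar, sink):", which yields the progress bar and the file sink together. The subclass-level error_free cleanup block is removed along the way.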
@@ -116,7 +116,7 @@ class KitIliasWebCrawler(HttpCrawler):
         self._link_file_redirect_delay = section.link_file_redirect_delay()
         self._link_file_use_plaintext = section.link_file_use_plaintext()
 
-    async def crawl(self) -> None:
+    async def _run(self) -> None:
         if isinstance(self._target, int):
             log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
             await self._crawl_course(self._target)
@@ -127,11 +127,6 @@ class KitIliasWebCrawler(HttpCrawler):
             log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}")
             await self._crawl_url(self._target)
 
-        if self.error_free:
-            await self.cleanup()
-        else:
-            log.explain_topic("Skipping file cleanup as errors occurred earlier")
-
     async def _crawl_course(self, course_id: int) -> None:
         # Start crawling at the given course
         root_url = url_set_query_param(
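Where did the cleanup go? It is not shown in this diff, but the rename of crawl() to _run() suggests a template-method split: a public run() in the Crawler base class presumably wraps _run() and performs exactly the logic deleted here. A speculative sketch of that wrapper, under that assumption:

    # Speculative sketch (assumption): the removed cleanup logic, relocated
    # to a base-class run() that wraps the abstract _run().
    async def run(self) -> None:
        await self._run()

        if self.error_free:
            await self.cleanup()
        else:
            log.explain_topic("Skipping file cleanup as errors occurred earlier")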
@@ -144,10 +139,14 @@ class KitIliasWebCrawler(HttpCrawler):
         await self._crawl_url(self._base_url)
 
     async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None:
+        cl = await self.crawl(PurePath("."))
+        if not cl:
+            return
+
         tasks = []
 
         # TODO: Retry this when the crawl and download bar are reworked
-        async with self.crawl_bar(PurePath("Root element")):
+        async with cl:
             soup = await self._get_page(url)
 
         if expected_id is not None:
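The new guard pattern above reads: ask the framework for permission to crawl, and treat a falsy token as "skip". A minimal self-contained sketch of what such a token could look like; the CrawlToken name and its API are inferred from usage in this diff, not taken from PFERD's actual source:

import asyncio
from pathlib import PurePath
from typing import Optional


class CrawlToken:
    # Stand-in: entering the token would start a crawl progress bar,
    # leaving it would close the bar even if crawling raised.
    def __init__(self, path: PurePath) -> None:
        self.path = path

    async def __aenter__(self) -> "CrawlToken":
        print(f"Crawling {self.path}")
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        print(f"Done with {self.path}")


async def crawl(path: PurePath) -> Optional[CrawlToken]:
    # The real crawl() would consult config and transformer rules and
    # return None for paths that should be skipped.
    return CrawlToken(path)


async def main() -> None:
    cl = await crawl(PurePath("."))
    if not cl:
        return
    async with cl:
        pass  # fetch and parse the page here


asyncio.run(main())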
@@ -165,14 +164,12 @@ class KitIliasWebCrawler(HttpCrawler):
         await asyncio.gather(*tasks)
 
     async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
-        # We might not want to crawl this directory-ish page.
-        # This is not in #handle_element, as the download methods check it themselves and therefore
-        # would perform this check twice - messing with the explain output
-        if not self.should_crawl(path):
+        cl = await self.crawl(path)
+        if not cl:
             return
 
         tasks = []
-        async with self.crawl_bar(path):
+        async with cl:
             soup = await self._get_page(url)
             page = IliasPage(soup, url, parent)
 
@@ -189,7 +186,9 @@ class KitIliasWebCrawler(HttpCrawler):
         if element.type == IliasElementType.FILE:
             await self._download_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
-            log.explain_topic(f"Skipping forum at {escape(str(element_path))}")
+            log.explain_topic(f"Decision: Crawl {escape(str(element_path))}")
+            log.explain("Is a forum")
+            log.explain("Answer: No")
         elif element.type == IliasElementType.LINK:
             await self._download_link(element, element_path)
         elif element.type == IliasElementType.VIDEO:
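Note the shape of the replacement logging: instead of a one-line skip message, the forum branch now emits a structured decision trace (a "Decision: Crawl ..." topic, the reason, then "Answer: No"), matching the log.explain_topic / log.explain split used elsewhere in the crawler.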
@@ -208,20 +207,19 @@ class KitIliasWebCrawler(HttpCrawler):
         if not dl:
             return
 
-        async with self.download_bar(element_path):
+        async with dl as (bar, sink):
             export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
             async with self.session.get(export_url) as response:
                 html_page: BeautifulSoup = soupify(await response.read())
                 real_url: str = html_page.select_one("a").get("href").strip()
 
-            async with dl as sink:
-                content = link_template_plain if self._link_file_use_plaintext else link_template_rich
-                content = content.replace("{{link}}", real_url)
-                content = content.replace("{{name}}", element.name)
-                content = content.replace("{{description}}", str(element.description))
-                content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
-                sink.file.write(content.encode("utf-8"))
-                sink.done()
+            content = link_template_plain if self._link_file_use_plaintext else link_template_rich
+            content = content.replace("{{link}}", real_url)
+            content = content.replace("{{name}}", element.name)
+            content = content.replace("{{description}}", str(element.description))
+            content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
+            sink.file.write(content.encode("utf-8"))
+            sink.done()
 
     async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None:
         # Videos will NOT be redownloaded - their content doesn't really change and they are chunky
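The {{placeholder}} substitution above is plain string replacement against a bundled template. A toy, runnable illustration; this trimmed template is made up for the example, while the real link_template_rich ships with PFERD:

# Toy stand-in for PFERD's bundled template (assumption: the real one is
# a fuller HTML page with the same four placeholders).
link_template_rich = """<!DOCTYPE html>
<html>
<head>
    <meta http-equiv="refresh" content="{{redirect_delay}}; url={{link}}">
    <title>{{name}}</title>
</head>
<body>{{description}}</body>
</html>
"""

content = link_template_rich
content = content.replace("{{link}}", "https://example.com/target")
content = content.replace("{{name}}", "Example link")
content = content.replace("{{description}}", "An ILIAS link element")
content = content.replace("{{redirect_delay}}", "5")
print(content)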
@@ -229,19 +227,18 @@ class KitIliasWebCrawler(HttpCrawler):
         if not dl:
             return
 
-        async with self.download_bar(element_path) as bar:
+        async with dl as (bar, sink):
             page = IliasPage(await self._get_page(element.url), element.url, element)
             real_element = page.get_child_elements()[0]
 
-            async with dl as sink:
-                await self._stream_from_url(real_element.url, sink, bar)
+            await self._stream_from_url(real_element.url, sink, bar)
 
     async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
         if not dl:
             return
 
-        async with self.download_bar(element_path) as bar, dl as sink:
+        async with dl as (bar, sink):
             await self._stream_from_url(element.url, sink, bar)
 
     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
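The body of _stream_from_url is cut off by the diff, but its signature (url, sink, bar) suggests a chunked HTTP download that feeds the sink while advancing the bar. A hypothetical sketch using aiohttp, reusing the FileSink and ProgressBar stand-ins sketched earlier; set_total() is an assumed ProgressBar method:

    # Hypothetical reconstruction (assumption), not the code from this commit.
    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
        async with self.session.get(url) as response:
            if response.content_length is not None:
                bar.set_total(response.content_length)  # assumed API
            async for chunk in response.content.iter_chunked(1024 * 1024):
                sink.file.write(chunk)
                bar.advance(len(chunk))
        sink.done()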