Download page descriptions

This commit is contained in:
I-Al-Istannen
2022-05-11 21:16:09 +02:00
parent a5015fe9b1
commit 846c29aee1
4 changed files with 148 additions and 0 deletions

View File

@ -17,6 +17,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection
from .file_templates import Links
from .ilias_html_cleaner import clean, insert_base_markup
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
TargetType = Union[str, int]
@ -215,6 +216,8 @@ instance's greatest bottleneck.
cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
elements: List[IliasPageElement] = []
# A list as variable redefinitions are not propagated to outer scopes
description: List[BeautifulSoup] = []
@_iorepeat(3, "crawling url")
async def gather_elements() -> None:
@ -233,9 +236,15 @@ instance's greatest bottleneck.
page = IliasPage(soup, url, None)
elements.extend(page.get_child_elements())
if description_string := page.get_description():
description.append(description_string)
# Fill up our task list with the found elements
await gather_elements()
if description:
await self._download_description(PurePath("."), description[0])
elements.sort(key=lambda e: e.id())
tasks: List[Awaitable[None]] = []
@ -265,6 +274,8 @@ instance's greatest bottleneck.
cl: CrawlToken,
) -> None:
elements: List[IliasPageElement] = []
# A list as variable redefinitions are not propagated to outer scopes
description: List[BeautifulSoup] = []
@_iorepeat(3, "crawling folder")
async def gather_elements() -> None:
@ -285,10 +296,15 @@ instance's greatest bottleneck.
next_stage_url = None
elements.extend(page.get_child_elements())
if description_string := page.get_description():
description.append(description_string)
# Fill up our task list with the found elements
await gather_elements()
if description:
await self._download_description(PurePath("."), description[0])
elements.sort(key=lambda e: e.id())
tasks: List[Awaitable[None]] = []
@ -425,6 +441,19 @@ instance's greatest bottleneck.
return self._download_booking(element, link_template_maybe, maybe_dl)
@anoncritical
@_iorepeat(1, "downloading description")
async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None:
path = parent_path / "Description.html"
dl = await self.download(path, redownload=Redownload.ALWAYS)
if not dl:
return
async with dl as (bar, sink):
description = clean(insert_base_markup(description))
sink.file.write(description.prettify().encode("utf-8"))
sink.done()
@anoncritical
@_iorepeat(3, "resolving booking")
async def _download_booking(