Mirror of https://github.com/Garmelon/PFERD.git, synced 2023-12-21 10:23:01 +01:00

Download page descriptions

commit 846c29aee1
parent a5015fe9b1
@@ -22,6 +22,9 @@ ambiguous situations.
 
 ## Unreleased
 
+### Added
+- Download of page descriptions
+
 ### Changed
 - Add `cpp` extension to default `link_regex` of IPD crawler
 - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option
PFERD/crawl/ilias/ilias_html_cleaner.py (new file)
@@ -0,0 +1,91 @@
from bs4 import BeautifulSoup, Comment, Tag

_STYLE_TAG_CONTENT = """
    .ilc_text_block_Information {
        background-color: #f5f7fa;
    }

    div.ilc_text_block_Standard {
        margin-bottom: 10px;
        margin-top: 10px;
    }

    span.ilc_text_inline_Strong {
        font-weight: bold;
    }

    .accordion-head {
        background-color: #f5f7fa;
        padding: 0.5rem 0;
    }

    h3 {
        margin-top: 0.5rem;
        margin-bottom: 1rem;
    }

    br.visible-break {
        margin-bottom: 1rem;
    }

    article {
        margin: 0.5rem 0;
    }

    body {
        padding: 1em;
        grid-template-columns: 1fr min(60rem, 90%) 1fr;
        line-height: 1.2;
    }
"""

_ARTICLE_WORTHY_CLASSES = [
    "ilc_text_block_Information",
    "ilc_section_Attention",
    "ilc_section_Link",
]


def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
    head = soup.new_tag("head")
    soup.insert(0, head)

    simplecss_link: Tag = soup.new_tag("link")
    # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
    simplecss_link["rel"] = "stylesheet"
    simplecss_link["href"] = "https://cdn.simplecss.org/simple.css"
    head.append(simplecss_link)

    # Basic style tags for compat
    style: Tag = soup.new_tag("style")
    style.append(_STYLE_TAG_CONTENT)
    head.append(style)

    return soup


def clean(soup: BeautifulSoup) -> BeautifulSoup:
    # Promote noteworthy ILIAS blocks to semantic <article> elements
    for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES):
        block.name = "article"

    # Shift headings down: existing h3s become divs so h1s can become h3s
    for block in soup.find_all("h3"):
        block.name = "div"

    for block in soup.find_all("h1"):
        block.name = "h3"

    for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"):
        block.name = "h3"
        block["class"] += ["accordion-head"]

    # Drop paragraphs that are empty or contain only an HTML comment
    for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"):
        children = list(dummy.children)
        if not children:
            dummy.decompose()
            continue
        if len(children) > 1:
            continue
        if isinstance(children[0], Comment):
            dummy.decompose()

    for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
        hrule_imposter.insert(0, soup.new_tag("hr"))

    return soup
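Taken together, the two helpers turn an ILIAS description fragment into a standalone HTML document: insert_base_markup adds a <head> with simple.css and compatibility styles, and clean normalizes the ILIAS-specific markup. A rough usage sketch; the input fragment is invented for illustration:

from bs4 import BeautifulSoup

from PFERD.crawl.ilias.ilias_html_cleaner import clean, insert_base_markup

# Made-up fragment in the shape ILIAS produces: an info block, an
# accordion heading cap and an empty standard paragraph.
fragment = """
<div class="ilc_text_block_Information">Exam dates are online.</div>
<div class="ilc_va_ihcap_VAccordIHeadCap">Organisation</div>
<div class="ilc_text_block_Standard ilc_Paragraph"></div>
"""

soup = BeautifulSoup(fragment, "html.parser")
soup = clean(insert_base_markup(soup))

# The info block is now an <article>, the accordion cap an
# <h3 class="accordion-head">, and the empty paragraph is gone.
print(soup.prettify())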
@@ -85,6 +85,31 @@ class IliasPage:
         log.explain("Page is a normal folder, searching for elements")
         return self._find_normal_entries()
 
+    def get_description(self) -> Optional[BeautifulSoup]:
+        def is_interesting_class(name: str) -> bool:
+            return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
+
+        paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class)
+        if not paragraphs:
+            return None
+
+        # Extract bits and pieces into a string and parse it again.
+        # This ensures we don't miss anything and weird structures are resolved
+        # somewhat gracefully.
+        raw_html = ""
+        for p in paragraphs:
+            if p.find_parent(class_=is_interesting_class):
+                continue
+
+            # Ignore special listings (like folder groupings)
+            if "ilc_section_Special" in p["class"]:
+                continue
+
+            raw_html += str(p) + "\n"
+        raw_html = f"<body>\n{raw_html}\n</body>"
+
+        return BeautifulSoup(raw_html, "html.parser")
+
     def get_next_stage_element(self) -> Optional[IliasPageElement]:
         if self._is_ilias_opencast_embedding():
             return self.get_child_elements()[0]
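The extract-and-reparse approach in get_description only copies the outermost interesting blocks: the find_parent check skips anything already contained in another match, so nested paragraphs are not duplicated in the reassembled description. A minimal standalone sketch of the same filtering; the HTML fragment is invented:

from bs4 import BeautifulSoup

INTERESTING = ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]


def is_interesting_class(name: str) -> bool:
    return name in INTERESTING


# Hypothetical page: a section that already contains a paragraph, plus
# a special listing (e.g. a folder grouping) that should be skipped.
soup = BeautifulSoup(
    '<div class="ilCOPageSection"><div class="ilc_Paragraph">Welcome!</div></div>'
    '<div class="ilc_Paragraph ilc_section_Special">folder grouping</div>',
    "html.parser",
)

raw_html = ""
for p in soup.find_all(class_=is_interesting_class):
    if p.find_parent(class_=is_interesting_class):
        continue  # nested inside another interesting block, already copied
    if "ilc_section_Special" in p["class"]:
        continue  # special listings are not part of the description
    raw_html += str(p) + "\n"

description = BeautifulSoup(f"<body>\n{raw_html}\n</body>", "html.parser")
print(description.prettify())  # only the outer ilCOPageSection survives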
@@ -17,6 +17,7 @@ from ...utils import fmt_path, soupify, url_set_query_param
 from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
+from .ilias_html_cleaner import clean, insert_base_markup
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
 
 TargetType = Union[str, int]
@@ -215,6 +216,8 @@ instance's greatest bottleneck.
         cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
 
         elements: List[IliasPageElement] = []
+        # A list as variable redefinitions are not propagated to outer scopes
+        description: List[BeautifulSoup] = []
 
         @_iorepeat(3, "crawling url")
         async def gather_elements() -> None:
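The comment about variable redefinitions refers to Python's closure rules: rebinding a plain name inside the nested gather_elements coroutine would create a new local instead of updating the outer variable, while mutating a shared list is visible outside. A minimal standalone illustration; the names are invented:

def outer() -> None:
    description = None
    collected: list = []

    def inner() -> None:
        description = "value"      # rebinds a fresh local; outer name untouched
        collected.append("value")  # mutates the shared list; visible outside

    inner()
    print(description)  # None
    print(collected)    # ['value']

outer()

Declaring `nonlocal description` would be the alternative; the list keeps the nested function independent of the outer binding.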
@@ -233,9 +236,15 @@ instance's greatest bottleneck.
             page = IliasPage(soup, url, None)
             elements.extend(page.get_child_elements())
+
+            if description_string := page.get_description():
+                description.append(description_string)
 
         # Fill up our task list with the found elements
         await gather_elements()
 
+        if description:
+            await self._download_description(PurePath("."), description[0])
+
         elements.sort(key=lambda e: e.id())
 
         tasks: List[Awaitable[None]] = []
@@ -265,6 +274,8 @@ instance's greatest bottleneck.
         cl: CrawlToken,
     ) -> None:
         elements: List[IliasPageElement] = []
+        # A list as variable redefinitions are not propagated to outer scopes
+        description: List[BeautifulSoup] = []
 
         @_iorepeat(3, "crawling folder")
         async def gather_elements() -> None:
@@ -285,10 +296,15 @@ instance's greatest bottleneck.
                     next_stage_url = None
 
             elements.extend(page.get_child_elements())
+            if description_string := page.get_description():
+                description.append(description_string)
 
         # Fill up our task list with the found elements
         await gather_elements()
 
+        if description:
+            await self._download_description(PurePath("."), description[0])
+
         elements.sort(key=lambda e: e.id())
 
         tasks: List[Awaitable[None]] = []
@@ -425,6 +441,19 @@ instance's greatest bottleneck.
 
         return self._download_booking(element, link_template_maybe, maybe_dl)
 
+    @anoncritical
+    @_iorepeat(1, "downloading description")
+    async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None:
+        path = parent_path / "Description.html"
+        dl = await self.download(path, redownload=Redownload.ALWAYS)
+        if not dl:
+            return
+
+        async with dl as (bar, sink):
+            description = clean(insert_base_markup(description))
+            sink.file.write(description.prettify().encode("utf-8"))
+            sink.done()
+
     @anoncritical
     @_iorepeat(3, "resolving booking")
     async def _download_booking(
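The description is saved with redownload=Redownload.ALWAYS, presumably because a page description carries no timestamp or size the crawler could compare against, so the file is simply rewritten on every run. Stripped of PFERD's download-token and reporting machinery, the write path boils down to the following sketch; the write_description helper is hypothetical:

from pathlib import Path

from bs4 import BeautifulSoup

from PFERD.crawl.ilias.ilias_html_cleaner import clean, insert_base_markup


def write_description(parent_path: Path, description: BeautifulSoup) -> None:
    # Same steps as _download_description: add base markup, clean the
    # ILIAS-specific tags, then write prettified UTF-8 HTML to disk.
    description = clean(insert_base_markup(description))
    (parent_path / "Description.html").write_bytes(
        description.prettify().encode("utf-8")
    )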