mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Download page descriptions
This commit is contained in:
91
PFERD/crawl/ilias/ilias_html_cleaner.py
Normal file
91
PFERD/crawl/ilias/ilias_html_cleaner.py
Normal file
@ -0,0 +1,91 @@
|
||||
from bs4 import BeautifulSoup, Comment, Tag
|
||||
|
||||
_STYLE_TAG_CONTENT = """
|
||||
.ilc_text_block_Information {
|
||||
background-color: #f5f7fa;
|
||||
}
|
||||
div.ilc_text_block_Standard {
|
||||
margin-bottom: 10px;
|
||||
margin-top: 10px;
|
||||
}
|
||||
span.ilc_text_inline_Strong {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.accordion-head {
|
||||
background-color: #f5f7fa;
|
||||
padding: 0.5rem 0;
|
||||
}
|
||||
|
||||
h3 {
|
||||
margin-top: 0.5rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
br.visible-break {
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
article {
|
||||
margin: 0.5rem 0;
|
||||
}
|
||||
|
||||
body {
|
||||
padding: 1em;
|
||||
grid-template-columns: 1fr min(60rem, 90%) 1fr;
|
||||
line-height: 1.2;
|
||||
}
|
||||
"""
|
||||
|
||||
_ARTICLE_WORTHY_CLASSES = [
|
||||
"ilc_text_block_Information",
|
||||
"ilc_section_Attention",
|
||||
"ilc_section_Link",
|
||||
]
|
||||
|
||||
|
||||
def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
|
||||
head = soup.new_tag("head")
|
||||
soup.insert(0, head)
|
||||
|
||||
simplecss_link: Tag = soup.new_tag("link")
|
||||
# <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
|
||||
simplecss_link["rel"] = "stylesheet"
|
||||
simplecss_link["href"] = "https://cdn.simplecss.org/simple.css"
|
||||
head.append(simplecss_link)
|
||||
|
||||
# Basic style tags for compat
|
||||
style: Tag = soup.new_tag("style")
|
||||
style.append(_STYLE_TAG_CONTENT)
|
||||
head.append(style)
|
||||
|
||||
return soup
|
||||
|
||||
|
||||
def clean(soup: BeautifulSoup) -> BeautifulSoup:
|
||||
for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES):
|
||||
block.name = "article"
|
||||
|
||||
for block in soup.find_all("h3"):
|
||||
block.name = "div"
|
||||
|
||||
for block in soup.find_all("h1"):
|
||||
block.name = "h3"
|
||||
|
||||
for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"):
|
||||
block.name = "h3"
|
||||
block["class"] += ["accordion-head"]
|
||||
|
||||
for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"):
|
||||
children = list(dummy.children)
|
||||
if not children:
|
||||
dummy.decompose()
|
||||
if len(children) > 1:
|
||||
continue
|
||||
if type(children[0]) == Comment:
|
||||
dummy.decompose()
|
||||
|
||||
for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
|
||||
hrule_imposter.insert(0, soup.new_tag("hr"))
|
||||
|
||||
return soup
|
Reference in New Issue
Block a user