This commit is contained in:
Joscha 2022-11-24 15:47:01 +01:00
parent 5adfdfbd2b
commit bf27f4a686

View File

@@ -155,11 +155,11 @@ class KitIpdCrawler(HttpCrawler):
sink.done() sink.done()
async def _get_page(self) -> Tuple[BeautifulSoup, str]: async def _get_page(self) -> Tuple[BeautifulSoup, str]:
async with self.session.get(self._url) as request: response = self.session.get(self._url)
# The web page for Algorithmen für Routenplanung contains some
# weird comments that beautifulsoup doesn't parse correctly. This # The web page for Algorithmen für Routenplanung contains some
# hack enables those pages to be crawled, and should hopefully not # weird comments that beautifulsoup doesn't parse correctly. This
# cause issues on other pages. # hack enables those pages to be crawled, and should hopefully not
content = (await request.read()).decode("utf-8") # cause issues on other pages.
content = re.sub(r"<!--.*?-->", "", content) content = re.sub(r"<!--.*?-->", "", response.text)
return soupify(content.encode("utf-8")), str(request.url) return soupify(content.encode("utf-8")), str(request.url)