Fix IPD crawler crashing on weird HTML comments

This commit is contained in:
Joscha 2022-05-05 14:20:45 +02:00
parent afbd03f777
commit bc3fa36637
2 changed files with 10 additions and 1 deletions

View File

@@ -25,6 +25,9 @@ ambiguous situations.
### Changed ### Changed
- Add `cpp` extension to default `link_regex` of IPD crawler - Add `cpp` extension to default `link_regex` of IPD crawler
### Fixed
- IPD crawler crashes on some sites
## 3.4.0 - 2022-05-01 ## 3.4.0 - 2022-05-01
### Added ### Added

View File

@@ -161,4 +161,10 @@ class KitIpdCrawler(HttpCrawler):
async def get_page(self) -> BeautifulSoup: async def get_page(self) -> BeautifulSoup:
async with self.session.get(self._url) as request: async with self.session.get(self._url) as request:
return soupify(await request.read()) # The web page for Algorithmen für Routenplanung contains some
# weird comments that beautifulsoup doesn't parse correctly. This
# hack enables those pages to be crawled, and should hopefully not
# cause issues on other pages.
content = (await request.read()).decode("utf-8")
content = re.sub(r"<!--.*?-->", "", content)
return soupify(content.encode("utf-8"))