From bc3fa36637b5a4f4ea26db1a9437e4cbd5cad5c4 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Thu, 5 May 2022 14:20:45 +0200
Subject: [PATCH] Fix IPD crawler crashing on weird HTML comments

---
 CHANGELOG.md                   | 3 +++
 PFERD/crawl/kit_ipd_crawler.py | 8 +++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f5af29d..de7b795 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,9 @@ ambiguous situations.
 ### Changed
 - Add `cpp` extension to default `link_regex` of IPD crawler
 
+### Fixed
+- IPD crawler crashes on some sites
+
 ## 3.4.0 - 2022-05-01
 
 ### Added
diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
index e5ec58f..58e71f8 100644
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -161,4 +161,10 @@ class KitIpdCrawler(HttpCrawler):
 
     async def get_page(self) -> BeautifulSoup:
         async with self.session.get(self._url) as request:
-            return soupify(await request.read())
+            # The web page for Algorithmen für Routenplanung contains some
+            # weird comments that beautifulsoup doesn't parse correctly. This
+            # hack enables those pages to be crawled, and should hopefully not
+            # cause issues on other pages.
+            content = (await request.read()).decode("utf-8")
+            content = re.sub(r"<!--.*?-->", "", content)
+            return soupify(content.encode("utf-8"))