mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
TODO
This commit is contained in:
parent
5adfdfbd2b
commit
bf27f4a686
@ -155,11 +155,11 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
sink.done()
|
sink.done()
|
||||||
|
|
||||||
async def _get_page(self) -> Tuple[BeautifulSoup, str]:
|
async def _get_page(self) -> Tuple[BeautifulSoup, str]:
|
||||||
async with self.session.get(self._url) as request:
|
response = self.session.get(self._url)
|
||||||
|
|
||||||
# The web page for Algorithmen für Routenplanung contains some
|
# The web page for Algorithmen für Routenplanung contains some
|
||||||
# weird comments that beautifulsoup doesn't parse correctly. This
|
# weird comments that beautifulsoup doesn't parse correctly. This
|
||||||
# hack enables those pages to be crawled, and should hopefully not
|
# hack enables those pages to be crawled, and should hopefully not
|
||||||
# cause issues on other pages.
|
# cause issues on other pages.
|
||||||
content = (await request.read()).decode("utf-8")
|
content = re.sub(r"<!--.*?-->", "", response.text)
|
||||||
content = re.sub(r"<!--.*?-->", "", content)
|
|
||||||
return soupify(content.encode("utf-8")), str(request.url)
|
return soupify(content.encode("utf-8")), str(request.url)
|
||||||
|
Loading…
Reference in New Issue
Block a user