From 946b7a7931c8dc5c70edbc86e45d5d8e96b638a4 Mon Sep 17 00:00:00 2001
From: I-Al-Istannen
Date: Tue, 9 Feb 2021 12:30:59 +0100
Subject: [PATCH] Also crawl .c/.java/.zip from IPD page

---
 PFERD/ipd.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/PFERD/ipd.py b/PFERD/ipd.py
index d602e0e..ece6a97 100644
--- a/PFERD/ipd.py
+++ b/PFERD/ipd.py
@@ -82,7 +82,10 @@ class IpdCrawler:
 
         items: List[IpdDownloadInfo] = []
 
-        for link in page.findAll(name="a", attrs={"href": lambda x: x and x.endswith("pdf")}):
+        def is_relevant_url(x: str) -> bool:
+            return x.endswith(".pdf") or x.endswith(".c") or x.endswith(".java") or x.endswith(".zip")
+
+        for link in page.findAll(name="a", attrs={"href": lambda x: x and is_relevant_url(x)}):
             href: str = link.attrs.get("href")
             name = href.split("/")[-1]
 