Fix IPD crawler for different subpages (#42)

This patch reworks the IPD crawler to support subpages that do not use
"/intern" in their links and to fetch folder names from table headings.
Julius Rüberg 2021-11-01 10:09:50 +01:00 committed by Joscha
parent d6f38a61e1
commit 6b2a657573
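
The core of the change below is that file links are now matched against a configurable
regular expression (the new link_regex option) instead of requiring "/intern" in the href.
A minimal sketch of what the default pattern accepts, using the regex from the diff below
and made-up link targets:

import re

# Default pattern taken from KitIpdCrawlerSection.link_regex() in the diff below.
link_regex = re.compile(r"^.*/[^/]*\.(?:pdf|zip|c|java)$")

# Hypothetical hrefs, purely for illustration:
print(bool(link_regex.match("intern/folien/uebung01.pdf")))  # True  (matched before and after this patch)
print(bool(link_regex.match("material/blatt02.zip")))        # True  (no "/intern", only matched after this patch)
print(bool(link_regex.match("termine.html")))                # False (no slash, and extension not in the default list)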


@@ -1,7 +1,9 @@
 import os
+import re
 from dataclasses import dataclass
 from pathlib import PurePath
-from typing import List, Set, Union
+from re import Pattern
+from typing import List, Set, Union, AnyStr, Optional
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Tag
@@ -25,6 +27,10 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
         return target
 
+    def link_regex(self) -> Pattern[AnyStr]:
+        regex = self.s.get("link_regex", "^.*/[^/]*\.(?:pdf|zip|c|java)$")
+        return re.compile(regex)
+
 
 @dataclass
 class KitIpdFile:
@@ -48,6 +54,7 @@ class KitIpdCrawler(HttpCrawler):
     ):
         super().__init__(name, section, config)
         self._url = section.target()
+        self._file_regex = section.link_regex()
 
     async def _run(self) -> None:
         maybe_cl = await self.crawl(PurePath("."))
@@ -88,17 +95,26 @@ class KitIpdCrawler(HttpCrawler):
         folder_tags: Set[Tag] = set()
         for element in elements:
-            enclosing_data: Tag = element.findParent(name="td")
-            label: Tag = enclosing_data.findPreviousSibling(name="td")
-            folder_tags.add(label)
+            folder_label = self._fetch_folder_label(element)
+            if folder_label is None:
+                folder_tags.add(page)
+            else:
+                folder_tags.add(folder_label)
 
         return folder_tags
 
     def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
-        name = folder_tag.getText().strip()
         files: List[KitIpdFile] = []
 
-        container: Tag = folder_tag.findNextSibling(name="td")
-        for link in self._find_file_links(container):
-            files.append(self._extract_file(link))
+        # if files have found outside a regular table
+        if not folder_tag.name.startswith("h"):
+            name = "."
+            root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag))
+            for link in root_links:
+                files.append(self._extract_file(link))
+        else:
+            name = folder_tag.getText().strip()
+            container: Tag = folder_tag.findNextSibling(name="table")
+            for link in self._find_file_links(container):
+                files.append(self._extract_file(link))
@@ -108,14 +124,24 @@ class KitIpdCrawler(HttpCrawler):
         return KitIpdFolder(name, files)
 
+    @staticmethod
+    def _fetch_folder_label(file_link: Tag) -> Optional[Tag]:
+        enclosing_table: Tag = file_link.findParent(name="table")
+        if enclosing_table is None:
+            return None
+        label: Tag = enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
+        if label is None:
+            return None
+        else:
+            return label
+
     def _extract_file(self, link: Tag) -> KitIpdFile:
-        name = link.getText().strip()
         url = self._abs_url_from_link(link)
-        _, extension = os.path.splitext(url)
-        return KitIpdFile(name + extension, url)
+        name = os.path.basename(url)
+        return KitIpdFile(name, url)
 
     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
-        return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x})
+        return tag.findAll(name="a", attrs={"href": self._file_regex})
 
     def _abs_url_from_link(self, link_tag: Tag) -> str:
         return urljoin(self._url, link_tag.get("href"))
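
For reference, the new folder-name lookup walks from a file link up to its enclosing <table>
and from there to the nearest preceding heading; links outside any table fall back to the root
folder ".". A self-contained sketch of that traversal (the HTML is invented, only the
BeautifulSoup calls mirror _fetch_folder_label and _extract_folder above):

import re
from bs4 import BeautifulSoup

# Invented markup mimicking an IPD subpage: a heading followed by a table of links,
# plus one link that sits outside any table.
html = """
<h2>Übungsblätter</h2>
<table><tr><td><a href="material/blatt01.pdf">Blatt 1</a></td></tr></table>
<p><a href="material/skript.pdf">Skript</a></p>
"""
page = BeautifulSoup(html, "html.parser")

def folder_label(link):
    # Same traversal as _fetch_folder_label: enclosing table, then preceding h1-h6.
    table = link.findParent(name="table")
    if table is None:
        return None
    return table.findPreviousSibling(name=re.compile("^h[1-6]$"))

for link in page.findAll("a"):
    label = folder_label(link)
    # Links without a labelled table land in the root folder ".", as in _extract_folder.
    print(link["href"], "->", label.getText().strip() if label else ".")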