mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Fix IPD crawler for different subpages (#42)
This patch reworks the IPD crawler to support subpages whose links do not contain "/intern", and to take folder names from the headings that precede each table.
This commit is contained in:
parent
d6f38a61e1
commit
6b2a657573
@ -1,7 +1,9 @@
|
|||||||
import os
|
import os
|
||||||
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from typing import List, Set, Union
|
from re import Pattern
|
||||||
|
from typing import List, Set, Union, AnyStr, Optional
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
@ -25,6 +27,10 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
|
|||||||
|
|
||||||
return target
|
return target
|
||||||
|
|
||||||
|
def link_regex(self) -> Pattern[str]:
    """Return the compiled pattern used to recognise links to files.

    The pattern source is read from the ``link_regex`` option of this
    section. When the option is absent, a default is used that matches
    targets containing a ``/`` whose final segment ends in ``.pdf``,
    ``.zip``, ``.c`` or ``.java``.

    Raises:
        re.error: If the configured value is not a valid regular expression.
    """
    # Raw string: "\." inside a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about. The pattern text
    # itself is unchanged.
    regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|java)$")
    return re.compile(regex)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class KitIpdFile:
|
class KitIpdFile:
|
||||||
@ -48,6 +54,7 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
):
|
):
|
||||||
super().__init__(name, section, config)
|
super().__init__(name, section, config)
|
||||||
self._url = section.target()
|
self._url = section.target()
|
||||||
|
self._file_regex = section.link_regex()
|
||||||
|
|
||||||
async def _run(self) -> None:
|
async def _run(self) -> None:
|
||||||
maybe_cl = await self.crawl(PurePath("."))
|
maybe_cl = await self.crawl(PurePath("."))
|
||||||
@ -88,17 +95,26 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
folder_tags: Set[Tag] = set()
|
folder_tags: Set[Tag] = set()
|
||||||
|
|
||||||
for element in elements:
|
for element in elements:
|
||||||
enclosing_data: Tag = element.findParent(name="td")
|
folder_label = self._fetch_folder_label(element)
|
||||||
label: Tag = enclosing_data.findPreviousSibling(name="td")
|
if folder_label is None:
|
||||||
folder_tags.add(label)
|
folder_tags.add(page)
|
||||||
|
else:
|
||||||
|
folder_tags.add(folder_label)
|
||||||
|
|
||||||
return folder_tags
|
return folder_tags
|
||||||
|
|
||||||
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
|
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
|
||||||
name = folder_tag.getText().strip()
|
|
||||||
files: List[KitIpdFile] = []
|
files: List[KitIpdFile] = []
|
||||||
|
# if files have been found outside a regular table
|
||||||
|
if not folder_tag.name.startswith("h"):
|
||||||
|
name = "."
|
||||||
|
root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag))
|
||||||
|
for link in root_links:
|
||||||
|
files.append(self._extract_file(link))
|
||||||
|
|
||||||
container: Tag = folder_tag.findNextSibling(name="td")
|
else:
|
||||||
|
name = folder_tag.getText().strip()
|
||||||
|
container: Tag = folder_tag.findNextSibling(name="table")
|
||||||
for link in self._find_file_links(container):
|
for link in self._find_file_links(container):
|
||||||
files.append(self._extract_file(link))
|
files.append(self._extract_file(link))
|
||||||
|
|
||||||
@ -108,14 +124,24 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
|
|
||||||
return KitIpdFolder(name, files)
|
return KitIpdFolder(name, files)
|
||||||
|
|
||||||
|
@staticmethod
def _fetch_folder_label(file_link: Tag) -> Optional[Tag]:
    """Return the heading tag that labels the table containing *file_link*.

    Finds the ``<table>`` enclosing the link, then searches backwards among
    that table's siblings for an ``<h1>``-``<h6>`` element.

    Returns:
        The heading tag, or ``None`` when the link is not inside a table or
        no heading sibling precedes that table.
    """
    enclosing_table: Optional[Tag] = file_link.findParent(name="table")
    if enclosing_table is None:
        return None
    # The sibling lookup already yields None when nothing matches, so its
    # result can be returned directly.
    return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
|
||||||
|
|
||||||
def _extract_file(self, link: Tag) -> KitIpdFile:
|
def _extract_file(self, link: Tag) -> KitIpdFile:
|
||||||
name = link.getText().strip()
|
|
||||||
url = self._abs_url_from_link(link)
|
url = self._abs_url_from_link(link)
|
||||||
_, extension = os.path.splitext(url)
|
name = os.path.basename(url)
|
||||||
return KitIpdFile(name + extension, url)
|
return KitIpdFile(name, url)
|
||||||
|
|
||||||
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
||||||
return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x})
|
return tag.findAll(name="a", attrs={"href": self._file_regex})
|
||||||
|
|
||||||
def _abs_url_from_link(self, link_tag: Tag) -> str:
|
def _abs_url_from_link(self, link_tag: Tag) -> str:
|
||||||
return urljoin(self._url, link_tag.get("href"))
|
return urljoin(self._url, link_tag.get("href"))
|
||||||
|
Loading…
Reference in New Issue
Block a user