Refactor IPD crawler a bit

Authored by I-Al-Istannen on 2021-11-01 10:43:13 +01:00; committed by Joscha
parent 6b2a657573
commit 88afe64a92
2 changed files with 39 additions and 38 deletions


@@ -35,7 +35,7 @@ def load(
     log.explain("Creating config for command 'kit-ipd'")
     parser["crawl:kit-ipd"] = {}
-    section = parser["crawl:ipd"]
+    section = parser["crawl:kit-ipd"]
     load_crawler(args, section)
     section["type"] = "kit-ipd"


@@ -3,7 +3,7 @@ import re
 from dataclasses import dataclass
 from pathlib import PurePath
 from re import Pattern
-from typing import List, Set, Union, AnyStr, Optional
+from typing import Awaitable, List, Optional, Set, Union
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Tag
@@ -27,12 +27,12 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
         return target
 
-    def link_regex(self) -> Pattern[AnyStr]:
-        regex = self.s.get("link_regex", "^.*/[^/]*\.(?:pdf|zip|c|java)$")
+    def link_regex(self) -> Pattern[str]:
+        regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|java)$")
         return re.compile(regex)
 
 
-@dataclass
+@dataclass(unsafe_hash=True)
 class KitIpdFile:
     name: str
     url: str
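
KitIpdFile needs unsafe_hash=True because instances now land in a Set (see _fetch_items below): a plain @dataclass generates __eq__, which implicitly sets __hash__ to None. The same hunk also makes the default link regex a raw string, so \. is no longer an invalid escape sequence, and narrows the return type from Pattern[AnyStr] to Pattern[str]. A standalone sketch of the hashing difference (class names here are illustrative):

from dataclasses import dataclass

@dataclass
class Plain:
    name: str

@dataclass(unsafe_hash=True)
class Hashable:
    name: str

try:
    {Plain("a")}  # __eq__ was generated, so __hash__ is None
except TypeError as err:
    print(err)  # unhashable type: 'Plain'

print(len({Hashable("a"), Hashable("a")}))  # 1: hash and eq derive from the fields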
@@ -43,6 +43,14 @@ class KitIpdFolder:
     name: str
     files: List[KitIpdFile]
 
+    def explain(self) -> None:
+        log.explain_topic(f"Folder {self.name!r}")
+        for file in self.files:
+            log.explain(f"File {file.name!r}")
+
+    def __hash__(self) -> int:
+        return self.name.__hash__()
+
 
 class KitIpdCrawler(HttpCrawler):
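
KitIpdFolder takes the other route to hashability: an explicit __hash__ in the class body, which the dataclass machinery leaves untouched, so folders hash by name alone and duplicates collapse in the crawler's item set. A sketch of that behavior, assuming KitIpdFolder is a plain @dataclass like KitIpdFile was:

from dataclasses import dataclass
from typing import List

@dataclass
class Folder:
    name: str
    files: List[str]

    def __hash__(self) -> int:
        # Hash by name only; the generated __eq__ still compares all fields
        return self.name.__hash__()

print(len({Folder("a", []), Folder("a", [])}))  # 1: equal name and files -> deduplicated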
@@ -61,13 +69,15 @@ class KitIpdCrawler(HttpCrawler):
         if not maybe_cl:
             return
 
-        folders: List[KitIpdFolder] = []
+        tasks: List[Awaitable[None]] = []
 
         async with maybe_cl:
-            folder_tags = await self._fetch_folder_tags()
-            folders = [self._extract_folder(tag) for tag in folder_tags]
-
-        tasks = [self._crawl_folder(folder) for folder in folders]
+            for item in await self._fetch_items():
+                if isinstance(item, KitIpdFolder):
+                    tasks.append(self._crawl_folder(item))
+                else:
+                    # Orphan files are placed in the root folder
+                    tasks.append(self._download_file(PurePath("."), item))
 
         await self.gather(tasks)
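
The crawl entry point now fans out over a single task list: folders go to _crawl_folder, orphan files straight to _download_file under the root path. A minimal sketch of that pattern with plain asyncio, assuming self.gather behaves like asyncio.gather (handler names are illustrative):

import asyncio
from typing import Awaitable, List

async def crawl_folder(name: str) -> None:
    print(f"crawling folder {name}")

async def download_file(name: str) -> None:
    print(f"downloading orphan file {name}")

async def main() -> None:
    items = ["Lectures", "notes.pdf", "Exercises"]
    tasks: List[Awaitable[None]] = []
    for item in items:
        # Both branches append awaitables; nothing runs until gather
        if item.endswith(".pdf"):
            tasks.append(download_file(item))
        else:
            tasks.append(crawl_folder(item))
    await asyncio.gather(*tasks)

asyncio.run(main())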
@@ -89,51 +99,42 @@ class KitIpdCrawler(HttpCrawler):
         async with maybe_dl as (bar, sink):
             await self._stream_from_url(file.url, sink, bar)
 
-    async def _fetch_folder_tags(self) -> Set[Tag]:
+    async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
         page = await self.get_page()
         elements: List[Tag] = self._find_file_links(page)
-        folder_tags: Set[Tag] = set()
+        items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
 
         for element in elements:
-            folder_label = self._fetch_folder_label(element)
-            if folder_label is None:
-                folder_tags.add(page)
+            folder_label = self._find_folder_label(element)
+            if folder_label:
+                folder = self._extract_folder(folder_label)
+                if folder not in items:
+                    items.add(folder)
+                    folder.explain()
             else:
-                folder_tags.add(folder_label)
+                file = self._extract_file(element)
+                items.add(file)
+                log.explain_topic(f"Orphan file {file.name!r}")
+                log.explain("Attributing it to root folder")
 
-        return folder_tags
+        return items
 
     def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
         files: List[KitIpdFile] = []
-        # if files have found outside a regular table
-        if not folder_tag.name.startswith("h"):
-            name = "."
-            root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag))
-            for link in root_links:
-                files.append(self._extract_file(link))
-        else:
-            name = folder_tag.getText().strip()
-            container: Tag = folder_tag.findNextSibling(name="table")
-            for link in self._find_file_links(container):
-                files.append(self._extract_file(link))
-
-        log.explain_topic(f"Found folder {name!r}")
-        for file in files:
-            log.explain(f"Found file {file.name!r}")
+        name = folder_tag.getText().strip()
+        container: Tag = folder_tag.findNextSibling(name="table")
+        for link in self._find_file_links(container):
+            files.append(self._extract_file(link))
 
         return KitIpdFolder(name, files)
 
     @staticmethod
-    def _fetch_folder_label(file_link: Tag) -> Optional[Tag]:
+    def _find_folder_label(file_link: Tag) -> Optional[Tag]:
         enclosing_table: Tag = file_link.findParent(name="table")
         if enclosing_table is None:
             return None
-        label: Tag = enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
-        if label is None:
-            return None
-        else:
-            return label
+        return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
 
     def _extract_file(self, link: Tag) -> KitIpdFile:
         url = self._abs_url_from_link(link)
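
_find_folder_label now reduces to a single chained lookup: climb from a file link to its enclosing <table>, then step back to the nearest preceding heading; links outside any table get None and become orphan files. A sketch of those BeautifulSoup lookups against a hypothetical page snippet (find_parent/find_previous_sibling are the snake_case aliases of the calls above):

import re
from bs4 import BeautifulSoup

html = '''
<h2>Exercises</h2>
<table><tr><td><a href="sheet01.pdf">sheet01.pdf</a></td></tr></table>
<a href="notes.pdf">notes.pdf</a>
'''
soup = BeautifulSoup(html, "html.parser")

for link in soup.find_all("a"):
    table = link.find_parent("table")
    # No enclosing table -> no folder label -> orphan file in the root folder
    label = table.find_previous_sibling(re.compile("^h[1-6]$")) if table else None
    print(link.getText(), "->", label.getText() if label else ".")

# Output:
# sheet01.pdf -> Exercises
# notes.pdf -> .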