mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Refactor IPD crawler a bit
This commit is contained in:
parent
6b2a657573
commit
88afe64a92
@ -35,7 +35,7 @@ def load(
|
|||||||
log.explain("Creating config for command 'kit-ipd'")
|
log.explain("Creating config for command 'kit-ipd'")
|
||||||
|
|
||||||
parser["crawl:kit-ipd"] = {}
|
parser["crawl:kit-ipd"] = {}
|
||||||
section = parser["crawl:ipd"]
|
section = parser["crawl:kit-ipd"]
|
||||||
load_crawler(args, section)
|
load_crawler(args, section)
|
||||||
|
|
||||||
section["type"] = "kit-ipd"
|
section["type"] = "kit-ipd"
|
||||||
|
@ -3,7 +3,7 @@ import re
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from re import Pattern
|
from re import Pattern
|
||||||
from typing import List, Set, Union, AnyStr, Optional
|
from typing import Awaitable, List, Optional, Set, Union
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
@ -27,12 +27,12 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
|
|||||||
|
|
||||||
return target
|
return target
|
||||||
|
|
||||||
def link_regex(self) -> Pattern[AnyStr]:
|
def link_regex(self) -> Pattern[str]:
|
||||||
regex = self.s.get("link_regex", "^.*/[^/]*\.(?:pdf|zip|c|java)$")
|
regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|java)$")
|
||||||
return re.compile(regex)
|
return re.compile(regex)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass(unsafe_hash=True)
|
||||||
class KitIpdFile:
|
class KitIpdFile:
|
||||||
name: str
|
name: str
|
||||||
url: str
|
url: str
|
||||||
@ -43,6 +43,14 @@ class KitIpdFolder:
|
|||||||
name: str
|
name: str
|
||||||
files: List[KitIpdFile]
|
files: List[KitIpdFile]
|
||||||
|
|
||||||
|
def explain(self) -> None:
|
||||||
|
log.explain_topic(f"Folder {self.name!r}")
|
||||||
|
for file in self.files:
|
||||||
|
log.explain(f"File {file.name!r}")
|
||||||
|
|
||||||
|
def __hash__(self) -> int:
|
||||||
|
return self.name.__hash__()
|
||||||
|
|
||||||
|
|
||||||
class KitIpdCrawler(HttpCrawler):
|
class KitIpdCrawler(HttpCrawler):
|
||||||
|
|
||||||
@ -61,13 +69,15 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
if not maybe_cl:
|
if not maybe_cl:
|
||||||
return
|
return
|
||||||
|
|
||||||
folders: List[KitIpdFolder] = []
|
tasks: List[Awaitable[None]] = []
|
||||||
|
|
||||||
async with maybe_cl:
|
async with maybe_cl:
|
||||||
folder_tags = await self._fetch_folder_tags()
|
for item in await self._fetch_items():
|
||||||
folders = [self._extract_folder(tag) for tag in folder_tags]
|
if isinstance(item, KitIpdFolder):
|
||||||
|
tasks.append(self._crawl_folder(item))
|
||||||
tasks = [self._crawl_folder(folder) for folder in folders]
|
else:
|
||||||
|
# Orphan files are placed in the root folder
|
||||||
|
tasks.append(self._download_file(PurePath("."), item))
|
||||||
|
|
||||||
await self.gather(tasks)
|
await self.gather(tasks)
|
||||||
|
|
||||||
@ -89,51 +99,42 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
async with maybe_dl as (bar, sink):
|
async with maybe_dl as (bar, sink):
|
||||||
await self._stream_from_url(file.url, sink, bar)
|
await self._stream_from_url(file.url, sink, bar)
|
||||||
|
|
||||||
async def _fetch_folder_tags(self) -> Set[Tag]:
|
async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
|
||||||
page = await self.get_page()
|
page = await self.get_page()
|
||||||
elements: List[Tag] = self._find_file_links(page)
|
elements: List[Tag] = self._find_file_links(page)
|
||||||
folder_tags: Set[Tag] = set()
|
items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
|
||||||
|
|
||||||
for element in elements:
|
for element in elements:
|
||||||
folder_label = self._fetch_folder_label(element)
|
folder_label = self._find_folder_label(element)
|
||||||
if folder_label is None:
|
if folder_label:
|
||||||
folder_tags.add(page)
|
folder = self._extract_folder(folder_label)
|
||||||
|
if folder not in items:
|
||||||
|
items.add(folder)
|
||||||
|
folder.explain()
|
||||||
else:
|
else:
|
||||||
folder_tags.add(folder_label)
|
file = self._extract_file(element)
|
||||||
|
items.add(file)
|
||||||
|
log.explain_topic(f"Orphan file {file.name!r}")
|
||||||
|
log.explain("Attributing it to root folder")
|
||||||
|
|
||||||
return folder_tags
|
return items
|
||||||
|
|
||||||
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
|
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
|
||||||
files: List[KitIpdFile] = []
|
files: List[KitIpdFile] = []
|
||||||
# if files have found outside a regular table
|
|
||||||
if not folder_tag.name.startswith("h"):
|
|
||||||
name = "."
|
|
||||||
root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag))
|
|
||||||
for link in root_links:
|
|
||||||
files.append(self._extract_file(link))
|
|
||||||
|
|
||||||
else:
|
|
||||||
name = folder_tag.getText().strip()
|
name = folder_tag.getText().strip()
|
||||||
|
|
||||||
container: Tag = folder_tag.findNextSibling(name="table")
|
container: Tag = folder_tag.findNextSibling(name="table")
|
||||||
for link in self._find_file_links(container):
|
for link in self._find_file_links(container):
|
||||||
files.append(self._extract_file(link))
|
files.append(self._extract_file(link))
|
||||||
|
|
||||||
log.explain_topic(f"Found folder {name!r}")
|
|
||||||
for file in files:
|
|
||||||
log.explain(f"Found file {file.name!r}")
|
|
||||||
|
|
||||||
return KitIpdFolder(name, files)
|
return KitIpdFolder(name, files)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _fetch_folder_label(file_link: Tag) -> Optional[Tag]:
|
def _find_folder_label(file_link: Tag) -> Optional[Tag]:
|
||||||
enclosing_table: Tag = file_link.findParent(name="table")
|
enclosing_table: Tag = file_link.findParent(name="table")
|
||||||
if enclosing_table is None:
|
if enclosing_table is None:
|
||||||
return None
|
return None
|
||||||
label: Tag = enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
|
return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
|
||||||
if label is None:
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
return label
|
|
||||||
|
|
||||||
def _extract_file(self, link: Tag) -> KitIpdFile:
|
def _extract_file(self, link: Tag) -> KitIpdFile:
|
||||||
url = self._abs_url_from_link(link)
|
url = self._abs_url_from_link(link)
|
||||||
|
Loading…
Reference in New Issue
Block a user