# Source: pferd/PFERD/crawl/kit_ipd_crawler.py (166 lines, 5.4 KiB, Python)
import os
import re
2021-10-21 12:01:41 +02:00
from dataclasses import dataclass
from pathlib import PurePath
2022-11-24 07:53:52 +01:00
from typing import List, Optional, Pattern, Set, Tuple, Union
2021-10-21 12:01:41 +02:00
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag
from ..config import Config
from ..logging import ProgressBar, log
from ..output_dir import FileSink
from ..utils import soupify
from .crawler import CrawlError
from .http_crawler import HttpCrawler, HttpCrawlerSection
class KitIpdCrawlerSection(HttpCrawlerSection):
    """Accessors for the config options used by the KIT IPD crawler."""

    def target(self) -> str:
        """Return the configured ``target`` option.

        Reports a missing value when ``target`` is unset or empty, and an
        invalid value when it does not start with ``https://``.
        """
        target = self.s.get("target")
        if not target:
            # NOTE(review): missing_value/invalid_value come from a base class
            # that is not visible here; they presumably raise, since the code
            # below uses ``target`` without a further None check. Confirm.
            self.missing_value("target")

        if not target.startswith("https://"):
            self.invalid_value("target", target, "Should be a URL")

        return target

    def link_regex(self) -> Pattern[str]:
        """Return the compiled ``link_regex`` option.

        When the option is unset, a default pattern is used that matches
        hrefs ending in ``.pdf``, ``.zip``, ``.c``, ``.cpp`` or ``.java``.
        """
        regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$")
        return re.compile(regex)
@dataclass(unsafe_hash=True)
class KitIpdFile:
    """A single downloadable file: its name and the URL it is fetched from.

    ``unsafe_hash=True`` makes instances hashable (by ``name`` and ``url``)
    so they can be stored in a set, even though the fields stay mutable.
    Do not mutate an instance while it is a member of a set or dict key.
    """

    # Name the file is stored under.
    name: str
    # URL the file is downloaded from.
    url: str
@dataclass
class KitIpdFolder:
    """A named group of files that are downloaded into a common directory.

    Equality (generated by ``@dataclass``) compares both ``name`` and
    ``files``; the hash only uses ``name``. That is consistent (equal folders
    share a name and therefore a hash) and keeps the folder hashable even
    though ``files`` is a list.
    """

    # Folder name; also used as the hash key.
    name: str
    # Files contained in this folder.
    files: List[KitIpdFile]

    def explain(self) -> None:
        """Log the folder name and every contained file via the explain log."""
        log.explain_topic(f"Folder {self.name!r}")
        for file in self.files:
            log.explain(f"File {file.name!r} (href={file.url!r})")

    def __hash__(self) -> int:
        """Hash by folder name only (``files`` is an unhashable list)."""
        return hash(self.name)
class KitIpdCrawler(HttpCrawler):
    """Crawler that downloads linked files from a single web page.

    The page at the configured target URL is fetched once. Every ``<a>`` whose
    ``href`` matches the configured link regex is treated as a file. Links
    that sit inside a ``<table>`` preceded by a heading (``h1``-``h6``) sibling
    are grouped into a folder named after that heading; all other links are
    downloaded into the root folder.
    """

    def __init__(
        self,
        name: str,
        section: KitIpdCrawlerSection,
        config: Config,
    ):
        """Store the target URL and link regex read from ``section``."""
        super().__init__(name, section, config)
        self._url = section.target()
        self._file_regex = section.link_regex()

    async def _run(self) -> None:
        """Crawl the root: fetch all items and process them one after another."""
        cl = await self.crawl(PurePath("."))
        if not cl:
            return

        async with cl:
            for item in await self._fetch_items():
                if isinstance(item, KitIpdFolder):
                    await self._crawl_folder(item)
                else:
                    # Orphan files are placed in the root folder
                    await self._download_file(PurePath("."), item)

    async def _crawl_folder(self, folder: KitIpdFolder) -> None:
        """Download every file of ``folder`` into a directory named after it."""
        path = PurePath(folder.name)
        # NOTE(review): unlike in _run, the value returned by crawl() is only
        # checked for truthiness here and never entered with ``async with``.
        # Confirm that this is intended.
        if not await self.crawl(path):
            return

        for file in folder.files:
            await self._download_file(path, file)

    async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
        """Download ``file`` to ``parent / file.name`` unless download() declines."""
        element_path = parent / file.name
        dl = await self.download(element_path)
        if not dl:
            return

        async with dl as (bar, sink):
            await self._stream_from_url(file.url, sink, bar)

    async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
        """Fetch the page and return the set of folders and orphan files on it.

        A folder is added (and explained) only once, even though every link
        inside its table leads to the same folder being extracted again.
        """
        page, url = await self._get_page()
        elements: List[Tag] = self._find_file_links(page)
        items: Set[Union[KitIpdFile, KitIpdFolder]] = set()

        for element in elements:
            folder_label = self._find_folder_label(element)
            if folder_label:
                folder = self._extract_folder(folder_label, url)
                if folder not in items:
                    items.add(folder)
                    folder.explain()
            else:
                file = self._extract_file(element, url)
                items.add(file)
                log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                log.explain("Attributing it to root folder")

        return items

    def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
        """Build a folder from a heading tag and the next sibling ``<table>``.

        The folder name is the stripped heading text; its files are all
        matching links inside that table, resolved against ``url``.
        """
        name = folder_tag.getText().strip()
        container: Tag = folder_tag.findNextSibling(name="table")
        files: List[KitIpdFile] = [
            self._extract_file(link, url) for link in self._find_file_links(container)
        ]
        return KitIpdFolder(name, files)

    @staticmethod
    def _find_folder_label(file_link: Tag) -> Optional[Tag]:
        """Return the heading that labels the table enclosing ``file_link``.

        Returns ``None`` when the link is not inside a ``<table>`` or when that
        table has no preceding ``h1``-``h6`` sibling.
        """
        enclosing_table: Tag = file_link.findParent(name="table")
        if enclosing_table is None:
            return None
        return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))

    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
        """Create a file entry from a link; the name is the URL's basename."""
        url = self._abs_url_from_link(url, link)
        name = os.path.basename(url)
        return KitIpdFile(name, url)

    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
        """Return all ``<a>`` tags below ``tag`` whose href matches the link regex."""
        return tag.findAll(name="a", attrs={"href": self._file_regex})

    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
        """Resolve the link's ``href`` against the page URL ``url``."""
        return urljoin(url, link_tag.get("href"))

    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
        """Stream the response body of ``url`` into ``sink``, updating ``bar``.

        Redirects are not followed. Raises ``CrawlError`` on a 403 response.
        """
        async with self.session.get(url, allow_redirects=False) as resp:
            if resp.status == 403:
                raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
            if resp.content_length:
                bar.set_total(resp.content_length)

            async for data in resp.content.iter_chunked(1024):
                sink.file.write(data)
                bar.advance(len(data))

            sink.done()

    async def _get_page(self) -> Tuple[BeautifulSoup, str]:
        """Fetch the target page and return its parsed soup and final URL.

        The request must be entered with ``async with`` and its body awaited;
        the URL is taken from the response so that relative links are resolved
        against the address that was actually served.
        """
        async with self.session.get(self._url) as response:
            # The web page for Algorithmen für Routenplanung contains some
            # weird comments that beautifulsoup doesn't parse correctly. This
            # hack enables those pages to be crawled, and should hopefully not
            # cause issues on other pages.
            content = re.sub(r"<!--.*?-->", "", await response.text())
            return soupify(content.encode("utf-8")), str(response.url)