mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Use url after redirect for relative links
This commit is contained in:
parent
635caa765d
commit
c0d6d8b229
@ -28,6 +28,9 @@ ambiguous situations.
|
|||||||
### Changed
|
### Changed
|
||||||
- Clear up error message shown when multiple paths are found to an element
|
- Clear up error message shown when multiple paths are found to an element
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- IPD crawler unnecessarily appending trailing slashes
|
||||||
|
|
||||||
## 3.4.2 - 2022-10-26
|
## 3.4.2 - 2022-10-26
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
@ -2,7 +2,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from typing import Awaitable, List, Optional, Pattern, Set, Union
|
from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
@ -24,9 +24,6 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
|
|||||||
if not target.startswith("https://"):
|
if not target.startswith("https://"):
|
||||||
self.invalid_value("target", target, "Should be a URL")
|
self.invalid_value("target", target, "Should be a URL")
|
||||||
|
|
||||||
if not target.endswith("/"):
|
|
||||||
target = target + "/"
|
|
||||||
|
|
||||||
return target
|
return target
|
||||||
|
|
||||||
def link_regex(self) -> Pattern[str]:
|
def link_regex(self) -> Pattern[str]:
|
||||||
@ -102,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
await self._stream_from_url(file.url, sink, bar)
|
await self._stream_from_url(file.url, sink, bar)
|
||||||
|
|
||||||
async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
|
async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
|
||||||
page = await self.get_page()
|
page, url = await self.get_page()
|
||||||
elements: List[Tag] = self._find_file_links(page)
|
elements: List[Tag] = self._find_file_links(page)
|
||||||
items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
|
items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
|
||||||
|
|
||||||
for element in elements:
|
for element in elements:
|
||||||
folder_label = self._find_folder_label(element)
|
folder_label = self._find_folder_label(element)
|
||||||
if folder_label:
|
if folder_label:
|
||||||
folder = self._extract_folder(folder_label)
|
folder = self._extract_folder(folder_label, url)
|
||||||
if folder not in items:
|
if folder not in items:
|
||||||
items.add(folder)
|
items.add(folder)
|
||||||
folder.explain()
|
folder.explain()
|
||||||
else:
|
else:
|
||||||
file = self._extract_file(element)
|
file = self._extract_file(element, url)
|
||||||
items.add(file)
|
items.add(file)
|
||||||
log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
|
log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
|
||||||
log.explain("Attributing it to root folder")
|
log.explain("Attributing it to root folder")
|
||||||
|
|
||||||
return items
|
return items
|
||||||
|
|
||||||
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
|
def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
|
||||||
files: List[KitIpdFile] = []
|
files: List[KitIpdFile] = []
|
||||||
name = folder_tag.getText().strip()
|
name = folder_tag.getText().strip()
|
||||||
|
|
||||||
container: Tag = folder_tag.findNextSibling(name="table")
|
container: Tag = folder_tag.findNextSibling(name="table")
|
||||||
for link in self._find_file_links(container):
|
for link in self._find_file_links(container):
|
||||||
files.append(self._extract_file(link))
|
files.append(self._extract_file(link, url))
|
||||||
|
|
||||||
return KitIpdFolder(name, files)
|
return KitIpdFolder(name, files)
|
||||||
|
|
||||||
@ -138,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
return None
|
return None
|
||||||
return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
|
return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
|
||||||
|
|
||||||
def _extract_file(self, link: Tag) -> KitIpdFile:
|
def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
|
||||||
url = self._abs_url_from_link(link)
|
url = self._abs_url_from_link(url, link)
|
||||||
name = os.path.basename(url)
|
name = os.path.basename(url)
|
||||||
return KitIpdFile(name, url)
|
return KitIpdFile(name, url)
|
||||||
|
|
||||||
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
||||||
return tag.findAll(name="a", attrs={"href": self._file_regex})
|
return tag.findAll(name="a", attrs={"href": self._file_regex})
|
||||||
|
|
||||||
def _abs_url_from_link(self, link_tag: Tag) -> str:
|
def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
|
||||||
return urljoin(self._url, link_tag.get("href"))
|
return urljoin(url, link_tag.get("href"))
|
||||||
|
|
||||||
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
|
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
|
||||||
async with self.session.get(url, allow_redirects=False) as resp:
|
async with self.session.get(url, allow_redirects=False) as resp:
|
||||||
@ -162,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
|
|
||||||
sink.done()
|
sink.done()
|
||||||
|
|
||||||
async def get_page(self) -> BeautifulSoup:
|
async def get_page(self) -> Tuple[BeautifulSoup, str]:
|
||||||
async with self.session.get(self._url) as request:
|
async with self.session.get(self._url) as request:
|
||||||
# The web page for Algorithmen für Routenplanung contains some
|
# The web page for Algorithmen für Routenplanung contains some
|
||||||
# weird comments that beautifulsoup doesn't parse correctly. This
|
# weird comments that beautifulsoup doesn't parse correctly. This
|
||||||
@ -170,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
# cause issues on other pages.
|
# cause issues on other pages.
|
||||||
content = (await request.read()).decode("utf-8")
|
content = (await request.read()).decode("utf-8")
|
||||||
content = re.sub(r"<!--.*?-->", "", content)
|
content = re.sub(r"<!--.*?-->", "", content)
|
||||||
return soupify(content.encode("utf-8"))
|
return soupify(content.encode("utf-8")), str(request.url)
|
||||||
|
Loading…
Reference in New Issue
Block a user