Use url after redirect for relative links

Joscha 2022-11-21 17:53:30 +01:00
parent 635caa765d
commit c0d6d8b229
2 changed files with 15 additions and 15 deletions
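
Background for this change: relative hrefs used to be resolved against the configured target URL, but the IPD pages may redirect (for example to the same path with a trailing slash), and `urljoin` treats a base without a trailing slash as a file rather than a directory, dropping its last path segment. A minimal sketch of the difference, using hypothetical URLs:

```python
from urllib.parse import urljoin

# Hypothetical example: the configured target redirects to the same page
# with a trailing slash. Relative hrefs resolve differently against each base.
configured = "https://ipd.example.kit.edu/lehre/algo"    # URL before the redirect
redirected = "https://ipd.example.kit.edu/lehre/algo/"   # URL after the redirect

print(urljoin(configured, "slides/01.pdf"))
# https://ipd.example.kit.edu/lehre/slides/01.pdf   <- last path segment dropped
print(urljoin(redirected, "slides/01.pdf"))
# https://ipd.example.kit.edu/lehre/algo/slides/01.pdf
```

This is why `get_page` now also returns the URL the request actually ended up at, and why that URL is threaded through to every place that resolves a relative link.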

View File

@@ -28,6 +28,9 @@ ambiguous situations.
 ### Changed
 - Clear up error message shown when multiple paths are found to an element
 
+### Fixed
+- IPD crawler unnecessarily appending trailing slashes
+
 ## 3.4.2 - 2022-10-26
 ### Added

View File

@@ -2,7 +2,7 @@ import os
 import re
 from dataclasses import dataclass
 from pathlib import PurePath
-from typing import Awaitable, List, Optional, Pattern, Set, Union
+from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Tag
@@ -24,9 +24,6 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
         if not target.startswith("https://"):
             self.invalid_value("target", target, "Should be a URL")
 
-        if not target.endswith("/"):
-            target = target + "/"
-
         return target
 
     def link_regex(self) -> Pattern[str]:
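
The removed lines are the workaround referenced by the changelog entry above ("IPD crawler unnecessarily appending trailing slashes"): the configured target was forced to end in a slash so that `urljoin` would treat it as a directory. A hedged sketch of what the old normalization did, with a hypothetical target value:

```python
# What the removed normalization did: make the configured target look like a
# directory so urljoin would not drop its last path segment.
target = "https://ipd.example.kit.edu/lehre/algo"  # hypothetical configured value
if not target.endswith("/"):
    target = target + "/"

# After this commit the target is left exactly as configured; the base used
# for urljoin is instead the final URL reported by the server after any
# redirects (see get_page below), i.e. the form the page is actually served under.
```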
@@ -102,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):
             await self._stream_from_url(file.url, sink, bar)
 
     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
-        page = await self.get_page()
+        page, url = await self.get_page()
         elements: List[Tag] = self._find_file_links(page)
         items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
 
         for element in elements:
             folder_label = self._find_folder_label(element)
             if folder_label:
-                folder = self._extract_folder(folder_label)
+                folder = self._extract_folder(folder_label, url)
                 if folder not in items:
                     items.add(folder)
                     folder.explain()
             else:
-                file = self._extract_file(element)
+                file = self._extract_file(element, url)
                 items.add(file)
                 log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                 log.explain("Attributing it to root folder")
 
         return items
 
-    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
+    def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
         files: List[KitIpdFile] = []
         name = folder_tag.getText().strip()
 
         container: Tag = folder_tag.findNextSibling(name="table")
         for link in self._find_file_links(container):
-            files.append(self._extract_file(link))
+            files.append(self._extract_file(link, url))
 
         return KitIpdFolder(name, files)
@@ -138,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
             return None
         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
 
-    def _extract_file(self, link: Tag) -> KitIpdFile:
-        url = self._abs_url_from_link(link)
+    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
+        url = self._abs_url_from_link(url, link)
         name = os.path.basename(url)
         return KitIpdFile(name, url)
 
     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
        return tag.findAll(name="a", attrs={"href": self._file_regex})
 
-    def _abs_url_from_link(self, link_tag: Tag) -> str:
-        return urljoin(self._url, link_tag.get("href"))
+    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
+        return urljoin(url, link_tag.get("href"))
 
     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
         async with self.session.get(url, allow_redirects=False) as resp:
@@ -162,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):
 
         sink.done()
 
-    async def get_page(self) -> BeautifulSoup:
+    async def get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
@@ -170,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):
             # cause issues on other pages.
             content = (await request.read()).decode("utf-8")
             content = re.sub(r"<!--.*?-->", "", content)
-            return soupify(content.encode("utf-8"))
+            return soupify(content.encode("utf-8")), str(request.url)
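
The new return value relies on aiohttp following redirects by default and exposing the final URL on the response object; `str(request.url)` is therefore the post-redirect URL that the callers above pass down to `_abs_url_from_link`. A minimal sketch of that behavior, with a hypothetical URL:

```python
import asyncio

import aiohttp


async def main() -> None:
    async with aiohttp.ClientSession() as session:
        # aiohttp follows redirects by default; resp.url is the URL of the
        # final request, not the one originally requested.
        async with session.get("https://ipd.example.kit.edu/lehre/algo") as resp:  # hypothetical URL
            print(resp.url)  # e.g. https://ipd.example.kit.edu/lehre/algo/ after a 301


asyncio.run(main())
```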