Mirror of https://github.com/Garmelon/PFERD.git (synced 2023-12-21 10:23:01 +01:00)

Compare commits: update-che...v3.4.3 (12 commits)
Commits (SHA1):

6d44aac278
55a2de6b88
c0d6d8b229
635caa765d
e69b55b349
07200bbde5
c020cccc64
259cfc20cc
37b51a66d8
f47d2f11d8
1b6be6bd79
e1430e6298
CHANGELOG.md
@@ -22,9 +22,30 @@ ambiguous situations.

 ## Unreleased

+## 3.4.3 - 2022-11-29
+
+### Added
+- Missing documentation for `forums` option
+
+### Changed
+- Clear up error message shown when multiple paths are found to an element
+
+### Fixed
+- Forum crawling crashing when parsing empty (= 0 messages) threads
+- IPD crawler unnecessarily appending trailing slashes
+- Crawling opencast when ILIAS is set to English
+
+## 3.4.2 - 2022-10-26
+
+### Added
+- Recognize and crawl content pages in cards
+- Recognize and ignore surveys
+
+### Fixed
+- Forum crawling crashing when a thread has no messages at all
+- Forum crawling crashing when a forum has no threads at all
+- Ilias login failing in some cases
+- Crawling of paginated future meetings
+- IPD crawler handling of URLs without trailing slash
+
 ## 3.4.1 - 2022-08-17

@@ -181,6 +181,7 @@ script once per day should be fine.

   redirect to the actual URL. Set to a negative value to disable the automatic
   redirect. (Default: `-1`)
 - `videos`: Whether to download videos. (Default: `no`)
+- `forums`: Whether to download forum threads. (Default: `no`)
 - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
   `20.0`)

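For context, these options live in a crawler section of PFERD's config file. A minimal sketch showing where the newly documented `forums` option fits (the section name and `target` value are illustrative, not taken from this diff):

```ini
[crawl:my-ilias-course]
type = kit-ilias-web
target = 1234567
videos = no
forums = yes
http_timeout = 20.0
```
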
@@ -289,7 +290,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.

 Example: `foo/bar --> baz`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
 - Converts `foo/bar` into `baz`
-- Converts `foo/bar/wargl` into `bar/wargl`
+- Converts `foo/bar/wargl` into `baz/wargl`

 Example: `foo/bar --> !`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`

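The corrected example pins down the rename semantics: `SOURCE --> TARGET` matches `SOURCE` exactly or as a leading path prefix, and re-roots everything below it under `TARGET`. A minimal sketch of the prefix-rename rule (simplified; not PFERD's actual transform engine):

```python
from pathlib import PurePath
from typing import Optional


def apply_rename(path: PurePath, source: PurePath, target: PurePath) -> Optional[PurePath]:
    """Apply a `SOURCE --> TARGET` rule; return None if the rule doesn't match."""
    if path == source:
        return target
    if source in path.parents:
        # Re-root the remainder below `source` under `target`,
        # e.g. foo/bar/wargl -> baz/wargl for the rule foo/bar --> baz.
        return target / path.relative_to(source)
    return None  # e.g. `foo`, `a/foo/bar` or `foo/baz` don't match `foo/bar`


assert apply_rename(PurePath("foo/bar"), PurePath("foo/bar"), PurePath("baz")) == PurePath("baz")
assert apply_rename(PurePath("foo/bar/wargl"), PurePath("foo/bar"), PurePath("baz")) == PurePath("baz/wargl")
assert apply_rename(PurePath("foo/baz"), PurePath("foo/bar"), PurePath("baz")) is None
```
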
@@ -5,8 +5,6 @@ import os

 import sys
 from pathlib import Path

-from PFERD.update import check_for_updates
-
 from .auth import AuthLoadError
 from .cli import PARSER, ParserLoadError, load_default_section
 from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError

@@ -136,11 +134,6 @@ def main() -> None:

             loop.run_until_complete(asyncio.sleep(1))
             loop.close()
         else:
-            log.explain_topic("Checking for updates")
-            if not args.skip_update_check:
-                asyncio.run(check_for_updates())
-            else:
-                log.explain("Update check skipped due to configuration option")
             asyncio.run(pferd.run(args.debug_transforms))
     except (ConfigOptionError, AuthLoadError) as e:
         log.unlock()

@@ -151,11 +151,6 @@ PARSER.add_argument(

     action="version",
     version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
 )
-PARSER.add_argument(
-    "--skip-update-check",
-    action="store_true",
-    help="disable automatic update checks at startup"
-)
 PARSER.add_argument(
     "--config", "-c",
     type=Path,

@@ -24,6 +24,7 @@ class IliasElementType(Enum):

     LINK = "link"
     BOOKING = "booking"
     MEETING = "meeting"
+    SURVEY = "survey"
     VIDEO = "video"
     VIDEO_PLAYER = "video_player"
     VIDEO_FOLDER = "video_folder"

@@ -133,7 +134,7 @@ class IliasPage:

         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]

-        form_data: Dict[str, Union[str, List[str]]] = {
+        form_data: Dict[str, Union[str, List[str]]] = {
             "thread_ids[]": thread_ids,
             "selected_cmd2": "html",
             "select_cmd2": "Ausführen",

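The `Dict[str, Union[str, List[str]]]` annotation exists because `thread_ids[]` is a repeated form field: a single key carrying one value per selected thread. A sketch of how such a mapping can be serialized for a POST request (illustrative only, not PFERD's actual export code; `aiohttp` is the HTTP library PFERD already uses):

```python
from typing import Dict, List, Union

import aiohttp


async def post_form(url: str, form_data: Dict[str, Union[str, List[str]]]) -> bytes:
    """POST a form in which some fields, like "thread_ids[]", repeat."""
    data = aiohttp.FormData()
    for key, value in form_data.items():
        # A list value becomes one field per element, producing
        # thread_ids[]=1&thread_ids[]=2&... on the wire.
        for item in value if isinstance(value, list) else [value]:
            data.add_field(key, item)
    async with aiohttp.ClientSession() as session:
        async with session.post(url, data=data) as response:
            return await response.read()
```
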
@@ -365,7 +366,7 @@

         """
         # Video start links are marked with an "Abspielen" link
         video_links: List[Tag] = self._soup.findAll(
-            name="a", text=re.compile(r"\s*Abspielen\s*")
+            name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
         )

         results: List[IliasPageElement] = []

@@ -730,6 +731,10 @@

             return IliasElementType.TEST
         if "fold" in icon["class"]:
             return IliasElementType.FOLDER
+        if "copa" in icon["class"]:
+            return IliasElementType.FOLDER
+        if "svy" in icon["class"]:
+            return IliasElementType.SURVEY

         _unexpected_html_warning()
         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")

@@ -194,7 +194,7 @@ instance's greatest bottleneck.

         self._links = section.links()
         self._videos = section.videos()
         self._forums = section.forums()
-        self._visited_urls: Set[str] = set()
+        self._visited_urls: Dict[str, PurePath] = dict()

     async def _run(self) -> None:
         if isinstance(self._target, int):

@@ -348,9 +348,11 @@ instance's greatest bottleneck.

     ) -> Optional[Coroutine[Any, Any, None]]:
         if element.url in self._visited_urls:
             raise CrawlWarning(
-                f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
+                f"Found second path to element {element.name!r} at {element.url!r}. "
+                + f"First path: {fmt_path(self._visited_urls[element.url])}. "
+                + f"Second path: {fmt_path(parent_path)}."
             )
-        self._visited_urls.add(element.url)
+        self._visited_urls[element.url] = parent_path

         element_path = PurePath(parent_path, element.name)

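Together, these two hunks implement the changelog entry "Clear up error message shown when multiple paths are found to an element": replacing the `Set[str]` with a `Dict[str, PurePath]` lets the crawler remember where it first saw each URL, so the warning can report both conflicting paths. A standalone sketch of the pattern (simplified names, not the crawler's real API):

```python
from pathlib import PurePath
from typing import Dict

visited_urls: Dict[str, PurePath] = {}


def visit(url: str, parent_path: PurePath) -> None:
    if url in visited_urls:
        # Because the dict stores the first path alongside the URL,
        # the error can name both paths instead of just the second one.
        raise RuntimeError(
            f"Found second path to {url!r}. "
            f"First path: {visited_urls[url]}. Second path: {parent_path}."
        )
    visited_urls[url] = parent_path


visit("https://example.com/file/1", PurePath("course/week1"))
try:
    visit("https://example.com/file/1", PurePath("course/week2"))
except RuntimeError as error:
    print(error)
```
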
@@ -377,9 +379,20 @@ instance's greatest bottleneck.

                 return None
             return await self._handle_forum(element, element_path)
         elif element.type == IliasElementType.TEST:
-            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
-            log.explain("Tests contain no relevant files")
-            log.explain("Answer: No")
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](tests contain no relevant data)"
+            )
             return None
+        elif element.type == IliasElementType.SURVEY:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](surveys contain no relevant data)"
+            )
+            return None
         elif element.type == IliasElementType.LINK:
             return await self._handle_link(element, element_path)

@@ -2,7 +2,7 @@ import os

 import re
 from dataclasses import dataclass
 from pathlib import PurePath
-from typing import Awaitable, List, Optional, Pattern, Set, Union
+from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
 from urllib.parse import urljoin

 from bs4 import BeautifulSoup, Tag

@@ -99,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):

             await self._stream_from_url(file.url, sink, bar)

     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
-        page = await self.get_page()
+        page, url = await self.get_page()
         elements: List[Tag] = self._find_file_links(page)
         items: Set[Union[KitIpdFile, KitIpdFolder]] = set()

         for element in elements:
             folder_label = self._find_folder_label(element)
             if folder_label:
-                folder = self._extract_folder(folder_label)
+                folder = self._extract_folder(folder_label, url)
                 if folder not in items:
                     items.add(folder)
                     folder.explain()
             else:
-                file = self._extract_file(element)
+                file = self._extract_file(element, url)
                 items.add(file)
                 log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                 log.explain("Attributing it to root folder")

         return items

-    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
+    def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
         files: List[KitIpdFile] = []
         name = folder_tag.getText().strip()

         container: Tag = folder_tag.findNextSibling(name="table")
         for link in self._find_file_links(container):
-            files.append(self._extract_file(link))
+            files.append(self._extract_file(link, url))

         return KitIpdFolder(name, files)

@@ -135,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):

             return None
         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))

-    def _extract_file(self, link: Tag) -> KitIpdFile:
-        url = self._abs_url_from_link(link)
+    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
+        url = self._abs_url_from_link(url, link)
         name = os.path.basename(url)
         return KitIpdFile(name, url)

     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
         return tag.findAll(name="a", attrs={"href": self._file_regex})

-    def _abs_url_from_link(self, link_tag: Tag) -> str:
-        return urljoin(self._url, link_tag.get("href"))
+    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
+        return urljoin(url, link_tag.get("href"))

     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
         async with self.session.get(url, allow_redirects=False) as resp:

@@ -159,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):

         sink.done()

-    async def get_page(self) -> BeautifulSoup:
+    async def get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This

@@ -167,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):

             # cause issues on other pages.
             content = (await request.read()).decode("utf-8")
             content = re.sub(r"<!--.*?-->", "", content)
-            return soupify(content.encode("utf-8"))
+            return soupify(content.encode("utf-8")), str(request.url)

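The kit_ipd hunks above all serve one fix: `get_page` now also returns the final response URL (`str(request.url)`, i.e. the URL after any redirects), and that URL is threaded through to `_abs_url_from_link`, so relative hrefs resolve against the page the server actually delivered rather than the configured `self._url`. This matters because `urljoin` treats a base without a trailing slash as a file and drops its last segment, which is the "IPD crawler handling of URLs without trailing slash" fix from the changelog. A quick illustration (URLs are made up):

```python
from urllib.parse import urljoin

# Base without trailing slash: "course" is treated as a file and replaced.
assert urljoin("https://example.com/teaching/course", "slides.pdf") \
    == "https://example.com/teaching/slides.pdf"

# Base with trailing slash (e.g. the redirect target): the join is correct.
assert urljoin("https://example.com/teaching/course/", "slides.pdf") \
    == "https://example.com/teaching/course/slides.pdf"
```
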
@@ -1,53 +0,0 @@

-from dataclasses import dataclass
-import ssl
-from typing import Optional
-import aiohttp
-import certifi
-
-from .version import NAME, VERSION
-from .logging import log
-
-
-@dataclass
-class PferdUpdate:
-    release_url: str
-    version: str
-
-
-def _build_session() -> aiohttp.ClientSession:
-    return aiohttp.ClientSession(
-        headers={"User-Agent": f"{NAME}/{VERSION}"},
-        connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())),
-        timeout=aiohttp.ClientTimeout(
-            total=15 * 60,
-            connect=10,
-            sock_connect=10,
-            sock_read=10,
-        )
-    )
-
-
-async def check_for_updates() -> None:
-    if new_version := await get_newer_version():
-        log.warn(
-            f"{NAME} version out of date. "
-            + f"You are running version {VERSION!r} but {new_version.version!r} was found on GitHub."
-        )
-        log.warn_contd(f"You can download it on GitHub: {new_version.release_url}")
-    else:
-        log.explain("No update found")
-
-
-async def get_newer_version() -> Optional[PferdUpdate]:
-    async with _build_session() as session:
-        async with session.get(
-            "https://api.github.com/repos/Garmelon/Pferd/releases/latest",
-            headers={"Accept": "application/vnd.github+json"}
-        ) as response:
-            release_information = await response.json()
-            tag_name: str = release_information["tag_name"]
-            tag_name = tag_name.removeprefix("v")
-            if VERSION == tag_name:
-                return None
-
-            return PferdUpdate(release_url=release_information["html_url"], version=tag_name)

@@ -1,2 +1,2 @@

 NAME = "PFERD"
-VERSION = "3.4.1"
+VERSION = "3.4.3"

@@ -30,7 +30,10 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.

 Unofficial packages are available for:
 - [AUR](https://aur.archlinux.org/packages/pferd)
 - [brew](https://formulae.brew.sh/formula/pferd)
 - [conda-forge](https://github.com/conda-forge/pferd-feedstock)
 - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
 - [PyPi](https://pypi.org/project/pferd)

 See also PFERD's [repology page](https://repology.org/project/pferd/versions).