Mirror of https://github.com/Garmelon/PFERD.git (synced 2023-12-21 10:23:01 +01:00)

Compare commits: v3.4.3 ... update-che (1 commit, 2d145e7c94)

CHANGELOG.md (23 changed lines)
@@ -22,30 +22,9 @@ ambiguous situations.
 ## Unreleased

-## 3.4.3 - 2022-11-29
-
-### Added
-- Missing documentation for `forums` option
-
-### Changed
-- Clear up error message shown when multiple paths are found to an element
-
 ### Fixed
-- IPD crawler unnecessarily appending trailing slashes
-- Crawling opencast when ILIAS is set to English
-
-## 3.4.2 - 2022-10-26
-
-### Added
-- Recognize and crawl content pages in cards
-- Recognize and ignore surveys
-
-### Fixed
-- Forum crawling crashing when a thread has no messages at all
+- Forum crawling crashing when parsing empty (= 0 messages) threads
 - Forum crawling crashing when a forum has no threads at all
-- Ilias login failing in some cases
-- Crawling of paginated future meetings
-- IPD crawler handling of URLs without trailing slash

 ## 3.4.1 - 2022-08-17
@@ -181,7 +181,6 @@ script once per day should be fine.
   redirect to the actual URL. Set to a negative value to disable the automatic
   redirect. (Default: `-1`)
 - `videos`: Whether to download videos. (Default: `no`)
-- `forums`: Whether to download forum threads. (Default: `no`)
 - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
   `20.0`)
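The options above are plain key-value settings in a crawler section of PFERD's INI-style config file. A minimal sketch of such a section, parsed with Python's standard configparser; the section name, `type` and `target` values are made-up placeholders, and only `videos`, `forums` and `http_timeout` are taken from the documentation in this hunk.

import configparser

# Hypothetical crawler section; only videos/forums/http_timeout come from the docs above.
EXAMPLE = """\
[crawl:example-course]
type = kit-ilias-web
target = 1234567
videos = no
forums = no
http_timeout = 20.0
"""

parser = configparser.ConfigParser()
parser.read_string(EXAMPLE)
section = parser["crawl:example-course"]
print(section.getboolean("videos"))      # False
print(section.getfloat("http_timeout"))  # 20.0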
@@ -290,7 +289,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.
 Example: `foo/bar --> baz`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
 - Converts `foo/bar` into `baz`
-- Converts `foo/bar/wargl` into `baz/wargl`
+- Converts `foo/bar/wargl` into `bar/wargl`

 Example: `foo/bar --> !`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
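As a toy illustration of the rename rule documented above (this is not PFERD's actual transform engine, just a sketch of the described behaviour): if the beginning of a path matches `SOURCE`, that prefix is replaced by `TARGET` and the rest of the path is kept, which reproduces the `baz/wargl` result on the v3.4.3 side.

from pathlib import PurePath
from typing import Optional

def apply_rename_rule(path: PurePath, source: PurePath, target: PurePath) -> Optional[PurePath]:
    # The rule only applies if the path starts with all components of SOURCE.
    if path.parts[:len(source.parts)] != source.parts:
        return None
    # Replace the matched prefix with TARGET and keep the remainder.
    return PurePath(target, *path.parts[len(source.parts):])

print(apply_rename_rule(PurePath("foo/bar/wargl"), PurePath("foo/bar"), PurePath("baz")))  # baz/wargl
print(apply_rename_rule(PurePath("a/foo/bar"), PurePath("foo/bar"), PurePath("baz")))      # None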
@@ -5,6 +5,8 @@ import os
 import sys
 from pathlib import Path

+from PFERD.update import check_for_updates
+
 from .auth import AuthLoadError
 from .cli import PARSER, ParserLoadError, load_default_section
 from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
@@ -134,6 +136,11 @@ def main() -> None:
         loop.run_until_complete(asyncio.sleep(1))
         loop.close()
     else:
+        log.explain_topic("Checking for updates")
+        if not args.skip_update_check:
+            asyncio.run(check_for_updates())
+        else:
+            log.explain("Update check skipped due to configuration option")
         asyncio.run(pferd.run(args.debug_transforms))
 except (ConfigOptionError, AuthLoadError) as e:
     log.unlock()
@@ -151,6 +151,11 @@ PARSER.add_argument(
     action="version",
     version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
 )
+PARSER.add_argument(
+    "--skip-update-check",
+    action="store_true",
+    help="disable automatic update checks at startup"
+)
 PARSER.add_argument(
     "--config", "-c",
     type=Path,
@@ -24,7 +24,6 @@ class IliasElementType(Enum):
     LINK = "link"
     BOOKING = "booking"
     MEETING = "meeting"
-    SURVEY = "survey"
     VIDEO = "video"
     VIDEO_PLAYER = "video_player"
     VIDEO_FOLDER = "video_folder"
@@ -134,7 +133,7 @@ class IliasPage:

         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]

-        form_data: Dict[str, Union[str, List[str]]] = {
+        form_data: Dict[str, Union[str, List[str]]] = {
             "thread_ids[]": thread_ids,
             "selected_cmd2": "html",
             "select_cmd2": "Ausführen",
@@ -366,7 +365,7 @@ class IliasPage:
         """
         # Video start links are marked with an "Abspielen" link
         video_links: List[Tag] = self._soup.findAll(
-            name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
+            name="a", text=re.compile(r"\s*Abspielen\s*")
         )

         results: List[IliasPageElement] = []
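The regex difference above appears to correspond to the "Crawling opencast when ILIAS is set to English" changelog entry: the v3.4.3 pattern accepts both the German and the English play label. A quick, self-contained check of the two patterns:

import re

v343_pattern = re.compile(r"\s*(Abspielen|Play)\s*")   # left side of the hunk
branch_pattern = re.compile(r"\s*Abspielen\s*")        # right side of the hunk

print(bool(v343_pattern.fullmatch(" Play ")))          # True
print(bool(branch_pattern.fullmatch(" Play ")))        # False
print(bool(branch_pattern.fullmatch(" Abspielen ")))   # True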
@@ -731,10 +730,6 @@ class IliasPage:
             return IliasElementType.TEST
         if "fold" in icon["class"]:
             return IliasElementType.FOLDER
-        if "copa" in icon["class"]:
-            return IliasElementType.FOLDER
-        if "svy" in icon["class"]:
-            return IliasElementType.SURVEY

         _unexpected_html_warning()
         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")
@@ -194,7 +194,7 @@ instance's greatest bottleneck.
         self._links = section.links()
         self._videos = section.videos()
         self._forums = section.forums()
-        self._visited_urls: Dict[str, PurePath] = dict()
+        self._visited_urls: Set[str] = set()

     async def _run(self) -> None:
         if isinstance(self._target, int):
@@ -348,11 +348,9 @@ instance's greatest bottleneck.
     ) -> Optional[Coroutine[Any, Any, None]]:
         if element.url in self._visited_urls:
             raise CrawlWarning(
-                f"Found second path to element {element.name!r} at {element.url!r}. "
-                + f"First path: {fmt_path(self._visited_urls[element.url])}. "
-                + f"Second path: {fmt_path(parent_path)}."
+                f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
             )
-        self._visited_urls[element.url] = parent_path
+        self._visited_urls.add(element.url)

         element_path = PurePath(parent_path, element.name)
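A condensed sketch of the bookkeeping change in the two hunks above (simplified, not the full crawler class): by remembering the path under which each URL was first seen, the v3.4.3 side can emit the more detailed duplicate-path warning. The URL and paths below are made up, and RuntimeError stands in for PFERD's CrawlWarning.

from pathlib import PurePath
from typing import Dict

visited_urls: Dict[str, PurePath] = {}

def mark_visited(url: str, parent_path: PurePath) -> None:
    if url in visited_urls:
        # With a plain set only "a second path exists" could be reported;
        # the dict also knows which path came first.
        raise RuntimeError(
            f"Found second path to element at {url!r}. "
            f"First path: {visited_urls[url]}. Second path: {parent_path}."
        )
    visited_urls[url] = parent_path

mark_visited("https://ilias.example/goto.php?target=file_123", PurePath("Course/Week 1"))
mark_visited("https://ilias.example/goto.php?target=file_123", PurePath("Course/Archive"))  # raises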
@@ -379,20 +377,9 @@ instance's greatest bottleneck.
                 return None
             return await self._handle_forum(element, element_path)
         elif element.type == IliasElementType.TEST:
-            log.status(
-                "[bold bright_black]",
-                "Ignored",
-                fmt_path(element_path),
-                "[bright_black](tests contain no relevant data)"
-            )
-            return None
-        elif element.type == IliasElementType.SURVEY:
-            log.status(
-                "[bold bright_black]",
-                "Ignored",
-                fmt_path(element_path),
-                "[bright_black](surveys contain no relevant data)"
-            )
+            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
+            log.explain("Tests contain no relevant files")
+            log.explain("Answer: No")
             return None
         elif element.type == IliasElementType.LINK:
             return await self._handle_link(element, element_path)
@@ -2,7 +2,7 @@ import os
 import re
 from dataclasses import dataclass
 from pathlib import PurePath
-from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
+from typing import Awaitable, List, Optional, Pattern, Set, Union
 from urllib.parse import urljoin

 from bs4 import BeautifulSoup, Tag
@@ -99,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):
                 await self._stream_from_url(file.url, sink, bar)

     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
-        page, url = await self.get_page()
+        page = await self.get_page()
         elements: List[Tag] = self._find_file_links(page)
         items: Set[Union[KitIpdFile, KitIpdFolder]] = set()

         for element in elements:
             folder_label = self._find_folder_label(element)
             if folder_label:
-                folder = self._extract_folder(folder_label, url)
+                folder = self._extract_folder(folder_label)
                 if folder not in items:
                     items.add(folder)
                     folder.explain()
             else:
-                file = self._extract_file(element, url)
+                file = self._extract_file(element)
                 items.add(file)
                 log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                 log.explain("Attributing it to root folder")

         return items

-    def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
+    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
         files: List[KitIpdFile] = []
         name = folder_tag.getText().strip()

         container: Tag = folder_tag.findNextSibling(name="table")
         for link in self._find_file_links(container):
-            files.append(self._extract_file(link, url))
+            files.append(self._extract_file(link))

         return KitIpdFolder(name, files)
@@ -135,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
             return None
         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))

-    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
-        url = self._abs_url_from_link(url, link)
+    def _extract_file(self, link: Tag) -> KitIpdFile:
+        url = self._abs_url_from_link(link)
         name = os.path.basename(url)
         return KitIpdFile(name, url)

     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
         return tag.findAll(name="a", attrs={"href": self._file_regex})

-    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
-        return urljoin(url, link_tag.get("href"))
+    def _abs_url_from_link(self, link_tag: Tag) -> str:
+        return urljoin(self._url, link_tag.get("href"))

     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
         async with self.session.get(url, allow_redirects=False) as resp:
@@ -159,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):

         sink.done()

-    async def get_page(self) -> Tuple[BeautifulSoup, str]:
+    async def get_page(self) -> BeautifulSoup:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
@@ -167,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):
             # cause issues on other pages.
             content = (await request.read()).decode("utf-8")
             content = re.sub(r"<!--.*?-->", "", content)
-            return soupify(content.encode("utf-8")), str(request.url)
+            return soupify(content.encode("utf-8"))
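These KIT IPD hunks line up with the changelog entries about trailing slashes: the v3.4.3 side resolves relative links against the final response URL returned by `get_page()`, while the branch side resolves them against the configured `self._url`. An illustration of why that matters, using Python's standard `urljoin` and purely hypothetical URLs:

from urllib.parse import urljoin

# Without a trailing slash, the last path segment is treated as a file and replaced.
print(urljoin("https://example.org/lehre/ws2223", "slides.pdf"))
# -> https://example.org/lehre/slides.pdf
print(urljoin("https://example.org/lehre/ws2223/", "slides.pdf"))
# -> https://example.org/lehre/ws2223/slides.pdf

Resolving against the URL the server actually served (after redirects) presumably avoids both hard-coding a trailing slash and resolving links against the wrong directory.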
PFERD/update.py (new file, 53 lines)

@@ -0,0 +1,53 @@
+from dataclasses import dataclass
+import ssl
+from typing import Optional
+import aiohttp
+import certifi
+
+from .version import NAME, VERSION
+from .logging import log
+
+
+@dataclass
+class PferdUpdate:
+    release_url: str
+    version: str
+
+
+def _build_session() -> aiohttp.ClientSession:
+    return aiohttp.ClientSession(
+        headers={"User-Agent": f"{NAME}/{VERSION}"},
+        connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())),
+        timeout=aiohttp.ClientTimeout(
+            total=15 * 60,
+            connect=10,
+            sock_connect=10,
+            sock_read=10,
+        )
+    )
+
+
+async def check_for_updates() -> None:
+    if new_version := await get_newer_version():
+        log.warn(
+            f"{NAME} version out of date. "
+            + f"You are running version {VERSION!r} but {new_version.version!r} was found on GitHub."
+        )
+        log.warn_contd(f"You can download it on GitHub: {new_version.release_url}")
+    else:
+        log.explain("No update found")
+
+
+async def get_newer_version() -> Optional[PferdUpdate]:
+    async with _build_session() as session:
+        async with session.get(
+            "https://api.github.com/repos/Garmelon/Pferd/releases/latest",
+            headers={"Accept": "application/vnd.github+json"}
+        ) as response:
+            release_information = await response.json()
+            tag_name: str = release_information["tag_name"]
+            tag_name = tag_name.removeprefix("v")
+            if VERSION == tag_name:
+                return None
+
+            return PferdUpdate(release_url=release_information["html_url"], version=tag_name)
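A hedged usage sketch for the new module, separate from the `__main__.py` wiring shown earlier: it assumes this branch of PFERD is installed so that `PFERD.update` is importable, and it only relies on `get_newer_version` and the `PferdUpdate` fields visible in the diff above.

import asyncio

from PFERD.update import get_newer_version  # assumes this branch is installed

async def main() -> None:
    update = await get_newer_version()
    if update is None:
        print("PFERD is up to date")
    else:
        print(f"Version {update.version} is available: {update.release_url}")

asyncio.run(main())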
@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.4.3"
+VERSION = "3.4.1"
@@ -30,10 +30,7 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.

 Unofficial packages are available for:
 - [AUR](https://aur.archlinux.org/packages/pferd)
-- [brew](https://formulae.brew.sh/formula/pferd)
-- [conda-forge](https://github.com/conda-forge/pferd-feedstock)
 - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
-- [PyPi](https://pypi.org/project/pferd)

 See also PFERD's [repology page](https://repology.org/project/pferd/versions).