Check for new versions at startup

2026-01-11 23:02:30 +01:00 · 2022-10-24 17:31:34 +02:00
10 changed files with 88 additions and 66 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,30 +22,9 @@ ambiguous situations.

 ## Unreleased

-## 3.4.3 - 2022-11-29
-
-### Added
- Missing documentation for `forums` option
-
-### Changed
- Clear up error message shown when multiple paths are found to an element
-
 ### Fixed
- IPD crawler unnecessarily appending trailing slashes
- Crawling opencast when ILIAS is set to English
-
-## 3.4.2 - 2022-10-26
-
-### Added
- Recognize and crawl content pages in cards
- Recognize and ignore surveys
-
-### Fixed
- Forum crawling crashing when a thread has no messages at all
+- Forum crawling crashing when parsing empty (= 0 messages) threads
 - Forum crawling crashing when a forum has no threads at all
- Ilias login failing in some cases
- Crawling of paginated future meetings
- IPD crawler handling of URLs without trailing slash

 ## 3.4.1 - 2022-08-17

--- a/CONFIG.md
+++ b/CONFIG.md
@@ -181,7 +181,6 @@ script once per day should be fine.
  redirect to the actual URL. Set to a negative value to disable the automatic
  redirect. (Default: `-1`)
 - `videos`: Whether to download videos. (Default: `no`)
- `forums`: Whether to download forum threads. (Default: `no`)
 - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
  `20.0`)

@@ -290,7 +289,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.
 Example: `foo/bar --> baz`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
 - Converts `foo/bar` into `baz`
- Converts `foo/bar/wargl` into `baz/wargl`
+- Converts `foo/bar/wargl` into `baz/wargl`

 Example: `foo/bar --> !`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
--- a/PFERD/main.py
+++ b/PFERD/main.py
@@ -5,6 +5,8 @@ import os
 import sys
 from pathlib import Path

+from PFERD.update import check_for_updates
+
 from .auth import AuthLoadError
 from .cli import PARSER, ParserLoadError, load_default_section
 from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
@@ -134,6 +136,11 @@ def main() -> None:
            loop.run_until_complete(asyncio.sleep(1))
            loop.close()
        else:
+            log.explain_topic("Checking for updates")
+            if not args.skip_update_check:
+                asyncio.run(check_for_updates())
+            else:
+                log.explain("Update check skipped due to configuration option")
            asyncio.run(pferd.run(args.debug_transforms))
    except (ConfigOptionError, AuthLoadError) as e:
        log.unlock()
--- a/PFERD/cli/parser.py
+++ b/PFERD/cli/parser.py
@@ -151,6 +151,11 @@ PARSER.add_argument(
    action="version",
    version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
 )
+PARSER.add_argument(
+    "--skip-update-check",
+    action="store_true",
+    help="disable automatic update checks at startup"
+)
 PARSER.add_argument(
    "--config", "-c",
    type=Path,
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -24,7 +24,6 @@ class IliasElementType(Enum):
    LINK = "link"
    BOOKING = "booking"
    MEETING = "meeting"
-    SURVEY = "survey"
    VIDEO = "video"
    VIDEO_PLAYER = "video_player"
    VIDEO_FOLDER = "video_folder"
@@ -134,7 +133,7 @@ class IliasPage:

        thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]

-        form_data: Dict[str, Union[str, List[str]]] = {
+        form_data: Dict[str, Union[str, List[str]]] = {
            "thread_ids[]": thread_ids,
            "selected_cmd2": "html",
            "select_cmd2": "Ausführen",
@@ -366,7 +365,7 @@ class IliasPage:
        """
        # Video start links are marked with an "Abspielen" link
        video_links: List[Tag] = self._soup.findAll(
-            name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
+            name="a", text=re.compile(r"\s*Abspielen\s*")
        )

        results: List[IliasPageElement] = []
@@ -731,10 +730,6 @@ class IliasPage:
            return IliasElementType.TEST
        if "fold" in icon["class"]:
            return IliasElementType.FOLDER
-        if "copa" in icon["class"]:
-            return IliasElementType.FOLDER
-        if "svy" in icon["class"]:
-            return IliasElementType.SURVEY

        _unexpected_html_warning()
        log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -194,7 +194,7 @@ instance's greatest bottleneck.
        self._links = section.links()
        self._videos = section.videos()
        self._forums = section.forums()
-        self._visited_urls: Dict[str, PurePath] = dict()
+        self._visited_urls: Set[str] = set()

    async def _run(self) -> None:
        if isinstance(self._target, int):
@@ -348,11 +348,9 @@ instance's greatest bottleneck.
    ) -> Optional[Coroutine[Any, Any, None]]:
        if element.url in self._visited_urls:
            raise CrawlWarning(
-                f"Found second path to element {element.name!r} at {element.url!r}. "
-                + f"First path: {fmt_path(self._visited_urls[element.url])}. "
-                + f"Second path: {fmt_path(parent_path)}."
+                f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
            )
-        self._visited_urls[element.url] = parent_path
+        self._visited_urls.add(element.url)

        element_path = PurePath(parent_path, element.name)

@@ -379,20 +377,9 @@ instance's greatest bottleneck.
                return None
            return await self._handle_forum(element, element_path)
        elif element.type == IliasElementType.TEST:
-            log.status(
-                "[bold bright_black]",
-                "Ignored",
-                fmt_path(element_path),
-                "[bright_black](tests contain no relevant data)"
-            )
-            return None
-        elif element.type == IliasElementType.SURVEY:
-            log.status(
-                "[bold bright_black]",
-                "Ignored",
-                fmt_path(element_path),
-                "[bright_black](surveys contain no relevant data)"
-            )
+            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
+            log.explain("Tests contain no relevant files")
+            log.explain("Answer: No")
            return None
        elif element.type == IliasElementType.LINK:
            return await self._handle_link(element, element_path)
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -2,7 +2,7 @@ import os
 import re
 from dataclasses import dataclass
 from pathlib import PurePath
-from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
+from typing import Awaitable, List, Optional, Pattern, Set, Union
 from urllib.parse import urljoin

 from bs4 import BeautifulSoup, Tag
@@ -99,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):
            await self._stream_from_url(file.url, sink, bar)

    async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
-        page, url = await self.get_page()
+        page = await self.get_page()
        elements: List[Tag] = self._find_file_links(page)
        items: Set[Union[KitIpdFile, KitIpdFolder]] = set()

        for element in elements:
            folder_label = self._find_folder_label(element)
            if folder_label:
-                folder = self._extract_folder(folder_label, url)
+                folder = self._extract_folder(folder_label)
                if folder not in items:
                    items.add(folder)
                    folder.explain()
            else:
-                file = self._extract_file(element, url)
+                file = self._extract_file(element)
                items.add(file)
                log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                log.explain("Attributing it to root folder")

        return items

-    def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
+    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
        files: List[KitIpdFile] = []
        name = folder_tag.getText().strip()

        container: Tag = folder_tag.findNextSibling(name="table")
        for link in self._find_file_links(container):
-            files.append(self._extract_file(link, url))
+            files.append(self._extract_file(link))

        return KitIpdFolder(name, files)

@@ -135,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
            return None
        return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))

-    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
-        url = self._abs_url_from_link(url, link)
+    def _extract_file(self, link: Tag) -> KitIpdFile:
+        url = self._abs_url_from_link(link)
        name = os.path.basename(url)
        return KitIpdFile(name, url)

    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
        return tag.findAll(name="a", attrs={"href": self._file_regex})

-    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
-        return urljoin(url, link_tag.get("href"))
+    def _abs_url_from_link(self, link_tag: Tag) -> str:
+        return urljoin(self._url, link_tag.get("href"))

    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
        async with self.session.get(url, allow_redirects=False) as resp:
@@ -159,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):

            sink.done()

-    async def get_page(self) -> Tuple[BeautifulSoup, str]:
+    async def get_page(self) -> BeautifulSoup:
        async with self.session.get(self._url) as request:
            # The web page for Algorithmen für Routenplanung contains some
            # weird comments that beautifulsoup doesn't parse correctly. This
@@ -167,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):
            # cause issues on other pages.
            content = (await request.read()).decode("utf-8")
            content = re.sub(r"<!--.*?-->", "", content)
-            return soupify(content.encode("utf-8")), str(request.url)
+            return soupify(content.encode("utf-8"))
--- a/PFERD/update.py
+++ b/PFERD/update.py
@@ -0,0 +1,53 @@
+from dataclasses import dataclass
+import ssl
+from typing import Optional
+import aiohttp
+import certifi
+
+from .version import NAME, VERSION
+from .logging import log
+
+
+@dataclass
+class PferdUpdate:  # Release information returned by get_newer_version()
+    release_url: str  # filled from the release's "html_url" field
+    version: str  # release tag name with a leading "v" removed
+
+
+def _build_session() -> aiohttp.ClientSession:  # Create the HTTP session used for the update check
+    return aiohttp.ClientSession(
+        headers={"User-Agent": f"{NAME}/{VERSION}"},  # identify the client as "<NAME>/<VERSION>"
+        connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())),  # CA file from certifi
+        timeout=aiohttp.ClientTimeout(
+            total=15 * 60,  # NOTE(review): 15 minutes looks long for a check that runs at startup - confirm
+            connect=10,
+            sock_connect=10,
+            sock_read=10,
+        )
+    )
+
+
+async def check_for_updates() -> None:  # Warn when get_newer_version() reports a release other than VERSION
+    if new_version := await get_newer_version():
+        log.warn(
+            f"{NAME} version out of date. "
+            + f"You are running version {VERSION!r} but {new_version.version!r} was found on GitHub."
+        )
+        log.warn_contd(f"You can download it on GitHub: {new_version.release_url}")
+    else:
+        log.explain("No update found")  # reached when get_newer_version() returned None
+
+
+async def get_newer_version() -> Optional[PferdUpdate]:  # None when the latest release tag equals VERSION
+    async with _build_session() as session:
+        async with session.get(
+            "https://api.github.com/repos/Garmelon/Pferd/releases/latest",
+            headers={"Accept": "application/vnd.github+json"}
+        ) as response:
+            release_information = await response.json()  # NOTE(review): HTTP status is not checked before parsing
+            tag_name: str = release_information["tag_name"]  # NOTE(review): KeyError if the payload lacks "tag_name" - confirm handling
+            tag_name = tag_name.removeprefix("v")  # drop a leading "v" so the tag can be compared with VERSION
+            if VERSION == tag_name:
+                return None
+
+            return PferdUpdate(release_url=release_information["html_url"], version=tag_name)  # any differing tag is reported; no version ordering
--- a/PFERD/version.py
+++ b/PFERD/version.py
@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.4.3"
+VERSION = "3.4.1"
--- a/README.md
+++ b/README.md
@@ -30,10 +30,7 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.

 Unofficial packages are available for:
 - [AUR](https://aur.archlinux.org/packages/pferd)
- [brew](https://formulae.brew.sh/formula/pferd)
- [conda-forge](https://github.com/conda-forge/pferd-feedstock)
 - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
- [PyPi](https://pypi.org/project/pferd)

 See also PFERD's [repology page](https://repology.org/project/pferd/versions).