Mirror of https://github.com/Garmelon/PFERD.git (synced 2023-12-21 10:23:01 +01:00)

Compare commits: update-che...v3.4.3 (12 commits)
Commits (SHA1):

6d44aac278
55a2de6b88
c0d6d8b229
635caa765d
e69b55b349
07200bbde5
c020cccc64
259cfc20cc
37b51a66d8
f47d2f11d8
1b6be6bd79
e1430e6298
CHANGELOG.md
@@ -22,9 +22,30 @@ ambiguous situations.

 ## Unreleased

+## 3.4.3 - 2022-11-29
+
+### Added
+- Missing documentation for `forums` option
+
+### Changed
+- Clear up error message shown when multiple paths are found to an element
+
+### Fixed
+- Forum crawling crashing when parsing empty (= 0 messages) threads
+- IPD crawler unnecessarily appending trailing slashes
+- Crawling opencast when ILIAS is set to English
+
+## 3.4.2 - 2022-10-26
+
+### Added
+- Recognize and crawl content pages in cards
+- Recognize and ignore surveys
+
+### Fixed
+- Forum crawling crashing when a thread has no messages at all
+- Forum crawling crashing when a forum has no threads at all
+- Ilias login failing in some cases
+- Crawling of paginated future meetings
+- IPD crawler handling of URLs without trailing slash
+
 ## 3.4.1 - 2022-08-17

@@ -181,6 +181,7 @@ script once per day should be fine.

   redirect to the actual URL. Set to a negative value to disable the automatic
   redirect. (Default: `-1`)
 - `videos`: Whether to download videos. (Default: `no`)
+- `forums`: Whether to download forum threads. (Default: `no`)
 - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
   `20.0`)

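For context, these options live in a crawler section of PFERD's config file. A minimal sketch showing where the newly documented `forums` option fits (the section name and `target` value are illustrative, not taken from this diff):

```ini
[crawl:my-ilias-course]
type = kit-ilias-web
target = 1234567
videos = no
forums = yes
http_timeout = 20.0
```
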
@@ -289,7 +290,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.

 Example: `foo/bar --> baz`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
 - Converts `foo/bar` into `baz`
-- Converts `foo/bar/wargl` into `bar/wargl`
+- Converts `foo/bar/wargl` into `baz/wargl`

 Example: `foo/bar --> !`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`

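The corrected example pins down the rename semantics: `SOURCE --> TARGET` matches `SOURCE` exactly or as a leading path prefix, and re-roots everything below it under `TARGET`. A minimal sketch of the prefix-rename rule (simplified; not PFERD's actual transform engine):

```python
from pathlib import PurePath
from typing import Optional


def apply_rename(path: PurePath, source: PurePath, target: PurePath) -> Optional[PurePath]:
    """Apply a `SOURCE --> TARGET` rule; return None if the rule doesn't match."""
    if path == source:
        return target
    if source in path.parents:
        # Re-root the remainder below `source` under `target`,
        # e.g. foo/bar/wargl -> baz/wargl for the rule foo/bar --> baz.
        return target / path.relative_to(source)
    return None  # e.g. `foo`, `a/foo/bar` or `foo/baz` don't match `foo/bar`


assert apply_rename(PurePath("foo/bar"), PurePath("foo/bar"), PurePath("baz")) == PurePath("baz")
assert apply_rename(PurePath("foo/bar/wargl"), PurePath("foo/bar"), PurePath("baz")) == PurePath("baz/wargl")
assert apply_rename(PurePath("foo/baz"), PurePath("foo/bar"), PurePath("baz")) is None
```
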
@@ -5,8 +5,6 @@ import os

 import sys
 from pathlib import Path

-from PFERD.update import check_for_updates
-
 from .auth import AuthLoadError
 from .cli import PARSER, ParserLoadError, load_default_section
 from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError

@@ -136,11 +134,6 @@ def main() -> None:

             loop.run_until_complete(asyncio.sleep(1))
             loop.close()
         else:
-            log.explain_topic("Checking for updates")
-            if not args.skip_update_check:
-                asyncio.run(check_for_updates())
-            else:
-                log.explain("Update check skipped due to configuration option")
             asyncio.run(pferd.run(args.debug_transforms))
     except (ConfigOptionError, AuthLoadError) as e:
         log.unlock()

@@ -151,11 +151,6 @@ PARSER.add_argument(

     action="version",
     version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
 )
-PARSER.add_argument(
-    "--skip-update-check",
-    action="store_true",
-    help="disable automatic update checks at startup"
-)
 PARSER.add_argument(
     "--config", "-c",
     type=Path,

@@ -24,6 +24,7 @@ class IliasElementType(Enum):

     LINK = "link"
     BOOKING = "booking"
     MEETING = "meeting"
+    SURVEY = "survey"
     VIDEO = "video"
     VIDEO_PLAYER = "video_player"
     VIDEO_FOLDER = "video_folder"

@@ -133,7 +134,7 @@ class IliasPage:

         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]

-        form_data: Dict[str, Union[str, List[str]]] = {
+        form_data: Dict[str, Union[str, List[str]]] = {
             "thread_ids[]": thread_ids,
             "selected_cmd2": "html",
             "select_cmd2": "Ausführen",

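The `Dict[str, Union[str, List[str]]]` annotation exists because `thread_ids[]` is a repeated form field: a single key carrying one value per selected thread. A sketch of how such a mapping can be serialized for a POST request (illustrative only, not PFERD's actual export code; `aiohttp` is the HTTP library PFERD already uses):

```python
from typing import Dict, List, Union

import aiohttp


async def post_form(url: str, form_data: Dict[str, Union[str, List[str]]]) -> bytes:
    """POST a form in which some fields, like "thread_ids[]", repeat."""
    data = aiohttp.FormData()
    for key, value in form_data.items():
        # A list value becomes one field per element, producing
        # thread_ids[]=1&thread_ids[]=2&... on the wire.
        for item in value if isinstance(value, list) else [value]:
            data.add_field(key, item)
    async with aiohttp.ClientSession() as session:
        async with session.post(url, data=data) as response:
            return await response.read()
```
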
@@ -365,7 +366,7 @@

         """
         # Video start links are marked with an "Abspielen" link
         video_links: List[Tag] = self._soup.findAll(
-            name="a", text=re.compile(r"\s*Abspielen\s*")
+            name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
         )

         results: List[IliasPageElement] = []

@@ -730,6 +731,10 @@

             return IliasElementType.TEST
         if "fold" in icon["class"]:
             return IliasElementType.FOLDER
+        if "copa" in icon["class"]:
+            return IliasElementType.FOLDER
+        if "svy" in icon["class"]:
+            return IliasElementType.SURVEY

         _unexpected_html_warning()
         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")

@@ -194,7 +194,7 @@ instance's greatest bottleneck.

         self._links = section.links()
         self._videos = section.videos()
         self._forums = section.forums()
-        self._visited_urls: Set[str] = set()
+        self._visited_urls: Dict[str, PurePath] = dict()

     async def _run(self) -> None:
         if isinstance(self._target, int):

@@ -348,9 +348,11 @@ instance's greatest bottleneck.

     ) -> Optional[Coroutine[Any, Any, None]]:
         if element.url in self._visited_urls:
             raise CrawlWarning(
-                f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
+                f"Found second path to element {element.name!r} at {element.url!r}. "
+                + f"First path: {fmt_path(self._visited_urls[element.url])}. "
+                + f"Second path: {fmt_path(parent_path)}."
             )
-        self._visited_urls.add(element.url)
+        self._visited_urls[element.url] = parent_path

         element_path = PurePath(parent_path, element.name)

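Together, these two hunks implement the changelog entry "Clear up error message shown when multiple paths are found to an element": replacing the `Set[str]` with a `Dict[str, PurePath]` lets the crawler remember where it first saw each URL, so the warning can report both conflicting paths. A standalone sketch of the pattern (simplified names, not the crawler's real API):

```python
from pathlib import PurePath
from typing import Dict

visited_urls: Dict[str, PurePath] = {}


def visit(url: str, parent_path: PurePath) -> None:
    if url in visited_urls:
        # Because the dict stores the first path alongside the URL,
        # the error can name both paths instead of just the second one.
        raise RuntimeError(
            f"Found second path to {url!r}. "
            f"First path: {visited_urls[url]}. Second path: {parent_path}."
        )
    visited_urls[url] = parent_path


visit("https://example.com/file/1", PurePath("course/week1"))
try:
    visit("https://example.com/file/1", PurePath("course/week2"))
except RuntimeError as error:
    print(error)
```
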
@@ -377,9 +379,20 @@ instance's greatest bottleneck.

                 return None
             return await self._handle_forum(element, element_path)
         elif element.type == IliasElementType.TEST:
-            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
-            log.explain("Tests contain no relevant files")
-            log.explain("Answer: No")
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](tests contain no relevant data)"
+            )
             return None
+        elif element.type == IliasElementType.SURVEY:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](surveys contain no relevant data)"
+            )
+            return None
         elif element.type == IliasElementType.LINK:
             return await self._handle_link(element, element_path)

@@ -2,7 +2,7 @@ import os

 import re
 from dataclasses import dataclass
 from pathlib import PurePath
-from typing import Awaitable, List, Optional, Pattern, Set, Union
+from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
 from urllib.parse import urljoin

 from bs4 import BeautifulSoup, Tag

@@ -99,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):

             await self._stream_from_url(file.url, sink, bar)

     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
-        page = await self.get_page()
+        page, url = await self.get_page()
         elements: List[Tag] = self._find_file_links(page)
         items: Set[Union[KitIpdFile, KitIpdFolder]] = set()

         for element in elements:
             folder_label = self._find_folder_label(element)
             if folder_label:
-                folder = self._extract_folder(folder_label)
+                folder = self._extract_folder(folder_label, url)
                 if folder not in items:
                     items.add(folder)
                     folder.explain()
             else:
-                file = self._extract_file(element)
+                file = self._extract_file(element, url)
                 items.add(file)
                 log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                 log.explain("Attributing it to root folder")

         return items

-    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
+    def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
         files: List[KitIpdFile] = []
         name = folder_tag.getText().strip()

         container: Tag = folder_tag.findNextSibling(name="table")
         for link in self._find_file_links(container):
-            files.append(self._extract_file(link))
+            files.append(self._extract_file(link, url))

         return KitIpdFolder(name, files)

@@ -135,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):

             return None
         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))

-    def _extract_file(self, link: Tag) -> KitIpdFile:
-        url = self._abs_url_from_link(link)
+    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
+        url = self._abs_url_from_link(url, link)
         name = os.path.basename(url)
         return KitIpdFile(name, url)

     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
         return tag.findAll(name="a", attrs={"href": self._file_regex})

-    def _abs_url_from_link(self, link_tag: Tag) -> str:
-        return urljoin(self._url, link_tag.get("href"))
+    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
+        return urljoin(url, link_tag.get("href"))

     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
         async with self.session.get(url, allow_redirects=False) as resp:

@@ -159,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):

         sink.done()

-    async def get_page(self) -> BeautifulSoup:
+    async def get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This

@@ -167,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):

             # cause issues on other pages.
             content = (await request.read()).decode("utf-8")
             content = re.sub(r"<!--.*?-->", "", content)
-            return soupify(content.encode("utf-8"))
+            return soupify(content.encode("utf-8")), str(request.url)

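The kit_ipd hunks above all serve one fix: `get_page` now also returns the final response URL (`str(request.url)`, i.e. the URL after any redirects), and that URL is threaded through to `_abs_url_from_link`, so relative hrefs resolve against the page the server actually delivered rather than the configured `self._url`. This matters because `urljoin` treats a base without a trailing slash as a file and drops its last segment, which is the "IPD crawler handling of URLs without trailing slash" fix from the changelog. A quick illustration (URLs are made up):

```python
from urllib.parse import urljoin

# Base without trailing slash: "course" is treated as a file and replaced.
assert urljoin("https://example.com/teaching/course", "slides.pdf") \
    == "https://example.com/teaching/slides.pdf"

# Base with trailing slash (e.g. the redirect target): the join is correct.
assert urljoin("https://example.com/teaching/course/", "slides.pdf") \
    == "https://example.com/teaching/course/slides.pdf"
```
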
@@ -1,53 +0,0 @@

-from dataclasses import dataclass
-import ssl
-from typing import Optional
-import aiohttp
-import certifi
-
-from .version import NAME, VERSION
-from .logging import log
-
-
-@dataclass
-class PferdUpdate:
-    release_url: str
-    version: str
-
-
-def _build_session() -> aiohttp.ClientSession:
-    return aiohttp.ClientSession(
-        headers={"User-Agent": f"{NAME}/{VERSION}"},
-        connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())),
-        timeout=aiohttp.ClientTimeout(
-            total=15 * 60,
-            connect=10,
-            sock_connect=10,
-            sock_read=10,
-        )
-    )
-
-
-async def check_for_updates() -> None:
-    if new_version := await get_newer_version():
-        log.warn(
-            f"{NAME} version out of date. "
-            + f"You are running version {VERSION!r} but {new_version.version!r} was found on GitHub."
-        )
-        log.warn_contd(f"You can download it on GitHub: {new_version.release_url}")
-    else:
-        log.explain("No update found")
-
-
-async def get_newer_version() -> Optional[PferdUpdate]:
-    async with _build_session() as session:
-        async with session.get(
-            "https://api.github.com/repos/Garmelon/Pferd/releases/latest",
-            headers={"Accept": "application/vnd.github+json"}
-        ) as response:
-            release_information = await response.json()
-            tag_name: str = release_information["tag_name"]
-            tag_name = tag_name.removeprefix("v")
-            if VERSION == tag_name:
-                return None
-
-            return PferdUpdate(release_url=release_information["html_url"], version=tag_name)

@@ -1,2 +1,2 @@

 NAME = "PFERD"
-VERSION = "3.4.1"
+VERSION = "3.4.3"

@@ -30,7 +30,10 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.

 Unofficial packages are available for:
 - [AUR](https://aur.archlinux.org/packages/pferd)
 - [brew](https://formulae.brew.sh/formula/pferd)
 - [conda-forge](https://github.com/conda-forge/pferd-feedstock)
 - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
 - [PyPi](https://pypi.org/project/pferd)

 See also PFERD's [repology page](https://repology.org/project/pferd/versions).