Compare commits

...

12 Commits

7 changed files with 66 additions and 23 deletions

CHANGELOG.md

@@ -22,9 +22,30 @@ ambiguous situations.
 ## Unreleased
 
+## 3.4.3 - 2022-11-29
+
+### Added
+- Missing documentation for `forums` option
+
+### Changed
+- Clear up error message shown when multiple paths are found to an element
+
 ### Fixed
-- Forum crawling crashing when parsing empty (= 0 messages) threads
+- IPD crawler unnecessarily appending trailing slashes
+- Crawling opencast when ILIAS is set to English
+
+## 3.4.2 - 2022-10-26
+
+### Added
+- Recognize and crawl content pages in cards
+- Recognize and ignore surveys
+
+### Fixed
+- Forum crawling crashing when a thread has no messages at all
 - Forum crawling crashing when a forum has no threads at all
+- Ilias login failing in some cases
+- Crawling of paginated future meetings
+- IPD crawler handling of URLs without trailing slash
 
 ## 3.4.1 - 2022-08-17

CONFIG.md

@@ -181,6 +181,7 @@ script once per day should be fine.
   redirect to the actual URL. Set to a negative value to disable the automatic
   redirect. (Default: `-1`)
 - `videos`: Whether to download videos. (Default: `no`)
+- `forums`: Whether to download forum threads. (Default: `no`)
 - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
   `20.0`)
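
For orientation, the options above slot into a crawler section of the config file. A minimal sketch (the section name, `type`, and `target` values are illustrative assumptions; only `videos`, `forums`, and `http_timeout` with their defaults come from the documented list):

    [crawl:my-course]
    type = kit-ilias-web
    target = 1234567
    videos = no
    forums = yes
    http_timeout = 20.0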
@@ -289,7 +290,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.
 
 Example: `foo/bar --> baz`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
 - Converts `foo/bar` into `baz`
-- Converts `foo/bar/wargl` into `bar/wargl`
+- Converts `foo/bar/wargl` into `baz/wargl`
 
 Example: `foo/bar --> !`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
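
As the corrected line shows, `TARGET` replaces the whole matched `SOURCE` prefix and the remainder of the path is kept. A minimal Python sketch of that prefix-rename semantics (an illustration of the documented rules, not PFERD's actual transformer; the `!` delete-target is omitted):

    from pathlib import PurePath
    from typing import Optional

    def apply_rename(path: PurePath, source: PurePath, target: PurePath) -> Optional[PurePath]:
        # The rule only matches if `source` is a whole-component prefix of `path`.
        n = len(source.parts)
        if path.parts[:n] != source.parts:
            return None  # rule does not apply
        # Replace the matched prefix with `target` and keep the rest of the path.
        return PurePath(target, *path.parts[n:])

    print(apply_rename(PurePath("foo/bar"), PurePath("foo/bar"), PurePath("baz")))        # baz
    print(apply_rename(PurePath("foo/bar/wargl"), PurePath("foo/bar"), PurePath("baz")))  # baz/wargl
    print(apply_rename(PurePath("foo/baz"), PurePath("foo/bar"), PurePath("baz")))        # None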

PFERD/crawl/ilias/kit_ilias_html.py

@@ -24,6 +24,7 @@ class IliasElementType(Enum):
     LINK = "link"
     BOOKING = "booking"
     MEETING = "meeting"
+    SURVEY = "survey"
     VIDEO = "video"
     VIDEO_PLAYER = "video_player"
     VIDEO_FOLDER = "video_folder"
@@ -133,7 +134,7 @@ class IliasPage:
         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]
 
-        form_data: Dict[str, Union[str, List[ſtr]]] = {
+        form_data: Dict[str, Union[str, List[str]]] = {
             "thread_ids[]": thread_ids,
             "selected_cmd2": "html",
             "select_cmd2": "Ausführen",
@@ -365,7 +366,7 @@ class IliasPage:
         """
         # Video start links are marked with an "Abspielen" link
         video_links: List[Tag] = self._soup.findAll(
-            name="a", text=re.compile(r"\s*Abspielen\s*")
+            name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
         )
 
         results: List[IliasPageElement] = []
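
This widened pattern is the fix for crawling opencast when ILIAS is set to English: BeautifulSoup runs the regex against each link's text, so accepting both the German and the English player label is enough. A quick self-contained check (hypothetical markup, not a real ILIAS page):

    import re

    from bs4 import BeautifulSoup

    html = '<a href="#">Abspielen</a> <a href="#">Play</a> <a href="#">Download</a>'
    soup = BeautifulSoup(html, "html.parser")

    # bs4 matches the pattern against each tag's string, so both labels are found.
    links = soup.findAll(name="a", text=re.compile(r"\s*(Abspielen|Play)\s*"))
    print([link.getText() for link in links])  # ['Abspielen', 'Play']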
@@ -730,6 +731,10 @@ class IliasPage:
             return IliasElementType.TEST
         if "fold" in icon["class"]:
             return IliasElementType.FOLDER
+        if "copa" in icon["class"]:
+            return IliasElementType.FOLDER
+        if "svy" in icon["class"]:
+            return IliasElementType.SURVEY
 
         _unexpected_html_warning()
         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")

PFERD/crawl/ilias/kit_ilias_web_crawler.py

@@ -194,7 +194,7 @@ instance's greatest bottleneck.
         self._links = section.links()
         self._videos = section.videos()
         self._forums = section.forums()
-        self._visited_urls: Set[str] = set()
+        self._visited_urls: Dict[str, PurePath] = dict()
 
     async def _run(self) -> None:
         if isinstance(self._target, int):
@@ -348,9 +348,11 @@
     ) -> Optional[Coroutine[Any, Any, None]]:
         if element.url in self._visited_urls:
             raise CrawlWarning(
-                f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
+                f"Found second path to element {element.name!r} at {element.url!r}. "
+                + f"First path: {fmt_path(self._visited_urls[element.url])}. "
+                + f"Second path: {fmt_path(parent_path)}."
             )
-        self._visited_urls.add(element.url)
+        self._visited_urls[element.url] = parent_path
 
         element_path = PurePath(parent_path, element.name)
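
The switch from `Set[str]` to `Dict[str, PurePath]` is what enables the clearer message: the crawler now remembers where each URL was first seen. A stripped-down sketch of the idea (simplified names; `RuntimeError` stands in for PFERD's `CrawlWarning`):

    from pathlib import PurePath
    from typing import Dict

    visited_urls: Dict[str, PurePath] = {}

    def check_and_mark(url: str, parent_path: PurePath) -> None:
        if url in visited_urls:
            # Both the first and the second path are available for the message.
            raise RuntimeError(
                f"Found second path to element at {url!r}. "
                f"First path: {visited_urls[url]}. Second path: {parent_path}."
            )
        visited_urls[url] = parent_path

    check_and_mark("https://ilias.example/goto.php?target=file_123", PurePath("Course/Week 1"))
    check_and_mark("https://ilias.example/goto.php?target=file_123", PurePath("Course/Week 2"))  # raises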
@@ -377,9 +379,20 @@
                 return None
             return await self._handle_forum(element, element_path)
         elif element.type == IliasElementType.TEST:
-            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
-            log.explain("Tests contain no relevant files")
-            log.explain("Answer: No")
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](tests contain no relevant data)"
+            )
+            return None
+        elif element.type == IliasElementType.SURVEY:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](surveys contain no relevant data)"
+            )
             return None
         elif element.type == IliasElementType.LINK:
             return await self._handle_link(element, element_path)

PFERD/crawl/kit_ipd_crawler.py

@@ -2,7 +2,7 @@ import os
 import re
 from dataclasses import dataclass
 from pathlib import PurePath
-from typing import Awaitable, List, Optional, Pattern, Set, Union
+from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Tag
@@ -99,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):
                     await self._stream_from_url(file.url, sink, bar)
 
     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
-        page = await self.get_page()
+        page, url = await self.get_page()
         elements: List[Tag] = self._find_file_links(page)
         items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
 
         for element in elements:
             folder_label = self._find_folder_label(element)
             if folder_label:
-                folder = self._extract_folder(folder_label)
+                folder = self._extract_folder(folder_label, url)
                 if folder not in items:
                     items.add(folder)
                     folder.explain()
             else:
-                file = self._extract_file(element)
+                file = self._extract_file(element, url)
                 items.add(file)
                 log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                 log.explain("Attributing it to root folder")
 
         return items
 
-    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
+    def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
         files: List[KitIpdFile] = []
         name = folder_tag.getText().strip()
 
         container: Tag = folder_tag.findNextSibling(name="table")
         for link in self._find_file_links(container):
-            files.append(self._extract_file(link))
+            files.append(self._extract_file(link, url))
 
         return KitIpdFolder(name, files)
@@ -135,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
             return None
         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
 
-    def _extract_file(self, link: Tag) -> KitIpdFile:
-        url = self._abs_url_from_link(link)
+    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
+        url = self._abs_url_from_link(url, link)
         name = os.path.basename(url)
         return KitIpdFile(name, url)
 
     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
         return tag.findAll(name="a", attrs={"href": self._file_regex})
 
-    def _abs_url_from_link(self, link_tag: Tag) -> str:
-        return urljoin(self._url, link_tag.get("href"))
+    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
+        return urljoin(url, link_tag.get("href"))
 
     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
         async with self.session.get(url, allow_redirects=False) as resp:
@@ -159,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):
             sink.done()
 
-    async def get_page(self) -> BeautifulSoup:
+    async def get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
@@ -167,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):
             # cause issues on other pages.
             content = (await request.read()).decode("utf-8")
             content = re.sub(r"<!--.*?-->", "", content)
-            return soupify(content.encode("utf-8"))
+            return soupify(content.encode("utf-8")), str(request.url)
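
Returning `str(request.url)` matters because aiohttp follows redirects by default, so this is the final URL including any trailing slash, and `urljoin` treats that slash as significant when resolving relative hrefs. A small demonstration of the behavior the change relies on (example URLs are made up):

    from urllib.parse import urljoin

    # Without a trailing slash, the last segment is treated as a file and replaced:
    print(urljoin("https://example.com/courses/algo", "slides.pdf"))
    # https://example.com/courses/slides.pdf

    # With the trailing slash of the post-redirect URL, the directory is kept:
    print(urljoin("https://example.com/courses/algo/", "slides.pdf"))
    # https://example.com/courses/algo/slides.pdf

Resolving links against the redirect target rather than the configured URL is why this change accompanies the trailing-slash fixes listed in the changelog.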

PFERD/version.py

@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.4.1"
+VERSION = "3.4.3"

README.md

@@ -30,7 +30,10 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
 Unofficial packages are available for:
 - [AUR](https://aur.archlinux.org/packages/pferd)
+- [brew](https://formulae.brew.sh/formula/pferd)
+- [conda-forge](https://github.com/conda-forge/pferd-feedstock)
 - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
+- [PyPi](https://pypi.org/project/pferd)
 
 See also PFERD's [repology page](https://repology.org/project/pferd/versions).