Compare commits


21 Commits

SHA1 Message Date
a241672726 Bump version to 3.4.0 2022-05-01 22:29:06 +02:00
a8f76e9be7 Use utf-8 for credential file 2022-04-29 23:15:12 +02:00
b56475450d Use utf-8 for cookies 2022-04-29 23:12:41 +02:00
aa74604d29 Use utf-8 for report 2022-04-29 23:11:27 +02:00
d2e6d91880 Make PFERD executable via python -m 2022-04-27 22:52:50 +02:00
602044ff1b Fix mypy errors and add missing await 2022-04-27 22:52:50 +02:00
31631fb409 Increase minimum python version to 3.9 2022-04-27 22:52:50 +02:00
00db348218 Update changelog 2022-04-27 22:03:52 +02:00
a709280cbf Try to detect unsupported config file encoding
The encoding detection is quite rudimentary, but should detect the default Windows encoding in many cases.
2022-04-27 22:03:47 +02:00
a99ddaa0cc Read and write config in UTF-8 2022-04-27 21:47:51 +02:00
ba3d299c05 Fix changelog 2022-04-27 21:26:24 +02:00
07a21f80a6 Link to unofficial packages 2022-04-27 21:15:33 +02:00
f17b9b68f4 Add shibboleth authentication fix to changelog 2022-04-27 14:01:40 +02:00
a2831fbea2 Fix shib authentication
Authentication previously failed if the shib session was still valid: when Shibboleth receives a request while the session is valid, it responds directly instead of issuing a second redirect.
2022-04-27 13:55:24 +02:00
da72863b47 Placate newer mypy 2022-04-03 13:19:08 +02:00
86e2e226dc Notify user when shibboleth presents new entitlements 2022-04-03 11:37:08 +02:00
7872fe5221 Fix tables with more columns than expected 2022-01-18 22:38:48 +01:00
86947e4874 Bump version to 3.3.1 2022-01-15 15:11:22 +01:00
4f022e2d19 Reword changelog 2022-01-15 15:06:02 +01:00
f47e7374d2 Use fixed windows path for video cache 2022-01-15 12:00:30 +01:00
57ec51e95a Fix login after shib url parser change 2022-01-14 20:17:27 +01:00
16 changed files with 138 additions and 39 deletions

View File

@@ -14,7 +14,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python: ["3.8"]
+        python: ["3.9"]
     steps:
       - uses: actions/checkout@v2

View File

@@ -22,6 +22,26 @@ ambiguous situations.
 ## Unreleased

+## 3.4.0 - 2022-05-01
+
+### Added
+- Message when Shibboleth entitlements need to be manually reviewed
+- Links to unofficial packages and repology in the readme
+
+### Changed
+- Increase minimum supported Python version to 3.9
+- Support video listings with more columns
+- Use UTF-8 when reading/writing the config file
+
+### Fixed
+- Crash during authentication when the Shibboleth session is still valid
+
+## 3.3.1 - 2022-01-15
+
+### Fixed
+- ILIAS login
+- Local video cache if `windows_paths` is enabled
+
 ## 3.3.0 - 2022-01-09

 ### Added

View File

@@ -159,3 +159,7 @@ def main() -> None:
         sys.exit(1)
     else:
         pferd.print_report()
+
+
+if __name__ == "__main__":
+    main()
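
This guard is what makes `python -m PFERD` work: `-m` executes the package's `__main__` module with `__name__` set to `"__main__"`, so `main()` runs when invoked that way but not on a plain import. A minimal sketch of the same pattern with a hypothetical module name:

```
# mymod/__main__.py -- hypothetical module illustrating the guard above
def main() -> None:
    print("running as a module")


if __name__ == "__main__":
    main()  # executed by `python -m mymod`, skipped on plain import
```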

View File

@@ -20,8 +20,10 @@ class CredentialFileAuthenticator(Authenticator):
         path = config.default_section.working_dir() / section.path()
         try:
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 lines = list(f)
+        except UnicodeDecodeError:
+            raise AuthLoadError(f"Credential file at {fmt_real_path(path)} is not encoded using UTF-8")
         except OSError as e:
             raise AuthLoadError(f"No credential file at {fmt_real_path(path)}") from e

View File

@@ -120,7 +120,7 @@ class Config:
         # Using config.read_file instead of config.read because config.read
         # would just ignore a missing file and carry on.
         try:
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 parser.read_file(f, source=str(path))
         except FileNotFoundError:
             raise ConfigLoadError(path, "File does not exist")
@@ -128,6 +128,8 @@ class Config:
             raise ConfigLoadError(path, "That's a directory, not a file")
         except PermissionError:
             raise ConfigLoadError(path, "Insufficient permissions")
+        except UnicodeDecodeError:
+            raise ConfigLoadError(path, "File is not encoded using UTF-8")

     def dump(self, path: Optional[Path] = None) -> None:
         """
@@ -154,12 +156,12 @@ class Config:
         try:
             # x = open for exclusive creation, failing if the file already
             # exists
-            with open(path, "x") as f:
+            with open(path, "x", encoding="utf-8") as f:
                 self._parser.write(f)
         except FileExistsError:
             print("That file already exists.")
             if asyncio.run(prompt_yes_no("Overwrite it?", default=False)):
-                with open(path, "w") as f:
+                with open(path, "w", encoding="utf-8") as f:
                     self._parser.write(f)
             else:
                 raise ConfigDumpError(path, "File already exists")
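
The `"x"` mode used by `dump()` is worth spelling out: it creates the file exclusively and raises `FileExistsError` instead of silently truncating an existing config. The pattern in isolation, with a hypothetical path:

```
from pathlib import Path

path = Path("pferd.cfg")  # hypothetical location
try:
    with open(path, "x", encoding="utf-8") as f:  # "x" = exclusive creation
        f.write("[DEFAULT]\n")
except FileExistsError:
    print(f"{path} already exists; prompting before overwriting")
```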

View File

@@ -1,9 +1,10 @@
 import asyncio
 import os
 from abc import ABC, abstractmethod
+from collections.abc import Awaitable, Coroutine
 from datetime import datetime
 from pathlib import Path, PurePath
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar
+from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar

 from ..auth import Authenticator
 from ..config import Config, Section
@@ -58,7 +59,7 @@ def noncritical(f: Wrapped) -> Wrapped:
     return wrapper  # type: ignore


-AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]])
+AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]])


 def anoncritical(f: AWrapped) -> AWrapped:
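
Why `Coroutine` instead of `Awaitable`? An `async def` function is typed as returning a `Coroutine`, and some APIs — notably `asyncio.create_task` in recent type stubs — want a `Coroutine` rather than an arbitrary `Awaitable`. A runnable sketch under that reading, with a hypothetical decorator standing in for `anoncritical`:

```
import asyncio
from collections.abc import Coroutine
from typing import Any, Callable, Optional, TypeVar

Wrapped = TypeVar("Wrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]])


def swallow_errors(f: Wrapped) -> Wrapped:
    """Hypothetical stand-in for anoncritical: catch everything, return None."""
    async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
        try:
            return await f(*args, **kwargs)
        except Exception:
            return None
    return wrapper  # type: ignore


@swallow_errors
async def fetch() -> Optional[str]:
    raise RuntimeError("boom")


async def main() -> None:
    task = asyncio.create_task(fetch())  # fetch() yields a Coroutine, as create_task expects
    print(await task)  # None -- the error was swallowed


asyncio.run(main())
```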

View File

@@ -108,7 +108,7 @@ class HttpCrawler(Crawler):
     def _load_cookies_from_file(self, path: Path) -> None:
         jar: Any = http.cookies.SimpleCookie()
-        with open(path) as f:
+        with open(path, encoding="utf-8") as f:
             for i, line in enumerate(f):
                 # Names of headers are case insensitive
                 if line[:11].lower() == "set-cookie:":
@@ -121,7 +121,7 @@ class HttpCrawler(Crawler):
         jar: Any = http.cookies.SimpleCookie()
         for morsel in self._cookie_jar:
             jar[morsel.key] = morsel
-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
             f.write(jar.output(sep="\n"))
             f.write("\n")  # A trailing newline is just common courtesy

View File

@@ -280,11 +280,22 @@ class IliasPage:
     def _listed_video_to_element(self, link: Tag) -> IliasPageElement:
         # The link is part of a table with multiple columns, describing metadata.
-        # 6th child (1 indexed) is the modification time string
-        modification_string = link.parent.parent.parent.select_one(
-            "td.std:nth-child(6)"
-        ).getText().strip()
-        modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
+        # 6th or 7th child (1 indexed) is the modification time string. Try to find it
+        # by parsing backwards from the end and finding something that looks like a date
+        modification_time = None
+        row: Tag = link.parent.parent.parent
+        column_count = len(row.select("td.std"))
+        for index in range(column_count, 0, -1):
+            modification_string = link.parent.parent.parent.select_one(
+                f"td.std:nth-child({index})"
+            ).getText().strip()
+            if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
+                modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
+                break
+
+        if modification_time is None:
+            log.warn(f"Could not determine upload time for {link}")
+            modification_time = datetime.now()

         title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip()
         title += ".mp4"
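
The backwards scan hinges on the date pattern; here it is in isolation, with hypothetical cell texts (and the second dot escaped for strictness in this sketch):

```
import re
from datetime import datetime

# Hypothetical column texts from one listing row
cells = ["", "Lecture 01", "Some Lecturer", "01:23:45", "05.05.2022 - 12:34"]

modification_time = None
for text in reversed(cells):  # scan from the last column backwards
    if re.search(r"\d+\.\d+\.\d+ - \d+:\d+", text):
        modification_time = datetime.strptime(text, "%d.%m.%Y - %H:%M")
        break

print(modification_time)  # 2022-05-05 12:34:00
```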

View File

@@ -1,9 +1,11 @@
 import asyncio
 import re
+from collections.abc import Awaitable, Coroutine
 from pathlib import PurePath
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Set, Union, cast

 import aiohttp
+import yarl
 from aiohttp import hdrs
 from bs4 import BeautifulSoup, Tag
@@ -12,7 +14,7 @@ from ...config import Config
 from ...logging import ProgressBar, log
 from ...output_dir import FileSink, Redownload
 from ...utils import fmt_path, soupify, url_set_query_param
-from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
+from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
@@ -81,8 +83,6 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([
     IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
 ])

-AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]])
-

 def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]:
     def decorator(f: AWrapped) -> AWrapped:
@@ -251,7 +251,7 @@ instance's greatest bottleneck.
         url: str,
         parent: IliasPageElement,
         path: PurePath,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         maybe_cl = await self.crawl(path)
         if not maybe_cl:
             return None
@@ -309,7 +309,7 @@ instance's greatest bottleneck.
         self,
         parent_path: PurePath,
         element: IliasPageElement,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         if element.url in self._visited_urls:
             raise CrawlWarning(
                 f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
@@ -359,7 +359,7 @@ instance's greatest bottleneck.
         self,
         element: IliasPageElement,
         element_path: PurePath,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
         log.explain(f"Links type is {self._links}")
@@ -406,7 +406,7 @@ instance's greatest bottleneck.
         self,
         element: IliasPageElement,
         element_path: PurePath,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         log.explain_topic(f"Decision: Crawl Booking Link {fmt_path(element_path)}")
         log.explain(f"Links type is {self._links}")
@@ -442,7 +442,7 @@ instance's greatest bottleneck.
         if hdrs.LOCATION not in resp.headers:
             return soupify(await resp.read()).select_one("a").get("href").strip()

-        self._authenticate()
+        await self._authenticate()

         async with self.session.get(export_url, allow_redirects=False) as resp:
             # No redirect means we were authenticated
@@ -455,7 +455,7 @@ instance's greatest bottleneck.
         self,
         element: IliasPageElement,
         element_path: PurePath,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         # Copy old mapping as it is likely still relevant
         if self.prev_report:
             self.report.add_custom_value(
@@ -498,7 +498,7 @@ instance's greatest bottleneck.
         log.explain_topic(f"Checking local cache for video {video_path.name}")
         all_found_locally = True
         for video in contained_videos:
-            transformed_path = self._transformer.transform(video)
+            transformed_path = self._to_local_video_path(video)
             if transformed_path:
                 exists_locally = self._output_dir.resolve(transformed_path).exists()
                 all_found_locally = all_found_locally and exists_locally
@@ -508,6 +508,11 @@ instance's greatest bottleneck.
             log.explain("Missing at least one video, continuing with requests!")
             return False

+    def _to_local_video_path(self, path: PurePath) -> Optional[PurePath]:
+        if transformed := self._transformer.transform(path):
+            return self._deduplicator.fixup_path(transformed)
+        return None
+
     @anoncritical
     @_iorepeat(3, "downloading video")
     async def _download_video(
@@ -527,7 +532,7 @@ instance's greatest bottleneck.
             log.explain(f"Using single video mode for {element.name}")
             stream_element = stream_elements[0]

-            transformed_path = self._transformer.transform(original_path)
+            transformed_path = self._to_local_video_path(original_path)
             if not transformed_path:
                 raise CrawlError(f"Download returned a path but transform did not for {original_path}")
@@ -558,7 +563,7 @@ instance's greatest bottleneck.
         self,
         element: IliasPageElement,
         element_path: PurePath,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         maybe_dl = await self.download(element_path, mtime=element.mtime)
         if not maybe_dl:
             return None
@@ -674,14 +679,14 @@ class KitShibbolethLogin:
         # Equivalent: Click on "Mit KIT-Account anmelden" button in
         # https://ilias.studium.kit.edu/login.php
-        url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login"
+        url = "https://ilias.studium.kit.edu/shib_login.php"
         data = {
             "sendLogin": "1",
             "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
-            "target": "/shib_login.php",
-            "home_organization_selection": "Mit KIT-Account anmelden",
+            "il_target": "",
+            "home_organization_selection": "Weiter",
         }
-        soup: BeautifulSoup = await _post(sess, url, data)
+        soup: BeautifulSoup = await _shib_post(sess, url, data)

         # Attempt to login using credentials, if necessary
         while not self._login_successful(soup):
@@ -704,6 +709,12 @@ class KitShibbolethLogin:
             }
             soup = await _post(sess, url, data)

+            if soup.find(id="attributeRelease"):
+                raise CrawlError(
+                    "ILIAS Shibboleth entitlements changed! "
+                    "Please log in once in your browser and review them"
+                )
+
             if self._tfa_required(soup):
                 soup = await self._authenticate_tfa(sess, soup)
@@ -761,3 +772,37 @@ class KitShibbolethLogin:
 async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
     async with session.post(url, data=data) as response:
         return soupify(await response.read())
+
+
+async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
+    """
+    aiohttp unescapes '/' and ':' in URL query parameters, which is not RFC compliant and gets
+    rejected by Shibboleth. Thanks a lot. So now we unroll the requests manually, parse location
+    headers and build encoded URL objects ourselves... Who thought mangling the location header
+    was a good idea??
+    """
+    async with session.post(url, data=data, allow_redirects=False) as response:
+        location = response.headers.get("location")
+        if not location:
+            raise CrawlWarning(f"Login failed (1), no location header present at {url}")
+        correct_url = yarl.URL(location, encoded=True)
+
+        async with session.get(correct_url, allow_redirects=False) as response:
+            location = response.headers.get("location")
+
+            # If shib still has a valid session, it will directly respond to the request
+            if location is None:
+                return soupify(await response.read())
+
+            as_yarl = yarl.URL(response.url)
+            # Probably not needed anymore, but might catch a few weird situations with a nicer message
+            if not location or not as_yarl.host:
+                raise CrawlWarning(f"Login failed (2), no location header present at {correct_url}")
+
+            correct_url = yarl.URL.build(
+                scheme=as_yarl.scheme,
+                host=as_yarl.host,
+                path=location,
+                encoded=True
+            )
+
+            async with session.get(correct_url, allow_redirects=False) as response:
+                return soupify(await response.read())
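
The `encoded=True` flag is the crux of `_shib_post`: by default yarl requotes URLs, normalizing `%3A` and `%2F` in query values back to `:` and `/`, which the Shibboleth endpoint rejects. A standalone illustration with a hypothetical redirect target:

```
import yarl

location = "https://idp.example.org/sso?target=https%3A%2F%2Filias.example.org%2Fshib_login.php"

print(yarl.URL(location))
# https://idp.example.org/sso?target=https://ilias.example.org/shib_login.php

print(yarl.URL(location, encoded=True))
# https://idp.example.org/sso?target=https%3A%2F%2Filias.example.org%2Fshib_login.php
```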

View File

@@ -56,6 +56,12 @@ class Deduplicator:
             log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility")
         return new_path

+    def fixup_path(self, path: PurePath) -> PurePath:
+        """Fixes up the path for windows, if enabled. Returns the path unchanged otherwise."""
+        if self._windows_paths:
+            return self._fixup_for_windows(path)
+        return path
+
     def mark(self, path: PurePath) -> PurePath:
         if self._windows_paths:
             path = self._fixup_for_windows(path)

View File

@@ -68,7 +68,7 @@ class Log:
         if self._download_progress.task_ids:
             elements.append(self._download_progress)

-        group = Group(*elements)  # type: ignore
+        group = Group(*elements)
         self._live.update(group)

     @contextmanager
@contextmanager @contextmanager

View File

@@ -503,7 +503,7 @@ class OutputDirectory:
         try:
             self._prev_report = Report.load(self._report_path)
             log.explain("Loaded report successfully")
-        except (OSError, json.JSONDecodeError, ReportLoadError) as e:
+        except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e:
             log.explain("Failed to load report")
             log.explain(str(e))

View File

@@ -100,10 +100,10 @@ class Report:
     @classmethod
     def load(cls, path: Path) -> "Report":
         """
-        May raise OSError, JsonDecodeError, ReportLoadError.
+        May raise OSError, UnicodeDecodeError, JsonDecodeError, ReportLoadError.
         """

-        with open(path) as f:
+        with open(path, encoding="utf-8") as f:
             data = json.load(f)

         if not isinstance(data, dict):
@@ -148,7 +148,7 @@ class Report:
             "encountered_errors": self.encountered_errors,
         }

-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
             json.dump(data, f, indent=2, sort_keys=True)
             f.write("\n")  # json.dump doesn't do this

View File

@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.3.0"
+VERSION = "3.4.0"

View File

@@ -17,7 +17,7 @@ Binaries for Linux, Windows and Mac can be downloaded directly from the

 ### With pip

-Ensure you have at least Python 3.8 installed. Run the following command to
+Ensure you have at least Python 3.9 installed. Run the following command to
 install PFERD or upgrade it to the latest version:

 ```
@@ -26,6 +26,14 @@ $ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest
 The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.

+### With package managers
+
+Unofficial packages are available for:
+- [AUR](https://aur.archlinux.org/packages/pferd)
+- [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
+
+See also PFERD's [repology page](https://repology.org/project/pferd/versions).
+
 ## Basic usage

 PFERD can be run directly from the command line with no config file. Run `pferd

View File

@@ -4,7 +4,7 @@ version = attr: PFERD.version.VERSION

 [options]
 packages = find:
-python_requires = >=3.8
+python_requires = >=3.9
 install_requires =
     aiohttp>=3.8.1
     beautifulsoup4>=4.10.0