Compare commits

..

17 Commits

Author SHA1 Message Date
a241672726 Bump version to 3.4.0 2022-05-01 22:29:06 +02:00
a8f76e9be7 Use utf-8 for credential file 2022-04-29 23:15:12 +02:00
b56475450d Use utf-8 for cookies 2022-04-29 23:12:41 +02:00
aa74604d29 Use utf-8 for report 2022-04-29 23:11:27 +02:00
d2e6d91880 Make PFERD executable via python -m 2022-04-27 22:52:50 +02:00
602044ff1b Fix mypy errors and add missing await 2022-04-27 22:52:50 +02:00
31631fb409 Increase minimum python version to 3.9 2022-04-27 22:52:50 +02:00
00db348218 Update changelog 2022-04-27 22:03:52 +02:00
a709280cbf Try to detect unsupported config file encoding
The encoding detection is quite rudimentary, but should detect the
default Windows encoding in many cases.
2022-04-27 22:03:47 +02:00
a99ddaa0cc Read and write config in UTF-8 2022-04-27 21:47:51 +02:00
ba3d299c05 Fix changelog 2022-04-27 21:26:24 +02:00
07a21f80a6 Link to unofficial packages 2022-04-27 21:15:33 +02:00
f17b9b68f4 Add shibboleth authentication fix to changelog 2022-04-27 14:01:40 +02:00
a2831fbea2 Fix shib authentication
Authentication failed previously if the shib session was still valid.
If Shibboleth gets a request and the session is still valid, it directly
responds without a second redirect.
2022-04-27 13:55:24 +02:00
da72863b47 Placate newer mypy 2022-04-03 13:19:08 +02:00
86e2e226dc Notify user when shibboleth presents new entitlements 2022-04-03 11:37:08 +02:00
7872fe5221 Fix tables with more columns than expected 2022-01-18 22:38:48 +01:00
15 changed files with 87 additions and 36 deletions

View File

@ -14,7 +14,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python: ["3.8"]
python: ["3.9"]
steps:
- uses: actions/checkout@v2

View File

@ -22,6 +22,20 @@ ambiguous situations.
## Unreleased
## 3.4.0 - 2022-05-01
### Added
- Message when Shibboleth entitlements need to be manually reviewed
- Links to unofficial packages and repology in the readme
### Changed
- Increase minimum supported Python version to 3.9
- Support video listings with more columns
- Use UTF-8 when reading/writing the config file
### Fixed
- Crash during authentication when the Shibboleth session is still valid
## 3.3.1 - 2022-01-15
### Fixed

View File

@ -159,3 +159,7 @@ def main() -> None:
sys.exit(1)
else:
pferd.print_report()
if __name__ == "__main__":
main()

View File

@ -20,8 +20,10 @@ class CredentialFileAuthenticator(Authenticator):
path = config.default_section.working_dir() / section.path()
try:
with open(path) as f:
with open(path, encoding="utf-8") as f:
lines = list(f)
except UnicodeDecodeError:
raise AuthLoadError(f"Credential file at {fmt_real_path(path)} is not encoded using UTF-8")
except OSError as e:
raise AuthLoadError(f"No credential file at {fmt_real_path(path)}") from e

View File

@ -120,7 +120,7 @@ class Config:
# Using config.read_file instead of config.read because config.read
# would just ignore a missing file and carry on.
try:
with open(path) as f:
with open(path, encoding="utf-8") as f:
parser.read_file(f, source=str(path))
except FileNotFoundError:
raise ConfigLoadError(path, "File does not exist")
@ -128,6 +128,8 @@ class Config:
raise ConfigLoadError(path, "That's a directory, not a file")
except PermissionError:
raise ConfigLoadError(path, "Insufficient permissions")
except UnicodeDecodeError:
raise ConfigLoadError(path, "File is not encoded using UTF-8")
def dump(self, path: Optional[Path] = None) -> None:
"""
@ -154,12 +156,12 @@ class Config:
try:
# x = open for exclusive creation, failing if the file already
# exists
with open(path, "x") as f:
with open(path, "x", encoding="utf-8") as f:
self._parser.write(f)
except FileExistsError:
print("That file already exists.")
if asyncio.run(prompt_yes_no("Overwrite it?", default=False)):
with open(path, "w") as f:
with open(path, "w", encoding="utf-8") as f:
self._parser.write(f)
else:
raise ConfigDumpError(path, "File already exists")

View File

@ -1,9 +1,10 @@
import asyncio
import os
from abc import ABC, abstractmethod
from collections.abc import Awaitable, Coroutine
from datetime import datetime
from pathlib import Path, PurePath
from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar
from ..auth import Authenticator
from ..config import Config, Section
@ -58,7 +59,7 @@ def noncritical(f: Wrapped) -> Wrapped:
return wrapper # type: ignore
AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]])
AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]])
def anoncritical(f: AWrapped) -> AWrapped:

View File

@ -108,7 +108,7 @@ class HttpCrawler(Crawler):
def _load_cookies_from_file(self, path: Path) -> None:
jar: Any = http.cookies.SimpleCookie()
with open(path) as f:
with open(path, encoding="utf-8") as f:
for i, line in enumerate(f):
# Names of headers are case insensitive
if line[:11].lower() == "set-cookie:":
@ -121,7 +121,7 @@ class HttpCrawler(Crawler):
jar: Any = http.cookies.SimpleCookie()
for morsel in self._cookie_jar:
jar[morsel.key] = morsel
with open(path, "w") as f:
with open(path, "w", encoding="utf-8") as f:
f.write(jar.output(sep="\n"))
f.write("\n") # A trailing newline is just common courtesy

View File

@ -280,11 +280,22 @@ class IliasPage:
def _listed_video_to_element(self, link: Tag) -> IliasPageElement:
# The link is part of a table with multiple columns, describing metadata.
# 6th child (1 indexed) is the modification time string
# 6th or 7th child (1 indexed) is the modification time string. Try to find it
# by parsing backwards from the end and finding something that looks like a date
modification_time = None
row: Tag = link.parent.parent.parent
column_count = len(row.select("td.std"))
for index in range(column_count, 0, -1):
modification_string = link.parent.parent.parent.select_one(
"td.std:nth-child(6)"
f"td.std:nth-child({index})"
).getText().strip()
if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
break
if modification_time is None:
log.warn(f"Could not determine upload time for {link}")
modification_time = datetime.now()
title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip()
title += ".mp4"

View File

@ -1,7 +1,8 @@
import asyncio
import re
from collections.abc import Awaitable, Coroutine
from pathlib import PurePath
from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union, cast
from typing import Any, Callable, Dict, List, Optional, Set, Union, cast
import aiohttp
import yarl
@ -13,7 +14,7 @@ from ...config import Config
from ...logging import ProgressBar, log
from ...output_dir import FileSink, Redownload
from ...utils import fmt_path, soupify, url_set_query_param
from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
from ..http_crawler import HttpCrawler, HttpCrawlerSection
from .file_templates import Links
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
@ -82,8 +83,6 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([
IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
])
AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]])
def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]:
def decorator(f: AWrapped) -> AWrapped:
@ -252,7 +251,7 @@ instance's greatest bottleneck.
url: str,
parent: IliasPageElement,
path: PurePath,
) -> Optional[Awaitable[None]]:
) -> Optional[Coroutine[Any, Any, None]]:
maybe_cl = await self.crawl(path)
if not maybe_cl:
return None
@ -310,7 +309,7 @@ instance's greatest bottleneck.
self,
parent_path: PurePath,
element: IliasPageElement,
) -> Optional[Awaitable[None]]:
) -> Optional[Coroutine[Any, Any, None]]:
if element.url in self._visited_urls:
raise CrawlWarning(
f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
@ -360,7 +359,7 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> Optional[Awaitable[None]]:
) -> Optional[Coroutine[Any, Any, None]]:
log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
log.explain(f"Links type is {self._links}")
@ -407,7 +406,7 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> Optional[Awaitable[None]]:
) -> Optional[Coroutine[Any, Any, None]]:
log.explain_topic(f"Decision: Crawl Booking Link {fmt_path(element_path)}")
log.explain(f"Links type is {self._links}")
@ -443,7 +442,7 @@ instance's greatest bottleneck.
if hdrs.LOCATION not in resp.headers:
return soupify(await resp.read()).select_one("a").get("href").strip()
self._authenticate()
await self._authenticate()
async with self.session.get(export_url, allow_redirects=False) as resp:
# No redirect means we were authenticated
@ -456,7 +455,7 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> Optional[Awaitable[None]]:
) -> Optional[Coroutine[Any, Any, None]]:
# Copy old mapping as it is likely still relevant
if self.prev_report:
self.report.add_custom_value(
@ -564,7 +563,7 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> Optional[Awaitable[None]]:
) -> Optional[Coroutine[Any, Any, None]]:
maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl:
return None
@ -710,6 +709,12 @@ class KitShibbolethLogin:
}
soup = await _post(sess, url, data)
if soup.find(id="attributeRelease"):
raise CrawlError(
"ILIAS Shibboleth entitlements changed! "
"Please log in once in your browser and review them"
)
if self._tfa_required(soup):
soup = await self._authenticate_tfa(sess, soup)
@ -778,15 +783,19 @@ async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> Bea
async with session.post(url, data=data, allow_redirects=False) as response:
location = response.headers.get("location")
if not location:
raise CrawlWarning(f"Login failed, no location header present at {url}")
raise CrawlWarning(f"Login failed (1), no location header present at {url}")
correct_url = yarl.URL(location, encoded=True)
async with session.get(correct_url, allow_redirects=False) as response:
as_yarl = yarl.URL(response.url)
location = response.headers.get("location")
# If shib still has a valid session, it will directly respond to the request
if location is None:
return soupify(await response.read())
as_yarl = yarl.URL(response.url)
# Probably not needed anymore, but might catch a few weird situations with a nicer message
if not location or not as_yarl.host:
raise CrawlWarning(f"Login failed, no location header present at {correct_url}")
raise CrawlWarning(f"Login failed (2), no location header present at {correct_url}")
correct_url = yarl.URL.build(
scheme=as_yarl.scheme,

View File

@ -68,7 +68,7 @@ class Log:
if self._download_progress.task_ids:
elements.append(self._download_progress)
group = Group(*elements) # type: ignore
group = Group(*elements)
self._live.update(group)
@contextmanager

View File

@ -503,7 +503,7 @@ class OutputDirectory:
try:
self._prev_report = Report.load(self._report_path)
log.explain("Loaded report successfully")
except (OSError, json.JSONDecodeError, ReportLoadError) as e:
except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e:
log.explain("Failed to load report")
log.explain(str(e))

View File

@ -100,10 +100,10 @@ class Report:
@classmethod
def load(cls, path: Path) -> "Report":
"""
May raise OSError, JsonDecodeError, ReportLoadError.
May raise OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError.
"""
with open(path) as f:
with open(path, encoding="utf-8") as f:
data = json.load(f)
if not isinstance(data, dict):
@ -148,7 +148,7 @@ class Report:
"encountered_errors": self.encountered_errors,
}
with open(path, "w") as f:
with open(path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, sort_keys=True)
f.write("\n") # json.dump doesn't do this

View File

@ -1,2 +1,2 @@
NAME = "PFERD"
VERSION = "3.3.1"
VERSION = "3.4.0"

View File

@ -17,7 +17,7 @@ Binaries for Linux, Windows and Mac can be downloaded directly from the
### With pip
Ensure you have at least Python 3.8 installed. Run the following command to
Ensure you have at least Python 3.9 installed. Run the following command to
install PFERD or upgrade it to the latest version:
```
@ -26,6 +26,14 @@ $ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest
The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
### With package managers
Unofficial packages are available for:
- [AUR](https://aur.archlinux.org/packages/pferd)
- [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
See also PFERD's [repology page](https://repology.org/project/pferd/versions).
## Basic usage
PFERD can be run directly from the command line with no config file. Run `pferd

View File

@ -4,7 +4,7 @@ version = attr: PFERD.version.VERSION
[options]
packages = find:
python_requires = >=3.8
python_requires = >=3.9
install_requires =
aiohttp>=3.8.1
beautifulsoup4>=4.10.0