Mirror of https://github.com/Garmelon/PFERD.git, synced 2023-12-21 10:23:01 +01:00

# Compare commits (17 commits)
SHA1s: a241672726, a8f76e9be7, b56475450d, aa74604d29, d2e6d91880, 602044ff1b,
31631fb409, 00db348218, a709280cbf, a99ddaa0cc, ba3d299c05, 07a21f80a6,
f17b9b68f4, a2831fbea2, da72863b47, 86e2e226dc, 7872fe5221
## .github/workflows/build-and-release.yml (2 changes, vendored)

```diff
@@ -14,7 +14,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python: ["3.8"]
+        python: ["3.9"]
     steps:
 
       - uses: actions/checkout@v2
```
## CHANGELOG.md (14 changes)

```diff
@@ -22,6 +22,20 @@ ambiguous situations.
 
 ## Unreleased
 
+## 3.4.0 - 2022-05-01
+
+### Added
+- Message when Shibboleth entitlements need to be manually reviewed
+- Links to unofficial packages and repology in the readme
+
+### Changed
+- Increase minimum supported Python version to 3.9
+- Support video listings with more columns
+- Use UTF-8 when reading/writing the config file
+
+### Fixed
+- Crash during authentication when the Shibboleth session is still valid
+
 ## 3.3.1 - 2022-01-15
 
 ### Fixed
```
## PFERD/__main__.py

```diff
@@ -159,3 +159,7 @@ def main() -> None:
         sys.exit(1)
     else:
         pferd.print_report()
+
+
+if __name__ == "__main__":
+    main()
```
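The added lines are the standard Python entry-point guard. As a minimal illustration (not PFERD's actual `main`), the guard makes the call run when the module is executed directly, for example via `python -m PFERD`, but not when it is imported:

```python
def main() -> None:
    print("running")


# Runs only when executed as a script or via "python -m <package>",
# not when the module is imported by other code.
if __name__ == "__main__":
    main()
```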
## PFERD/auth/credential_file.py

```diff
@@ -20,8 +20,10 @@ class CredentialFileAuthenticator(Authenticator):
 
         path = config.default_section.working_dir() / section.path()
         try:
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 lines = list(f)
+        except UnicodeDecodeError:
+            raise AuthLoadError(f"Credential file at {fmt_real_path(path)} is not encoded using UTF-8")
         except OSError as e:
             raise AuthLoadError(f"No credential file at {fmt_real_path(path)}") from e
 
```
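A note on why the explicit `encoding="utf-8"` matters here and in the changes below: without it, `open()` falls back to the platform's preferred locale encoding (see PEP 597), so the same credential file can read differently across machines. A small self-contained sketch of the failure mode, with an invented password:

```python
import locale

# What open() uses when no encoding= is given
print(locale.getpreferredencoding(False))

# A UTF-8 file read under a cp1252 locale (common on Windows) silently
# turns non-ASCII characters into mojibake instead of failing loudly
data = "pässword".encode("utf-8")
print(data.decode("cp1252"))  # 'pÃ¤ssword'
```

The new `except UnicodeDecodeError` branch turns the cases that do fail into a clear `AuthLoadError` instead of a crash.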
## PFERD/config.py

```diff
@@ -120,7 +120,7 @@ class Config:
         # Using config.read_file instead of config.read because config.read
         # would just ignore a missing file and carry on.
         try:
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 parser.read_file(f, source=str(path))
         except FileNotFoundError:
             raise ConfigLoadError(path, "File does not exist")
@@ -128,6 +128,8 @@ class Config:
             raise ConfigLoadError(path, "That's a directory, not a file")
         except PermissionError:
             raise ConfigLoadError(path, "Insufficient permissions")
+        except UnicodeDecodeError:
+            raise ConfigLoadError(path, "File is not encoded using UTF-8")
 
     def dump(self, path: Optional[Path] = None) -> None:
         """
@@ -154,12 +156,12 @@ class Config:
         try:
             # x = open for exclusive creation, failing if the file already
             # exists
-            with open(path, "x") as f:
+            with open(path, "x", encoding="utf-8") as f:
                 self._parser.write(f)
         except FileExistsError:
             print("That file already exists.")
             if asyncio.run(prompt_yes_no("Overwrite it?", default=False)):
-                with open(path, "w") as f:
+                with open(path, "w", encoding="utf-8") as f:
                     self._parser.write(f)
             else:
                 raise ConfigDumpError(path, "File already exists")
```
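The `"x"` mode in `dump` requests exclusive creation, which is what feeds the overwrite prompt above. A short sketch of the mechanics, using a throwaway temp directory:

```python
import os
import tempfile

path = os.path.join(tempfile.mkdtemp(), "pferd.cfg")

with open(path, "x", encoding="utf-8") as f:  # succeeds, file did not exist
    f.write("[DEFAULT]\n")

try:
    with open(path, "x", encoding="utf-8") as f:  # fails, file now exists
        f.write("again")
except FileExistsError:
    print("That file already exists.")  # the point where PFERD prompts
```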
## PFERD/crawl/crawler.py

```diff
@@ -1,9 +1,10 @@
 import asyncio
 import os
 from abc import ABC, abstractmethod
+from collections.abc import Awaitable, Coroutine
 from datetime import datetime
 from pathlib import Path, PurePath
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar
+from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar
 
 from ..auth import Authenticator
 from ..config import Config, Section
@@ -58,7 +59,7 @@ def noncritical(f: Wrapped) -> Wrapped:
     return wrapper  # type: ignore
 
 
-AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]])
+AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]])
 
 
 def anoncritical(f: AWrapped) -> AWrapped:
```
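The tightened `AWrapped` bound reflects that calling an `async def` function produces a `Coroutine`, a subtype of `Awaitable` that some asyncio APIs require. A stdlib-only sketch of the distinction (none of this is PFERD code):

```python
import asyncio
from collections.abc import Awaitable, Coroutine
from typing import Any


async def fetch() -> int:
    return 42


async def main() -> None:
    coro: Coroutine[Any, Any, int] = fetch()  # precise type of the call
    wide: Awaitable[int] = coro               # widening is always fine
    task = asyncio.create_task(fetch())       # create_task wants a coroutine
    print(await wide, await task)


asyncio.run(main())
```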
## PFERD/crawl/http_crawler.py

```diff
@@ -108,7 +108,7 @@ class HttpCrawler(Crawler):
 
     def _load_cookies_from_file(self, path: Path) -> None:
         jar: Any = http.cookies.SimpleCookie()
-        with open(path) as f:
+        with open(path, encoding="utf-8") as f:
             for i, line in enumerate(f):
                 # Names of headers are case insensitive
                 if line[:11].lower() == "set-cookie:":
@@ -121,7 +121,7 @@ class HttpCrawler(Crawler):
         jar: Any = http.cookies.SimpleCookie()
         for morsel in self._cookie_jar:
             jar[morsel.key] = morsel
-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
             f.write(jar.output(sep="\n"))
             f.write("\n")  # A trailing newline is just common courtesy
 
```
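For context, a standalone sketch of the `SimpleCookie` round-trip those two methods rely on (cookie name and value invented): `output()` emits `Set-Cookie:` header lines, which is exactly the prefix the loader scans for.

```python
import http.cookies

jar = http.cookies.SimpleCookie()
jar["session"] = "abc123"
text = jar.output(sep="\n")  # "Set-Cookie: session=abc123"

restored = http.cookies.SimpleCookie()
for line in text.splitlines():
    # Names of headers are case insensitive
    if line[:11].lower() == "set-cookie:":
        restored.load(line[11:])

print(restored["session"].value)  # abc123
```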
## PFERD/crawl/ilias/kit_ilias_html.py

```diff
@@ -280,11 +280,22 @@ class IliasPage:
 
     def _listed_video_to_element(self, link: Tag) -> IliasPageElement:
         # The link is part of a table with multiple columns, describing metadata.
-        # 6th child (1 indexed) is the modification time string
-        modification_string = link.parent.parent.parent.select_one(
-            "td.std:nth-child(6)"
-        ).getText().strip()
-        modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
+        # 6th or 7th child (1 indexed) is the modification time string. Try to find it
+        # by parsing backwards from the end and finding something that looks like a date
+        modification_time = None
+        row: Tag = link.parent.parent.parent
+        column_count = len(row.select("td.std"))
+        for index in range(column_count, 0, -1):
+            modification_string = link.parent.parent.parent.select_one(
+                f"td.std:nth-child({index})"
+            ).getText().strip()
+            if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
+                modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
+                break
+
+        if modification_time is None:
+            log.warn(f"Could not determine upload time for {link}")
+            modification_time = datetime.now()
 
         title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip()
         title += ".mp4"
```
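A self-contained sketch of the new backwards scan with invented table HTML (requires `beautifulsoup4`): the date-shaped cell is found whether the listing has six or seven `td.std` columns.

```python
import re

from bs4 import BeautifulSoup

html = """<table><tr>
<td class="std"></td><td class="std"></td><td class="std">Lecture 01</td>
<td class="std">00:42:00</td><td class="std">someone</td><td class="std">extra</td>
<td class="std">01.05.2022 - 10:15</td>
</tr></table>"""

row = BeautifulSoup(html, "html.parser").select_one("tr")
column_count = len(row.select("td.std"))  # 7 here, 6 in older listings

for index in range(column_count, 0, -1):  # walk the columns right to left
    text = row.select_one(f"td.std:nth-child({index})").getText().strip()
    if re.search(r"\d+\.\d+\.\d+ - \d+:\d+", text):
        print(index, text)  # 7 01.05.2022 - 10:15
        break
```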
## PFERD/crawl/ilias/kit_ilias_web_crawler.py

```diff
@@ -1,7 +1,8 @@
 import asyncio
 import re
+from collections.abc import Awaitable, Coroutine
 from pathlib import PurePath
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Set, Union, cast
 
 import aiohttp
 import yarl
@@ -13,7 +14,7 @@ from ...config import Config
 from ...logging import ProgressBar, log
 from ...output_dir import FileSink, Redownload
 from ...utils import fmt_path, soupify, url_set_query_param
-from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
+from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
@@ -82,8 +83,6 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([
     IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
 ])
 
-AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]])
-
 
 def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]:
     def decorator(f: AWrapped) -> AWrapped:
@@ -252,7 +251,7 @@ instance's greatest bottleneck.
         url: str,
         parent: IliasPageElement,
         path: PurePath,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         maybe_cl = await self.crawl(path)
         if not maybe_cl:
             return None
@@ -310,7 +309,7 @@ instance's greatest bottleneck.
         self,
         parent_path: PurePath,
         element: IliasPageElement,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         if element.url in self._visited_urls:
             raise CrawlWarning(
                 f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
@@ -360,7 +359,7 @@ instance's greatest bottleneck.
         self,
         element: IliasPageElement,
         element_path: PurePath,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
         log.explain(f"Links type is {self._links}")
 
@@ -407,7 +406,7 @@ instance's greatest bottleneck.
         self,
         element: IliasPageElement,
         element_path: PurePath,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         log.explain_topic(f"Decision: Crawl Booking Link {fmt_path(element_path)}")
         log.explain(f"Links type is {self._links}")
 
@@ -443,7 +442,7 @@ instance's greatest bottleneck.
         if hdrs.LOCATION not in resp.headers:
             return soupify(await resp.read()).select_one("a").get("href").strip()
 
-        self._authenticate()
+        await self._authenticate()
 
         async with self.session.get(export_url, allow_redirects=False) as resp:
             # No redirect means we were authenticated
@@ -456,7 +455,7 @@ instance's greatest bottleneck.
         self,
         element: IliasPageElement,
         element_path: PurePath,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         # Copy old mapping as it is likely still relevant
         if self.prev_report:
             self.report.add_custom_value(
@@ -564,7 +563,7 @@ instance's greatest bottleneck.
         self,
         element: IliasPageElement,
         element_path: PurePath,
-    ) -> Optional[Awaitable[None]]:
+    ) -> Optional[Coroutine[Any, Any, None]]:
         maybe_dl = await self.download(element_path, mtime=element.mtime)
         if not maybe_dl:
             return None
@@ -710,6 +709,12 @@ class KitShibbolethLogin:
         }
         soup = await _post(sess, url, data)
 
+        if soup.find(id="attributeRelease"):
+            raise CrawlError(
+                "ILIAS Shibboleth entitlements changed! "
+                "Please log in once in your browser and review them"
+            )
+
         if self._tfa_required(soup):
             soup = await self._authenticate_tfa(sess, soup)
 
@@ -778,15 +783,19 @@ async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup:
     async with session.post(url, data=data, allow_redirects=False) as response:
         location = response.headers.get("location")
         if not location:
-            raise CrawlWarning(f"Login failed, no location header present at {url}")
+            raise CrawlWarning(f"Login failed (1), no location header present at {url}")
         correct_url = yarl.URL(location, encoded=True)
 
         async with session.get(correct_url, allow_redirects=False) as response:
-            as_yarl = yarl.URL(response.url)
             location = response.headers.get("location")
+
+            # If shib still has a valid session, it will directly respond to the request
+            if location is None:
+                return soupify(await response.read())
+
+            as_yarl = yarl.URL(response.url)
+            # Probably not needed anymore, but might catch a few weird situations with a nicer message
             if not location or not as_yarl.host:
-                raise CrawlWarning(f"Login failed, no location header present at {correct_url}")
+                raise CrawlWarning(f"Login failed (2), no location header present at {correct_url}")
 
             correct_url = yarl.URL.build(
                 scheme=as_yarl.scheme,
```
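The one-word `await self._authenticate()` change above fixes a classic asyncio mistake: calling a coroutine function without `await` merely creates a coroutine object and discards it, so the login never actually ran. A minimal sketch, independent of PFERD:

```python
import asyncio


async def _authenticate() -> None:
    print("logging in")


async def broken() -> None:
    _authenticate()  # coroutine created and dropped; never runs, and Python
                     # warns "RuntimeWarning: coroutine ... was never awaited"
    print("continued without logging in")


async def fixed() -> None:
    await _authenticate()  # actually performs the login
    print("continued after logging in")


asyncio.run(broken())
asyncio.run(fixed())
```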
## PFERD/logging.py

```diff
@@ -68,7 +68,7 @@ class Log:
         if self._download_progress.task_ids:
             elements.append(self._download_progress)
 
-        group = Group(*elements)  # type: ignore
+        group = Group(*elements)
         self._live.update(group)
 
     @contextmanager
```
## PFERD/output_dir.py

```diff
@@ -503,7 +503,7 @@ class OutputDirectory:
         try:
             self._prev_report = Report.load(self._report_path)
             log.explain("Loaded report successfully")
-        except (OSError, json.JSONDecodeError, ReportLoadError) as e:
+        except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e:
             log.explain("Failed to load report")
             log.explain(str(e))
 
```
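Adding `UnicodeDecodeError` to the tuple is necessary rather than cosmetic, because it derives from `ValueError`, not `OSError`, so the old clause never caught it:

```python
print(issubclass(UnicodeDecodeError, OSError))     # False: old clause missed it
print(issubclass(UnicodeDecodeError, ValueError))  # True
```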
## PFERD/report.py

```diff
@@ -100,10 +100,10 @@ class Report:
     @classmethod
     def load(cls, path: Path) -> "Report":
         """
-        May raise OSError, JsonDecodeError, ReportLoadError.
+        May raise OSError, UnicodeDecodeError, JsonDecodeError, ReportLoadError.
         """
 
-        with open(path) as f:
+        with open(path, encoding="utf-8") as f:
             data = json.load(f)
 
         if not isinstance(data, dict):
@@ -148,7 +148,7 @@ class Report:
             "encountered_errors": self.encountered_errors,
         }
 
-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
             json.dump(data, f, indent=2, sort_keys=True)
             f.write("\n")  # json.dump doesn't do this
 
```
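As the comment in the diff notes, `json.dump` does not emit a trailing newline, hence the explicit `f.write("\n")`. A two-line check:

```python
import io
import json

buf = io.StringIO()
json.dump({"a": 1}, buf, indent=2, sort_keys=True)
print(repr(buf.getvalue()))  # '{\n  "a": 1\n}', no trailing newline
```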
## PFERD/version.py

```diff
@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.3.1"
+VERSION = "3.4.0"
```
## README.md (10 changes)

````diff
@@ -17,7 +17,7 @@ Binaries for Linux, Windows and Mac can be downloaded directly from the
 
 ### With pip
 
-Ensure you have at least Python 3.8 installed. Run the following command to
+Ensure you have at least Python 3.9 installed. Run the following command to
 install PFERD or upgrade it to the latest version:
 
 ```
@@ -26,6 +26,14 @@ $ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest
 
 The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
 
+### With package managers
+
+Unofficial packages are available for:
+- [AUR](https://aur.archlinux.org/packages/pferd)
+- [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
+
+See also PFERD's [repology page](https://repology.org/project/pferd/versions).
+
 ## Basic usage
 
 PFERD can be run directly from the command line with no config file. Run `pferd
````