Compare commits

...

36 Commits

Author SHA1 Message Date
d7f2229978 Bump version to 3.6.0 2024-10-23 20:17:47 +02:00
52fdeae752 Crawl custom item groups as folders 2024-10-21 23:43:48 +02:00
f9bb2e41cf Sanitize slashes in exercise container names 2024-10-21 22:30:16 +02:00
4f9e2ab48d Support named capture groups in regex transformers (#94) 2024-10-21 15:21:33 +02:00
19beb8f07b Document course overview downloading in config.md 2024-07-31 22:02:43 +02:00
c897d9e2f5 Support finding entries for course overview page
Related to issue #93
2024-06-26 16:54:07 +02:00
21a266e302 Update upload-artifact action to v4
https://github.com/actions/upload-artifact/blob/main/docs/MIGRATION.md#multiple-uploads-to-the-same-named-artifact
2024-05-11 16:33:14 +02:00
b29b6f93f8 run ci twice
Co-authored-by: Garmelon <joscha@plugh.de>
2024-05-11 16:09:46 +02:00
318226d7cb fix bump-version script 2024-05-11 10:27:54 +02:00
422cf05f15 Move all configuration into pyproject.toml, add x86 mac to CI 2024-05-11 10:26:19 +02:00
819c6673c7 Update changelog 2024-05-10 14:40:25 +02:00
89b44c69a7 Update docs
All config file options must be documented in CONFIG.md. The README.md
is just a starting point. To avoid duplicated info, I've moved most of
the docs to CONFIG.md.
2024-05-10 14:36:01 +02:00
4b4f72b2ca Fix command name 2024-05-10 14:34:20 +02:00
778517d8c6 Fix KIT crawler requiring base_url and client_id options 2024-05-10 14:12:45 +02:00
428b0179fc Remove IliasConfig
Also uses urljoin() in a few places that previously used string
concatenation or fstrings.

At this point, there isn't yet a need for IliasConfig, so I'd rather
keep the code base simpler and more consistent. Should we need a
structure like IliasConfig in the future (maybe because we have a few
more ilias parsers), it's easy to add back.
2024-05-10 14:09:14 +02:00
ade6309dd9 Update copyright information 2024-05-05 02:34:26 +02:00
fd6cb7b966 docs: Remove some filler words 2024-05-05 02:34:00 +02:00
5c87517ceb docs: Explain usage with generic ilias 2024-05-04 17:52:12 +02:00
b01f093474 fix: Element detection for other universities
Other universities might use other URL schemes
for different element types
2024-05-04 17:52:06 +02:00
3a05b90525 fix circular import for _io_repeat 2024-05-04 17:51:59 +02:00
7a00f73e0e feat: Add authentication to generic ilias dl 2024-05-04 17:51:38 +02:00
5d0621420e feat: Generic ilias_web command 2024-05-04 17:44:37 +02:00
df98153169 refactor: Extract generic settings from ilias command
Preparation for generic ilias_web command
2024-05-04 17:44:30 +02:00
fc1f68ccd9 refactor: Separate generic and KIT ilias functions 2024-05-04 17:44:18 +02:00
3e831c7e23 Fix normalization of meeting names in cards 2024-04-24 22:32:26 +02:00
bbcfe9c8dd Fix typo in CONFIG.md (#89) 2024-04-19 16:52:18 +02:00
eb01aa86cb Bump version to 3.5.2 2024-04-14 12:10:17 +02:00
3db186a978 Fix personal desktop crawling HTML warnings 2024-04-10 11:15:25 +02:00
4a5959fd58 Fix personal desktop crawling without favorites 2024-04-10 11:15:25 +02:00
1cbc2b717a Fix personal desktop crawling with ILIAS 8 2024-04-10 01:20:37 +02:00
da627ff929 Bump version to 3.5.1 2024-04-09 14:28:56 +02:00
c1b592ac29 Fix ILIAS 8 file downloads truncating to zero bytes 2024-04-08 17:59:41 +02:00
eb0c956d32 Add compatibility with ILIAS 8 2024-04-05 19:08:05 +02:00
ab0cb2d956 nix: bump nixpgs dependency 2024-02-27 23:39:53 +01:00
a117126389 Fix video name deduplication 2023-12-09 23:08:42 +01:00
e9f8901520 Fix typos in ilias crawler and use set literals 2023-11-30 20:57:57 +01:00
27 changed files with 1603 additions and 1186 deletions

10
.github/dependabot.yml vendored Normal file
View File

@ -0,0 +1,10 @@
version: 2
updates:
- package-ecosystem: github-actions
directory: /
schedule:
interval: monthly
groups:
gh-actions:
patterns:
- "*"

View File

@ -1,6 +1,6 @@
name: build-and-release
on: push
on: [push, pull_request]
defaults:
run:
@ -13,13 +13,12 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
os: [ubuntu-latest, windows-latest, macos-13, macos-latest]
python: ["3.9"]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
@ -34,7 +33,12 @@ jobs:
run: ./scripts/setup --no-pip
- name: Run checks
run: ./scripts/check
run: |
./scripts/check
./scripts/format
- name: Assert no changes
run: git diff --exit-code
- name: Build
run: ./scripts/build
@ -45,9 +49,9 @@ jobs:
run: mv dist/pferd* dist/pferd-${{ matrix.os }}
- name: Upload binary
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: Binaries
name: pferd-${{ matrix.os }}
path: dist/pferd-${{ matrix.os }}
release:
@ -57,18 +61,20 @@ jobs:
steps:
- name: Download binaries
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: Binaries
pattern: pferd-*
merge-multiple: true
- name: Rename binaries
run: |
mv pferd-ubuntu-latest pferd-linux
mv pferd-windows-latest pferd-windows.exe
mv pferd-macos-13 pferd-mac-x86_64
mv pferd-macos-latest pferd-mac
- name: Create release
uses: softprops/action-gh-release@v1
uses: softprops/action-gh-release@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
@ -76,3 +82,4 @@ jobs:
pferd-linux
pferd-windows.exe
pferd-mac
pferd-mac-x86_64

View File

@ -22,6 +22,33 @@ ambiguous situations.
## Unreleased
## 3.6.0 - 2024-10-23
### Added
- Generic `ilias-web` crawler and `ilias-web` CLI command
- Support for the course overview page. Using this URL as a target might cause
duplication warnings, as subgroups are listed separately.
- Support for named capture groups in regex transforms
- Crawl custom item groups as folders
### Fixed
- Normalization of meeting names in cards
- Sanitization of slashes in exercise container names
## 3.5.2 - 2024-04-14
### Fixed
- Crawling of personal desktop with ILIAS 8
- Crawling of empty personal desktops
## 3.5.1 - 2024-04-09
### Added
- Support for ILIAS 8
### Fixed
- Video name deduplication
## 3.5.0 - 2023-09-13
### Added

View File

@ -4,11 +4,11 @@ A config file consists of sections. A section begins with a `[section]` header,
which is followed by a list of `key = value` pairs. Comments must be on their
own line and start with `#`. Multiline values must be indented beyond their key.
Boolean values can be `yes` or `no`. For more details and some examples on the
format, see the [configparser documentation][1] ([interpolation][2] is
disabled).
format, see the [configparser documentation][cp-file]
([interpolation][cp-interp] is disabled).
[1]: <https://docs.python.org/3/library/configparser.html#supported-ini-file-structure> "Supported INI File Structure"
[2]: <https://docs.python.org/3/library/configparser.html#interpolation-of-values> "Interpolation of values"
[cp-file]: <https://docs.python.org/3/library/configparser.html#supported-ini-file-structure> "Supported INI File Structure"
[cp-interp]: <https://docs.python.org/3/library/configparser.html#interpolation-of-values> "Interpolation of values"
## The `DEFAULT` section
@ -146,7 +146,7 @@ crawler simulate a slower, network-based crawler.
This crawler crawls a KIT-IPD page by url. The root page can be crawled from
outside the KIT network so you will be informed about any new/deleted files,
but downloading files requires you to be within. Adding a show delay between
but downloading files requires you to be within. Adding a short delay between
requests is likely a good idea.
- `target`: URL to a KIT-IPD page
@ -154,6 +154,56 @@ requests is likely a good idea.
matches, the given link is downloaded as a file. This is used to extract
files from KIT-IPD pages. (Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`)
### The `ilias-web` crawler
This crawler crawls a generic ILIAS instance.
Inspired by [this ILIAS downloader][ilias-dl], the following configurations should work
out of the box for the corresponding universities:
[ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs"
| University | `base_url` | `client_id` |
|---------------|--------------------------------------|---------------|
| FH Aachen | https://www.ili.fh-aachen.de | elearning |
| Uni Köln | https://www.ilias.uni-koeln.de/ilias | uk |
| Uni Konstanz | https://ilias.uni-konstanz.de | ILIASKONSTANZ |
| Uni Stuttgart | https://ilias3.uni-stuttgart.de | Uni_Stuttgart |
If your university isn't listed, try navigating to your instance's login page.
Assuming no custom login service is used, the URL will look something like this:
```jinja
{{ base_url }}/login.php?client_id={{ client_id }}&cmd=force_login&lang=
```
If the values work, feel free to submit a PR and add them to the table above.
- `base_url`: The URL where the ILIAS instance is located. (Required)
- `client_id`: An ID used for authentication. (Required)
- `target`: The ILIAS element to crawl. (Required)
- `desktop`: Crawl your personal desktop / dashboard
- `<course id>`: Crawl the course with the given id
- `<url>`: Crawl a given element by URL (preferably the permanent URL linked
at the bottom of its ILIAS page).
This also supports the "My Courses" overview page to download *all*
courses. Note that this might produce confusing local directory layouts
and duplication warnings if you are a member of an ILIAS group. The
`desktop` target is generally preferable.
- `auth`: Name of auth section to use for login. (Required)
- `links`: How to represent external links. (Default: `fancy`)
- `ignore`: Don't download links.
- `plaintext`: A text file containing only the URL.
- `fancy`: A HTML file looking like the ILIAS link element.
- `internet-shortcut`: An internet shortcut file (`.url` file).
- `link_redirect_delay`: Time (in seconds) until `fancy` link files will
redirect to the actual URL. Set to a negative value to disable the automatic
redirect. (Default: `-1`)
- `videos`: Whether to download videos. (Default: `no`)
- `forums`: Whether to download forum threads. (Default: `no`)
- `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
`20.0`)
### The `kit-ilias-web` crawler
This crawler crawls the KIT ILIAS instance.
@ -232,10 +282,10 @@ is stored in the keyring.
### The `pass` authenticator
This authenticator queries the [`pass` password manager][3] for a username and
password. It tries to be mostly compatible with [browserpass][4] and
[passff][5], so see those links for an overview of the format. If PFERD fails
to load your password, you can use the `--explain` flag to see why.
This authenticator queries the [`pass` password manager][pass] for a username
and password. It tries to be mostly compatible with [browserpass][browserpass]
and [passff][passff], so see those links for an overview of the format. If PFERD
fails to load your password, you can use the `--explain` flag to see why.
- `passname`: The name of the password to use (Required)
- `username_prefixes`: A comma-separated list of username line prefixes
@ -243,9 +293,9 @@ to load your password, you can use the `--explain` flag to see why.
- `password_prefixes`: A comma-separated list of password line prefixes
(Default: `password,pass,secret`)
[3]: <https://www.passwordstore.org/> "Pass: The Standard Unix Password Manager"
[4]: <https://github.com/browserpass/browserpass-extension#organizing-password-store> "Organizing password store"
[5]: <https://github.com/passff/passff#multi-line-format> "Multi-line format"
[pass]: <https://www.passwordstore.org/> "Pass: The Standard Unix Password Manager"
[browserpass]: <https://github.com/browserpass/browserpass-extension#organizing-password-store> "Organizing password store"
[passff]: <https://github.com/passff/passff#multi-line-format> "Multi-line format"
### The `tfa` authenticator
@ -344,7 +394,8 @@ matches `SOURCE`, the output path is created using `TARGET` as template.
be referred to as `{g<n>}` (e.g. `{g3}`). `{g0}` refers to the original path.
If capturing group *n*'s contents are a valid integer, the integer value is
available as `{i<n>}` (e.g. `{i3}`). If capturing group *n*'s contents are a
valid float, the float value is available as `{f<n>}` (e.g. `{f3}`). If a
valid float, the float value is available as `{f<n>}` (e.g. `{f3}`). Named capture
groups (e.g. `(?P<name>)`) are available by their name (e.g. `{name}`). If a
capturing group is not present (e.g. when matching the string `cd` with the
regex `(ab)?cd`), the corresponding variables are not defined.

View File

@ -1,6 +1,6 @@
Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw,
Copyright 2019-2024 Garmelon, I-Al-Istannen, danstooamerican, pavelzw,
TheChristophe, Scriptim, thelukasprobst, Toorero,
Mr-Pine
Mr-Pine, p-fruck
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in

View File

@ -8,6 +8,7 @@
# well.
from . import command_local # noqa: F401 imported but unused
from . import command_ilias_web # noqa: F401 imported but unused
from . import command_kit_ilias_web # noqa: F401 imported but unused
from . import command_kit_ipd # noqa: F401 imported but unused
from .parser import PARSER, ParserLoadError, load_default_section # noqa: F401 imported but unused

View File

@ -0,0 +1,56 @@
import argparse
import configparser
from ..logging import log
from .common_ilias_args import configure_common_group_args, load_common
from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler
COMMAND_NAME = "ilias-web"
SUBPARSER = SUBPARSERS.add_parser(
COMMAND_NAME,
parents=[CRAWLER_PARSER],
)
GROUP = SUBPARSER.add_argument_group(
title=f"{COMMAND_NAME} crawler arguments",
description=f"arguments for the '{COMMAND_NAME}' crawler",
)
GROUP.add_argument(
"--base-url",
type=str,
metavar="BASE_URL",
help="The base url of the ilias instance"
)
GROUP.add_argument(
"--client-id",
type=str,
metavar="CLIENT_ID",
help="The client id of the ilias instance"
)
configure_common_group_args(GROUP)
def load(
args: argparse.Namespace,
parser: configparser.ConfigParser,
) -> None:
log.explain(f"Creating config for command '{COMMAND_NAME}'")
parser["crawl:ilias"] = {}
section = parser["crawl:ilias"]
load_crawler(args, section)
section["type"] = COMMAND_NAME
if args.ilias_url is not None:
section["base_url"] = args.ilias_url
if args.client_id is not None:
section["client_id"] = args.client_id
load_common(section, args, parser)
SUBPARSER.set_defaults(command=load)

View File

@ -1,120 +1,37 @@
import argparse
import configparser
from pathlib import Path
from ..crawl.ilias.file_templates import Links
from ..logging import log
from .parser import (CRAWLER_PARSER, SUBPARSERS, BooleanOptionalAction, ParserLoadError, load_crawler,
show_value_error)
from .common_ilias_args import configure_common_group_args, load_common
from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler
COMMAND_NAME = "kit-ilias-web"
SUBPARSER = SUBPARSERS.add_parser(
"kit-ilias-web",
COMMAND_NAME,
parents=[CRAWLER_PARSER],
)
GROUP = SUBPARSER.add_argument_group(
title="kit-ilias-web crawler arguments",
description="arguments for the 'kit-ilias-web' crawler",
)
GROUP.add_argument(
"target",
type=str,
metavar="TARGET",
help="course id, 'desktop', or ILIAS URL to crawl"
)
GROUP.add_argument(
"output",
type=Path,
metavar="OUTPUT",
help="output directory"
)
GROUP.add_argument(
"--username", "-u",
type=str,
metavar="USERNAME",
help="user name for authentication"
)
GROUP.add_argument(
"--keyring",
action=BooleanOptionalAction,
help="use the system keyring to store and retrieve passwords"
)
GROUP.add_argument(
"--credential-file",
type=Path,
metavar="PATH",
help="read username and password from a credential file"
)
GROUP.add_argument(
"--links",
type=show_value_error(Links.from_string),
metavar="OPTION",
help="how to represent external links"
)
GROUP.add_argument(
"--link-redirect-delay",
type=int,
metavar="SECONDS",
help="time before 'fancy' links redirect to to their target (-1 to disable)"
)
GROUP.add_argument(
"--videos",
action=BooleanOptionalAction,
help="crawl and download videos"
)
GROUP.add_argument(
"--forums",
action=BooleanOptionalAction,
help="crawl and download forum posts"
)
GROUP.add_argument(
"--http-timeout", "-t",
type=float,
metavar="SECONDS",
help="timeout for all HTTP requests"
title=f"{COMMAND_NAME} crawler arguments",
description=f"arguments for the '{COMMAND_NAME}' crawler",
)
configure_common_group_args(GROUP)
def load(
args: argparse.Namespace,
parser: configparser.ConfigParser,
) -> None:
log.explain("Creating config for command 'kit-ilias-web'")
log.explain(f"Creating config for command '{COMMAND_NAME}'")
parser["crawl:ilias"] = {}
section = parser["crawl:ilias"]
load_crawler(args, section)
section["type"] = "kit-ilias-web"
section["target"] = str(args.target)
section["output_dir"] = str(args.output)
section["auth"] = "auth:ilias"
if args.links is not None:
section["links"] = str(args.links.value)
if args.link_redirect_delay is not None:
section["link_redirect_delay"] = str(args.link_redirect_delay)
if args.videos is not None:
section["videos"] = "yes" if args.videos else "no"
if args.forums is not None:
section["forums"] = "yes" if args.forums else "no"
if args.http_timeout is not None:
section["http_timeout"] = str(args.http_timeout)
parser["auth:ilias"] = {}
auth_section = parser["auth:ilias"]
if args.credential_file is not None:
if args.username is not None:
raise ParserLoadError("--credential-file and --username can't be used together")
if args.keyring:
raise ParserLoadError("--credential-file and --keyring can't be used together")
auth_section["type"] = "credential-file"
auth_section["path"] = str(args.credential_file)
elif args.keyring:
auth_section["type"] = "keyring"
else:
auth_section["type"] = "simple"
if args.username is not None:
auth_section["username"] = args.username
section["type"] = COMMAND_NAME
load_common(section, args, parser)
SUBPARSER.set_defaults(command=load)

View File

@ -0,0 +1,104 @@
import argparse
import configparser
from pathlib import Path
from ..crawl.ilias.file_templates import Links
from .parser import BooleanOptionalAction, ParserLoadError, show_value_error
def configure_common_group_args(group: argparse._ArgumentGroup) -> None:
"""These arguments are shared between the KIT and generic Ilias web command."""
group.add_argument(
"target",
type=str,
metavar="TARGET",
help="course id, 'desktop', or ILIAS URL to crawl"
)
group.add_argument(
"output",
type=Path,
metavar="OUTPUT",
help="output directory"
)
group.add_argument(
"--username", "-u",
type=str,
metavar="USERNAME",
help="user name for authentication"
)
group.add_argument(
"--keyring",
action=BooleanOptionalAction,
help="use the system keyring to store and retrieve passwords"
)
group.add_argument(
"--credential-file",
type=Path,
metavar="PATH",
help="read username and password from a credential file"
)
group.add_argument(
"--links",
type=show_value_error(Links.from_string),
metavar="OPTION",
help="how to represent external links"
)
group.add_argument(
"--link-redirect-delay",
type=int,
metavar="SECONDS",
help="time before 'fancy' links redirect to to their target (-1 to disable)"
)
group.add_argument(
"--videos",
action=BooleanOptionalAction,
help="crawl and download videos"
)
group.add_argument(
"--forums",
action=BooleanOptionalAction,
help="crawl and download forum posts"
)
group.add_argument(
"--http-timeout", "-t",
type=float,
metavar="SECONDS",
help="timeout for all HTTP requests"
)
def load_common(
section: configparser.SectionProxy,
args: argparse.Namespace,
parser: configparser.ConfigParser,
) -> None:
"""Load common config between generic and KIT ilias web command"""
section["target"] = str(args.target)
section["output_dir"] = str(args.output)
section["auth"] = "auth:ilias"
if args.links is not None:
section["links"] = str(args.links.value)
if args.link_redirect_delay is not None:
section["link_redirect_delay"] = str(args.link_redirect_delay)
if args.videos is not None:
section["videos"] = "yes" if args.videos else "no"
if args.forums is not None:
section["forums"] = "yes" if args.forums else "no"
if args.http_timeout is not None:
section["http_timeout"] = str(args.http_timeout)
parser["auth:ilias"] = {}
auth_section = parser["auth:ilias"]
if args.credential_file is not None:
if args.username is not None:
raise ParserLoadError("--credential-file and --username can't be used together")
if args.keyring:
raise ParserLoadError("--credential-file and --keyring can't be used together")
auth_section["type"] = "credential-file"
auth_section["path"] = str(args.credential_file)
elif args.keyring:
auth_section["type"] = "keyring"
else:
auth_section["type"] = "simple"
if args.username is not None:
auth_section["username"] = args.username

View File

@ -4,7 +4,7 @@ from typing import Callable, Dict
from ..auth import Authenticator
from ..config import Config
from .crawler import Crawler, CrawlError, CrawlerSection # noqa: F401
from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection
from .ilias import IliasWebCrawler, IliasWebCrawlerSection, KitIliasWebCrawler, KitIliasWebCrawlerSection
from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection
from .local_crawler import LocalCrawler, LocalCrawlerSection
@ -18,6 +18,8 @@ CrawlerConstructor = Callable[[
CRAWLERS: Dict[str, CrawlerConstructor] = {
"local": lambda n, s, c, a:
LocalCrawler(n, LocalCrawlerSection(s), c),
"ilias-web": lambda n, s, c, a:
IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a),
"kit-ilias-web": lambda n, s, c, a:
KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),
"kit-ipd": lambda n, s, c, a:

View File

@ -1,3 +1,9 @@
from .kit_ilias_web_crawler import KitIliasWebCrawler, KitIliasWebCrawlerSection
from .kit_ilias_web_crawler import (IliasWebCrawler, IliasWebCrawlerSection, KitIliasWebCrawler,
KitIliasWebCrawlerSection)
__all__ = ["KitIliasWebCrawler", "KitIliasWebCrawlerSection"]
__all__ = [
"IliasWebCrawler",
"IliasWebCrawlerSection",
"KitIliasWebCrawler",
"KitIliasWebCrawlerSection",
]

View File

@ -0,0 +1,39 @@
import asyncio
from typing import Any, Callable, Optional
import aiohttp
from ...logging import log
from ..crawler import AWrapped, CrawlError, CrawlWarning
def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]:
def decorator(f: AWrapped) -> AWrapped:
async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
last_exception: Optional[BaseException] = None
for round in range(attempts):
try:
return await f(*args, **kwargs)
except aiohttp.ContentTypeError: # invalid content type
raise CrawlWarning("ILIAS returned an invalid content type")
except aiohttp.TooManyRedirects:
raise CrawlWarning("Got stuck in a redirect loop")
except aiohttp.ClientPayloadError as e: # encoding or not enough bytes
last_exception = e
except aiohttp.ClientConnectionError as e: # e.g. timeout, disconnect, resolve failed, etc.
last_exception = e
except asyncio.exceptions.TimeoutError as e: # explicit http timeouts in HttpCrawler
last_exception = e
log.explain_topic(f"Retrying operation {name}. Retries left: {attempts - 1 - round}")
if last_exception:
message = f"Error in I/O Operation: {last_exception}"
if failure_is_error:
raise CrawlError(message) from last_exception
else:
raise CrawlWarning(message) from last_exception
raise CrawlError("Impossible return in ilias _iorepeat")
return wrapper # type: ignore
return decorator

File diff suppressed because it is too large Load Diff

View File

@ -17,7 +17,7 @@ TargetType = Union[str, int]
class IliasElementType(Enum):
EXERCISE = "exercise"
EXERCISE_FILES = "exercise_files" # own submitted files
TEST = "test" # an online test. Will be ignored currently.
TEST = "test" # an online test. Will be ignored currently.
FILE = "file"
FOLDER = "folder"
FORUM = "forum"
@ -48,6 +48,10 @@ class IliasPageElement:
regexes = [
r"eid=(?P<id>[0-9a-z\-]+)",
r"file_(?P<id>\d+)",
r"copa_(?P<id>\d+)",
r"fold_(?P<id>\d+)",
r"frm_(?P<id>\d+)",
r"exc_(?P<id>\d+)",
r"ref_id=(?P<id>\d+)",
r"target=[a-z]+_(?P<id>\d+)",
r"mm_(?P<id>\d+)"
@ -61,6 +65,52 @@ class IliasPageElement:
log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.")
return self.url
@staticmethod
def create_new(
typ: IliasElementType,
url: str,
name: str,
mtime: Optional[datetime] = None,
description: Optional[str] = None,
skip_sanitize: bool = False
) -> 'IliasPageElement':
if typ == IliasElementType.MEETING:
normalized = IliasPageElement._normalize_meeting_name(name)
log.explain(f"Normalized meeting name from {name!r} to {normalized!r}")
name = normalized
if not skip_sanitize:
name = _sanitize_path_name(name)
return IliasPageElement(typ, url, name, mtime, description)
@staticmethod
def _normalize_meeting_name(meeting_name: str) -> str:
"""
Normalizes meeting names, which have a relative time as their first part,
to their date in ISO format.
"""
# This checks whether we can reach a `:` without passing a `-`
if re.search(r"^[^-]+: ", meeting_name):
# Meeting name only contains date: "05. Jan 2000:"
split_delimiter = ":"
else:
# Meeting name contains date and start/end times: "05. Jan 2000, 16:00 - 17:30:"
split_delimiter = ", "
# We have a meeting day without time
date_portion_str = meeting_name.split(split_delimiter)[0]
date_portion = demangle_date(date_portion_str)
# We failed to parse the date, bail out
if not date_portion:
return meeting_name
# Replace the first section with the absolute date
rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:])
return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name
@dataclass
class IliasDownloadForumData:
@ -95,13 +145,9 @@ class IliasPage:
@staticmethod
def is_root_page(soup: BeautifulSoup) -> bool:
permalink = soup.find(id="current_perma_link")
if permalink is None:
return False
value = permalink.attrs.get("value")
if value is None:
return False
return "goto.php?target=root_" in value
if permalink := IliasPage.get_soup_permalink(soup):
return "goto.php?target=root_" in permalink
return False
def get_child_elements(self) -> List[IliasPageElement]:
"""
@ -134,7 +180,7 @@ class IliasPage:
attrs={"href": lambda x: x and "cmdClass=ilinfoscreengui" in x}
)
if tab is not None:
return IliasPageElement(
return IliasPageElement.create_new(
IliasElementType.INFO_TAB,
self._abs_url_from_link(tab),
"infos"
@ -279,16 +325,14 @@ class IliasPage:
return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
def _is_content_page(self) -> bool:
link = self._soup.find(id="current_perma_link")
if not link:
return False
return "target=copa_" in link.get("value")
if link := self.get_permalink():
return "target=copa_" in link
return False
def _is_learning_module_page(self) -> bool:
link = self._soup.find(id="current_perma_link")
if not link:
return False
return "target=pg_" in link.get("value")
if link := self.get_permalink():
return "target=pg_" in link
return False
def _contains_collapsed_future_meetings(self) -> bool:
return self._uncollapse_future_meetings_url() is not None
@ -301,7 +345,7 @@ class IliasPage:
if not element:
return None
link = self._abs_url_from_link(element)
return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings")
return IliasPageElement.create_new(IliasElementType.FOLDER, link, "show all meetings")
def _is_content_tab_selected(self) -> bool:
return self._select_content_page_url() is None
@ -310,6 +354,9 @@ class IliasPage:
might_be_info = self._soup.find("form", attrs={"name": lambda x: x == "formInfoScreen"}) is not None
return self._page_type == IliasElementType.INFO_TAB and might_be_info
def _is_course_overview_page(self) -> bool:
return "baseClass=ilmembershipoverviewgui" in self._page_url
def _select_content_page_url(self) -> Optional[IliasPageElement]:
tab = self._soup.find(
id="tab_view_content",
@ -321,7 +368,7 @@ class IliasPage:
link = tab.find("a")
if link:
link = self._abs_url_from_link(link)
return IliasPageElement(IliasElementType.FOLDER, link, "select content page")
return IliasPageElement.create_new(IliasElementType.FOLDER, link, "select content page")
_unexpected_html_warning()
log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.")
@ -351,14 +398,16 @@ class IliasPage:
# and just fetch the lone video url!
if len(streams) == 1:
video_url = streams[0]["sources"]["mp4"][0]["src"]
return [IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, self._source_name)]
return [
IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO, video_url, self._source_name)
]
log.explain(f"Found multiple videos for stream at {self._source_name}")
items = []
for stream in sorted(streams, key=lambda stream: stream["content"]):
full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4"
video_url = stream["sources"]["mp4"][0]["src"]
items.append(IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, full_name))
items.append(IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO, video_url, full_name))
return items
@ -373,7 +422,7 @@ class IliasPage:
link = self._abs_url_from_link(correct_link)
return IliasPageElement(IliasElementType.FORUM, link, "show all forum threads")
return IliasPageElement.create_new(IliasElementType.FORUM, link, "show all forum threads")
def _find_personal_desktop_entries(self) -> List[IliasPageElement]:
items: List[IliasPageElement] = []
@ -384,6 +433,10 @@ class IliasPage:
name = _sanitize_path_name(link.text.strip())
url = self._abs_url_from_link(link)
if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
# Configure button/link does not have anything interesting
continue
type = self._find_type_from_link(name, link, url)
if not type:
_unexpected_html_warning()
@ -396,7 +449,7 @@ class IliasPage:
url = re.sub(r"(target=file_\d+)", r"\1_download", url)
log.explain("Rewired file URL to include download part")
items.append(IliasPageElement(type, url, name))
items.append(IliasPageElement.create_new(type, url, name))
return items
@ -414,7 +467,7 @@ class IliasPage:
log.warn_contd(f"Found unknown content page item {name!r} with url {url!r}")
continue
items.append(IliasPageElement(IliasElementType.FILE, url, name))
items.append(IliasPageElement.create_new(IliasElementType.FILE, url, name))
return items
@ -427,7 +480,7 @@ class IliasPage:
continue
if "cmd=sendfile" not in link["href"]:
continue
items.append(IliasPageElement(
items.append(IliasPageElement.create_new(
IliasElementType.FILE,
self._abs_url_from_link(link),
_sanitize_path_name(link.getText())
@ -455,7 +508,9 @@ class IliasPage:
query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
url = url_set_query_params(url, query_params)
log.explain("Found ILIAS video frame page, fetching actual content next")
return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, url, "")]
return [
IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, url, "")
]
is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None
@ -484,7 +539,7 @@ class IliasPage:
url = url_set_query_params(self._page_url, query_params)
log.explain("Disabled pagination, retrying folder as a new entry")
return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")]
return [IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")]
def _find_opencast_video_entries_no_paging(self) -> List[IliasPageElement]:
"""
@ -513,8 +568,8 @@ class IliasPage:
modification_string = link.parent.parent.parent.select_one(
f"td.std:nth-child({index})"
).getText().strip()
if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
break
if modification_time is None:
@ -529,7 +584,7 @@ class IliasPage:
video_url = self._abs_url_from_link(link)
log.explain(f"Found video {video_name!r} at {video_url}")
return IliasPageElement(
return IliasPageElement.create_new(
IliasElementType.OPENCAST_VIDEO_PLAYER, video_url, video_name, modification_time
)
@ -565,7 +620,7 @@ class IliasPage:
if date is None:
log.warn(f"Date parsing failed for exercise entry {name!r}")
results.append(IliasPageElement(
results.append(IliasPageElement.create_new(
IliasElementType.FILE,
self._abs_url_from_link(link),
name,
@ -598,22 +653,22 @@ class IliasPage:
# Two divs, side by side. Left is the name, right is the link ==> get left
# sibling
file_name = file_link.parent.findPrevious(name="div").getText().strip()
file_name = _sanitize_path_name(file_name)
url = self._abs_url_from_link(file_link)
log.explain(f"Found exercise entry {file_name!r}")
results.append(IliasPageElement(
results.append(IliasPageElement.create_new(
IliasElementType.FILE,
url,
container_name + "/" + file_name,
None # We do not have any timestamp
_sanitize_path_name(container_name) + "/" + _sanitize_path_name(file_name),
mtime=None, # We do not have any timestamp
skip_sanitize=True
))
# Find all links to file listings (e.g. "Submitted Files" for groups)
file_listings: List[Tag] = container.findAll(
name="a",
# download links contain the given command class
attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
)
# Add each listing as a new
@ -624,14 +679,15 @@ class IliasPage:
label_container: Tag = parent_container.find(
attrs={"class": lambda x: x and "control-label" in x}
)
file_name = _sanitize_path_name(label_container.getText().strip())
file_name = label_container.getText().strip()
url = self._abs_url_from_link(listing)
log.explain(f"Found exercise detail {file_name!r} at {url}")
results.append(IliasPageElement(
results.append(IliasPageElement.create_new(
IliasElementType.EXERCISE_FILES,
url,
container_name + "/" + file_name,
None # we do not have any timestamp
_sanitize_path_name(container_name) + "/" + _sanitize_path_name(file_name),
None, # we do not have any timestamp
skip_sanitize=True
))
return results
@ -639,12 +695,18 @@ class IliasPage:
def _find_normal_entries(self) -> List[IliasPageElement]:
result: List[IliasPageElement] = []
links: List[Tag] = []
# Fetch all links and throw them to the general interpreter
links: List[Tag] = self._soup.select("a.il_ContainerItemTitle")
if self._is_course_overview_page():
log.explain("Page is a course overview page, adjusting link selector")
links.extend(self._soup.select(".il-item-title > a"))
else:
links.extend(self._soup.select("a.il_ContainerItemTitle"))
for link in links:
abs_url = self._abs_url_from_link(link)
parents = self._find_upwards_folder_hierarchy(link)
# Make sure parents are sanitized. We do not want accidental parents
parents = [_sanitize_path_name(x) for x in self._find_upwards_folder_hierarchy(link)]
if parents:
element_name = "/".join(parents) + "/" + _sanitize_path_name(link.getText())
@ -662,16 +724,18 @@ class IliasPage:
if not element_type:
continue
if element_type == IliasElementType.MEETING:
normalized = _sanitize_path_name(self._normalize_meeting_name(element_name))
log.explain(f"Normalized meeting name from {element_name!r} to {normalized!r}")
element_name = normalized
elif element_type == IliasElementType.FILE:
result.append(self._file_to_element(element_name, abs_url, link))
continue
log.explain(f"Found {element_name!r}")
result.append(IliasPageElement(element_type, abs_url, element_name, description=description))
result.append(IliasPageElement.create_new(
element_type,
abs_url,
element_name,
description=description,
skip_sanitize=True
))
result += self._find_cards()
result += self._find_mediacast_videos()
@ -694,8 +758,8 @@ class IliasPage:
log.warn_contd(f"No <video> element found for mediacast video '{element_name}'")
continue
videos.append(IliasPageElement(
type=IliasElementType.MEDIACAST_VIDEO,
videos.append(IliasPageElement.create_new(
typ=IliasElementType.MEDIACAST_VIDEO,
url=self._abs_url_from_relative(video_element.get("src")),
name=element_name,
mtime=self._find_mediacast_video_mtime(elem.findParent(name="td"))
@ -753,11 +817,14 @@ class IliasPage:
# ILIAS has proper accordions and weird blocks that look like normal headings,
# but some JS later transforms them into an accordion.
# This is for these weird JS-y blocks
# This is for these weird JS-y blocks and custom item groups
if "ilContainerItemsContainer" in parent.get("class"):
data_store_url = parent.parent.get("data-store-url", "").lower()
is_custom_item_group = "baseclass=ilcontainerblockpropertiesstoragegui" in data_store_url \
and "cont_block_id=" in data_store_url
# I am currently under the impression that *only* those JS blocks have an
# ilNoDisplay class.
if "ilNoDisplay" not in parent.get("class"):
if not is_custom_item_group and "ilNoDisplay" not in parent.get("class"):
continue
prev: Tag = parent.findPreviousSibling("div")
if "ilContainerBlockHeader" in prev.get("class"):
@ -817,7 +884,9 @@ class IliasPage:
full_path = name + "." + file_type
log.explain(f"Found file {full_path!r}")
return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date)
return IliasPageElement.create_new(
IliasElementType.FILE, url, full_path, modification_date, skip_sanitize=True
)
def _find_cards(self) -> List[IliasPageElement]:
result: List[IliasPageElement] = []
@ -834,7 +903,7 @@ class IliasPage:
log.warn_contd(f"Could not extract type for {title}")
continue
result.append(IliasPageElement(type, url, name))
result.append(IliasPageElement.create_new(type, url, name))
card_button_tiles: List[Tag] = self._soup.select(".card-title button")
@ -863,7 +932,7 @@ class IliasPage:
log.warn_contd(f"Could not extract type for {button}")
continue
result.append(IliasPageElement(type, url, name, description=description))
result.append(IliasPageElement.create_new(type, url, name, description=description))
return result
@ -917,9 +986,9 @@ class IliasPage:
@staticmethod
def _find_type_from_link(
element_name: str,
link_element: Tag,
url: str
element_name: str,
link_element: Tag,
url: str
) -> Optional[IliasElementType]:
"""
Decides which sub crawler to use for a given top level element.
@ -957,6 +1026,19 @@ class IliasPage:
if "baseClass=ilSAHSPresentationGUI" in parsed_url.query:
return IliasElementType.SCORM_LEARNING_MODULE
# other universities might have content type specified in URL path
if "_file_" in parsed_url.path:
return IliasElementType.FILE
if "_fold_" in parsed_url.path or "_copa_" in parsed_url.path:
return IliasElementType.FOLDER
if "_frm_" in parsed_url.path:
return IliasElementType.FORUM
if "_exc_" in parsed_url.path:
return IliasElementType.EXERCISE
# Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
# try to guess it from the image.
@ -1040,33 +1122,6 @@ class IliasPage:
return IliasElementType.FOLDER
@staticmethod
def _normalize_meeting_name(meeting_name: str) -> str:
"""
Normalizes meeting names, which have a relative time as their first part,
to their date in ISO format.
"""
# This checks whether we can reach a `:` without passing a `-`
if re.search(r"^[^-]+: ", meeting_name):
# Meeting name only contains date: "05. Jan 2000:"
split_delimiter = ":"
else:
# Meeting name contains date and start/end times: "05. Jan 2000, 16:00 - 17:30:"
split_delimiter = ", "
# We have a meeting day without time
date_portion_str = meeting_name.split(split_delimiter)[0]
date_portion = demangle_date(date_portion_str)
# We failed to parse the date, bail out
if not date_portion:
return meeting_name
# Replace the first section with the absolute date
rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:])
return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name
@staticmethod
def is_logged_in(soup: BeautifulSoup) -> bool:
# Normal ILIAS pages
@ -1080,6 +1135,14 @@ class IliasPage:
if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
return True
# Empty personal desktop has zero (0) markers. Match on the text...
if alert := soup.select_one(".alert-info"):
text = alert.getText().lower()
if "you have not yet selected any favourites" in text:
return True
if "sie haben aktuell noch keine favoriten ausgewählt" in text:
return True
# Video listing embeds do not have complete ILIAS html. Try to match them by
# their video listing table
video_table = soup.find(
@ -1095,6 +1158,9 @@ class IliasPage:
return True
return False
def get_permalink(self) -> Optional[str]:
return IliasPage.get_soup_permalink(self._soup)
def _abs_url_from_link(self, link_tag: Tag) -> str:
"""
Create an absolute url from an <a> tag.
@ -1107,6 +1173,13 @@ class IliasPage:
"""
return urljoin(self._page_url, relative_url)
@staticmethod
def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
if not perma_link_element or not perma_link_element.get("href"):
return None
return perma_link_element.get("href")
def _unexpected_html_warning() -> None:
log.warn("Encountered unexpected HTML structure, ignoring element.")
@ -1130,7 +1203,7 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti
date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I)
date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I)
date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
date_str = date_str.strip()
for german, english in zip(german_months, english_months):
date_str = date_str.replace(german, english)

File diff suppressed because it is too large Load Diff

View File

@ -110,6 +110,10 @@ class ExactReTf(Transformation):
except ValueError:
pass
named_groups: Dict[str, str] = match.groupdict()
for name, capture in named_groups.items():
locals_dir[name] = capture
result = eval(f"f{right!r}", {}, locals_dir)
return Transformed(PurePath(result))

View File

@ -1,2 +1,2 @@
NAME = "PFERD"
VERSION = "3.5.0"
VERSION = "3.6.0"

View File

@ -56,6 +56,17 @@ Also, you can download most ILIAS pages directly like this:
$ pferd kit-ilias-web <url> <output_directory>
```
PFERD supports other ILIAS instances as well, using the `ilias-web` crawler (see
the [config section on `ilias-web`](CONFIG.md#the-ilias-web-crawler) for more
detail on the `base-url` and `client-id` parameters):
```
$ pferd ilias-web \
--base-url https://ilias.my-university.example \
--client-id My_University desktop \
<output_directory>
```
However, the CLI only lets you download a single thing at a time, and the
resulting command can grow long quite quickly. Because of this, PFERD can also
be used with a config file.

8
flake.lock generated
View File

@ -2,16 +2,16 @@
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1694499547,
"narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=",
"lastModified": 1708979614,
"narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "e5f018cf150e29aac26c61dac0790ea023c46b24",
"rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-23.05",
"ref": "nixos-23.11",
"repo": "nixpkgs",
"type": "github"
}

View File

@ -2,7 +2,7 @@
description = "Tool for downloading course-related files from ILIAS";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
};
outputs = { self, nixpkgs }:

View File

@ -1,11 +0,0 @@
[mypy]
disallow_any_generics = True
disallow_untyped_defs = True
disallow_incomplete_defs = True
no_implicit_optional = True
warn_unused_ignores = True
warn_unreachable = True
show_error_context = True
[mypy-rich.*,bs4,keyring]
ignore_missing_imports = True

View File

@ -1,3 +1,42 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "PFERD"
dependencies = [
"aiohttp>=3.8.1",
"beautifulsoup4>=4.10.0",
"rich>=11.0.0",
"keyring>=23.5.0",
"certifi>=2021.10.8"
]
dynamic = ["version"]
requires-python = ">=3.9"
[project.scripts]
pferd = "PFERD.__main__:main"
[tool.setuptools.dynamic]
version = {attr = "PFERD.version.VERSION"}
[tool.flake8]
max-line-length = 110
[tool.isort]
line_length = 110
[tool.autopep8]
max_line_length = 110
in-place = true
recursive = true
[tool.mypy]
disallow_any_generics = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
no_implicit_optional = true
warn_unused_ignores = true
warn_unreachable = true
show_error_context = true
ignore_missing_imports = true

View File

@ -1,8 +1,8 @@
#!/usr/bin/env python3
import argparse
import time
import re
import time
from subprocess import run

View File

@ -2,5 +2,5 @@
set -e
mypy PFERD
mypy .
flake8 PFERD

View File

@ -2,5 +2,5 @@
set -e
autopep8 --recursive --in-place PFERD
isort PFERD
autopep8 .
isort .

View File

@ -13,5 +13,5 @@ pip install --upgrade setuptools
pip install --editable .
# Installing tools and type hints
pip install --upgrade mypy flake8 autopep8 isort pyinstaller
pip install --upgrade mypy flake8 flake8-pyproject autopep8 isort pyinstaller
pip install --upgrade types-chardet types-certifi

View File

@ -1,23 +0,0 @@
[metadata]
name = PFERD
version = attr: PFERD.version.VERSION
[options]
packages = find:
python_requires = >=3.9
install_requires =
aiohttp>=3.8.1
beautifulsoup4>=4.10.0
rich>=11.0.0
keyring>=23.5.0
certifi>=2021.10.8
[options.entry_points]
console_scripts =
pferd = PFERD.__main__:main
[flake8]
max_line_length = 110
[isort]
line_length = 110