mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Compare commits
1 Commits
29251fa003
...
update-che
Author | SHA1 | Date | |
---|---|---|---|
2d145e7c94 |
8
.github/workflows/build-and-release.yml
vendored
8
.github/workflows/build-and-release.yml
vendored
@ -17,9 +17,9 @@ jobs:
|
|||||||
python: ["3.9"]
|
python: ["3.9"]
|
||||||
steps:
|
steps:
|
||||||
|
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v2
|
||||||
|
|
||||||
- uses: actions/setup-python@v4
|
- uses: actions/setup-python@v2
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python }}
|
python-version: ${{ matrix.python }}
|
||||||
|
|
||||||
@ -45,7 +45,7 @@ jobs:
|
|||||||
run: mv dist/pferd* dist/pferd-${{ matrix.os }}
|
run: mv dist/pferd* dist/pferd-${{ matrix.os }}
|
||||||
|
|
||||||
- name: Upload binary
|
- name: Upload binary
|
||||||
uses: actions/upload-artifact@v3
|
uses: actions/upload-artifact@v2
|
||||||
with:
|
with:
|
||||||
name: Binaries
|
name: Binaries
|
||||||
path: dist/pferd-${{ matrix.os }}
|
path: dist/pferd-${{ matrix.os }}
|
||||||
@ -57,7 +57,7 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
|
|
||||||
- name: Download binaries
|
- name: Download binaries
|
||||||
uses: actions/download-artifact@v3
|
uses: actions/download-artifact@v2
|
||||||
with:
|
with:
|
||||||
name: Binaries
|
name: Binaries
|
||||||
|
|
||||||
|
36
CHANGELOG.md
36
CHANGELOG.md
@ -23,42 +23,8 @@ ambiguous situations.
|
|||||||
## Unreleased
|
## Unreleased
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Crawling of courses with the timeline view as the default tab
|
- Forum crawling crashing when parsing empty (= 0 messages) threads
|
||||||
- Crawling of file and custom opencast cards
|
|
||||||
- Crawling of button cards without descriptions
|
|
||||||
- Abort crawling when encountering an unexpected ilias root page redirect
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- `no-delete-prompt-overwrite` conflict resolution strategy
|
|
||||||
- support for ILIAS learning modules
|
|
||||||
- `show_not_deleted` option to stop printing the "Not Deleted" status or report
|
|
||||||
message. This combines nicely with the `no-delete-prompt-overwrite` strategy,
|
|
||||||
causing PFERD to mostly ignore local-only files.
|
|
||||||
|
|
||||||
## 3.4.3 - 2022-11-29
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Missing documentation for `forums` option
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- Clear up error message shown when multiple paths are found to an element
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- IPD crawler unnecessarily appending trailing slashes
|
|
||||||
- Crawling opencast when ILIAS is set to English
|
|
||||||
|
|
||||||
## 3.4.2 - 2022-10-26
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- Recognize and crawl content pages in cards
|
|
||||||
- Recognize and ignore surveys
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- Forum crawling crashing when a thread has no messages at all
|
|
||||||
- Forum crawling crashing when a forum has no threads at all
|
- Forum crawling crashing when a forum has no threads at all
|
||||||
- Ilias login failing in some cases
|
|
||||||
- Crawling of paginated future meetings
|
|
||||||
- IPD crawler handling of URLs without trailing slash
|
|
||||||
|
|
||||||
## 3.4.1 - 2022-08-17
|
## 3.4.1 - 2022-08-17
|
||||||
|
|
||||||
|
13
CONFIG.md
13
CONFIG.md
@ -26,9 +26,6 @@ default values for the other sections.
|
|||||||
`Added ...`) while running a crawler. (Default: `yes`)
|
`Added ...`) while running a crawler. (Default: `yes`)
|
||||||
- `report`: Whether PFERD should print a report of added, changed and deleted
|
- `report`: Whether PFERD should print a report of added, changed and deleted
|
||||||
local files for all crawlers before exiting. (Default: `yes`)
|
local files for all crawlers before exiting. (Default: `yes`)
|
||||||
- `show_not_deleted`: Whether PFERD should print messages in status and report
|
|
||||||
when a local-only file wasn't deleted. Combines nicely with the
|
|
||||||
`no-delete-prompt-overwrite` conflict resolution strategy.
|
|
||||||
- `share_cookies`: Whether crawlers should share cookies where applicable. For
|
- `share_cookies`: Whether crawlers should share cookies where applicable. For
|
||||||
example, some crawlers share cookies if they crawl the same website using the
|
example, some crawlers share cookies if they crawl the same website using the
|
||||||
same account. (Default: `yes`)
|
same account. (Default: `yes`)
|
||||||
@ -78,9 +75,6 @@ common to all crawlers:
|
|||||||
using `prompt` and always choosing "yes".
|
using `prompt` and always choosing "yes".
|
||||||
- `no-delete`: Never delete local files, but overwrite local files if the
|
- `no-delete`: Never delete local files, but overwrite local files if the
|
||||||
remote file is different.
|
remote file is different.
|
||||||
- `no-delete-prompt-overwrite`: Never delete local files, but prompt to
|
|
||||||
overwrite local files if the remote file is different. Combines nicely
|
|
||||||
with the `show_not_deleted` option.
|
|
||||||
- `transform`: Rules for renaming and excluding certain files and directories.
|
- `transform`: Rules for renaming and excluding certain files and directories.
|
||||||
For more details, see [this section](#transformation-rules). (Default: empty)
|
For more details, see [this section](#transformation-rules). (Default: empty)
|
||||||
- `tasks`: The maximum number of concurrent tasks (such as crawling or
|
- `tasks`: The maximum number of concurrent tasks (such as crawling or
|
||||||
@ -92,9 +86,6 @@ common to all crawlers:
|
|||||||
load for the crawl target. (Default: `0.0`)
|
load for the crawl target. (Default: `0.0`)
|
||||||
- `windows_paths`: Whether PFERD should find alternative names for paths that
|
- `windows_paths`: Whether PFERD should find alternative names for paths that
|
||||||
are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
|
are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
|
||||||
- `aliases`: List of strings that are considered as an alias when invoking with
|
|
||||||
the `--crawler` or `-C` flag. If there is more than one crawl section with
|
|
||||||
the same aliases all are selected. Thereby, you can group different crawlers.
|
|
||||||
|
|
||||||
Some crawlers may also require credentials for authentication. To configure how
|
Some crawlers may also require credentials for authentication. To configure how
|
||||||
the crawler obtains its credentials, the `auth` option is used. It is set to the
|
the crawler obtains its credentials, the `auth` option is used. It is set to the
|
||||||
@ -109,7 +100,6 @@ username = foo
|
|||||||
password = bar
|
password = bar
|
||||||
|
|
||||||
[crawl:something]
|
[crawl:something]
|
||||||
aliases = [sth, some]
|
|
||||||
type = some-complex-crawler
|
type = some-complex-crawler
|
||||||
auth = auth:example
|
auth = auth:example
|
||||||
on_conflict = no-delete
|
on_conflict = no-delete
|
||||||
@ -191,7 +181,6 @@ script once per day should be fine.
|
|||||||
redirect to the actual URL. Set to a negative value to disable the automatic
|
redirect to the actual URL. Set to a negative value to disable the automatic
|
||||||
redirect. (Default: `-1`)
|
redirect. (Default: `-1`)
|
||||||
- `videos`: Whether to download videos. (Default: `no`)
|
- `videos`: Whether to download videos. (Default: `no`)
|
||||||
- `forums`: Whether to download forum threads. (Default: `no`)
|
|
||||||
- `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
|
- `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
|
||||||
`20.0`)
|
`20.0`)
|
||||||
|
|
||||||
@ -300,7 +289,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.
|
|||||||
Example: `foo/bar --> baz`
|
Example: `foo/bar --> baz`
|
||||||
- Doesn't match `foo`, `a/foo/bar` or `foo/baz`
|
- Doesn't match `foo`, `a/foo/bar` or `foo/baz`
|
||||||
- Converts `foo/bar` into `baz`
|
- Converts `foo/bar` into `baz`
|
||||||
- Converts `foo/bar/wargl` into `baz/wargl`
|
- Converts `foo/bar/wargl` into `bar/wargl`
|
||||||
|
|
||||||
Example: `foo/bar --> !`
|
Example: `foo/bar --> !`
|
||||||
- Doesn't match `foo`, `a/foo/bar` or `foo/baz`
|
- Doesn't match `foo`, `a/foo/bar` or `foo/baz`
|
||||||
|
3
LICENSE
3
LICENSE
@ -1,6 +1,5 @@
|
|||||||
Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw,
|
Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw,
|
||||||
TheChristophe, Scriptim, thelukasprobst, Toorero,
|
TheChristophe, Scriptim, thelukasprobst, Toorero
|
||||||
Mr-Pine
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
this software and associated documentation files (the "Software"), to deal in
|
this software and associated documentation files (the "Software"), to deal in
|
||||||
|
@ -5,6 +5,8 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from PFERD.update import check_for_updates
|
||||||
|
|
||||||
from .auth import AuthLoadError
|
from .auth import AuthLoadError
|
||||||
from .cli import PARSER, ParserLoadError, load_default_section
|
from .cli import PARSER, ParserLoadError, load_default_section
|
||||||
from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
|
from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
|
||||||
@ -47,8 +49,6 @@ def configure_logging_from_args(args: argparse.Namespace) -> None:
|
|||||||
log.output_explain = args.explain
|
log.output_explain = args.explain
|
||||||
if args.status is not None:
|
if args.status is not None:
|
||||||
log.output_status = args.status
|
log.output_status = args.status
|
||||||
if args.show_not_deleted is not None:
|
|
||||||
log.output_not_deleted = args.show_not_deleted
|
|
||||||
if args.report is not None:
|
if args.report is not None:
|
||||||
log.output_report = args.report
|
log.output_report = args.report
|
||||||
|
|
||||||
@ -74,8 +74,6 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N
|
|||||||
log.output_status = config.default_section.status()
|
log.output_status = config.default_section.status()
|
||||||
if args.report is None:
|
if args.report is None:
|
||||||
log.output_report = config.default_section.report()
|
log.output_report = config.default_section.report()
|
||||||
if args.show_not_deleted is None:
|
|
||||||
log.output_not_deleted = config.default_section.show_not_deleted()
|
|
||||||
except ConfigOptionError as e:
|
except ConfigOptionError as e:
|
||||||
log.error(str(e))
|
log.error(str(e))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
@ -138,6 +136,11 @@ def main() -> None:
|
|||||||
loop.run_until_complete(asyncio.sleep(1))
|
loop.run_until_complete(asyncio.sleep(1))
|
||||||
loop.close()
|
loop.close()
|
||||||
else:
|
else:
|
||||||
|
log.explain_topic("Checking for updates")
|
||||||
|
if not args.skip_update_check:
|
||||||
|
asyncio.run(check_for_updates())
|
||||||
|
else:
|
||||||
|
log.explain("Update check skipped due to configuration option")
|
||||||
asyncio.run(pferd.run(args.debug_transforms))
|
asyncio.run(pferd.run(args.debug_transforms))
|
||||||
except (ConfigOptionError, AuthLoadError) as e:
|
except (ConfigOptionError, AuthLoadError) as e:
|
||||||
log.unlock()
|
log.unlock()
|
||||||
|
@ -151,6 +151,11 @@ PARSER.add_argument(
|
|||||||
action="version",
|
action="version",
|
||||||
version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
|
version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
|
||||||
)
|
)
|
||||||
|
PARSER.add_argument(
|
||||||
|
"--skip-update-check",
|
||||||
|
action="store_true",
|
||||||
|
help="disable automatic update checks at startup"
|
||||||
|
)
|
||||||
PARSER.add_argument(
|
PARSER.add_argument(
|
||||||
"--config", "-c",
|
"--config", "-c",
|
||||||
type=Path,
|
type=Path,
|
||||||
@ -215,11 +220,6 @@ PARSER.add_argument(
|
|||||||
action=BooleanOptionalAction,
|
action=BooleanOptionalAction,
|
||||||
help="whether crawlers should share cookies where applicable"
|
help="whether crawlers should share cookies where applicable"
|
||||||
)
|
)
|
||||||
PARSER.add_argument(
|
|
||||||
"--show-not-deleted",
|
|
||||||
action=BooleanOptionalAction,
|
|
||||||
help="print messages in status and report when PFERD did not delete a local only file"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def load_default_section(
|
def load_default_section(
|
||||||
@ -238,7 +238,6 @@ def load_default_section(
|
|||||||
section["report"] = "yes" if args.report else "no"
|
section["report"] = "yes" if args.report else "no"
|
||||||
if args.share_cookies is not None:
|
if args.share_cookies is not None:
|
||||||
section["share_cookies"] = "yes" if args.share_cookies else "no"
|
section["share_cookies"] = "yes" if args.share_cookies else "no"
|
||||||
if args.show_not_deleted is not None:
|
|
||||||
section["show_not_deleted"] = "yes" if args.show_not_deleted else "no"
|
|
||||||
|
|
||||||
SUBPARSERS = PARSER.add_subparsers(title="crawlers")
|
SUBPARSERS = PARSER.add_subparsers(title="crawlers")
|
||||||
|
@ -82,9 +82,6 @@ class DefaultSection(Section):
|
|||||||
def report(self) -> bool:
|
def report(self) -> bool:
|
||||||
return self.s.getboolean("report", fallback=True)
|
return self.s.getboolean("report", fallback=True)
|
||||||
|
|
||||||
def show_not_deleted(self) -> bool:
|
|
||||||
return self.s.getboolean("show_not_deleted", fallback=True)
|
|
||||||
|
|
||||||
def share_cookies(self) -> bool:
|
def share_cookies(self) -> bool:
|
||||||
return self.s.getboolean("share_cookies", fallback=True)
|
return self.s.getboolean("share_cookies", fallback=True)
|
||||||
|
|
||||||
|
@ -1,10 +1,6 @@
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import bs4
|
|
||||||
|
|
||||||
from PFERD.utils import soupify
|
|
||||||
|
|
||||||
_link_template_plain = "{{link}}"
|
_link_template_plain = "{{link}}"
|
||||||
_link_template_fancy = """
|
_link_template_fancy = """
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
@ -98,71 +94,6 @@ _link_template_internet_shortcut = """
|
|||||||
URL={{link}}
|
URL={{link}}
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
_learning_module_template = """
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html lang="en">
|
|
||||||
<head>
|
|
||||||
<meta charset="UTF-8">
|
|
||||||
<title>{{name}}</title>
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<style>
|
|
||||||
* {
|
|
||||||
box-sizing: border-box;
|
|
||||||
}
|
|
||||||
.center-flex {
|
|
||||||
display: flex;
|
|
||||||
align-items: center;
|
|
||||||
justify-content: center;
|
|
||||||
}
|
|
||||||
.nav {
|
|
||||||
display: flex;
|
|
||||||
justify-content: space-between;
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
<body class="center-flex">
|
|
||||||
{{body}}
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str:
|
|
||||||
# Seems to be comments, ignore those.
|
|
||||||
for elem in body.select(".il-copg-mob-fullscreen-modal"):
|
|
||||||
elem.decompose()
|
|
||||||
|
|
||||||
nav_template = """
|
|
||||||
<div class="nav">
|
|
||||||
{{left}}
|
|
||||||
{{right}}
|
|
||||||
</div>
|
|
||||||
"""
|
|
||||||
if prev and body.select_one(".ilc_page_lnav_LeftNavigation"):
|
|
||||||
text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip()
|
|
||||||
left = f'<a href="{prev}">{text}</a>'
|
|
||||||
else:
|
|
||||||
left = "<span></span>"
|
|
||||||
|
|
||||||
if next and body.select_one(".ilc_page_rnav_RightNavigation"):
|
|
||||||
text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip()
|
|
||||||
right = f'<a href="{next}">{text}</a>'
|
|
||||||
else:
|
|
||||||
right = "<span></span>"
|
|
||||||
|
|
||||||
if top_nav := body.select_one(".ilc_page_tnav_TopNavigation"):
|
|
||||||
top_nav.replace_with(
|
|
||||||
soupify(nav_template.replace("{{left}}", left).replace("{{right}}", right).encode())
|
|
||||||
)
|
|
||||||
|
|
||||||
if bot_nav := body.select_one(".ilc_page_bnav_BottomNavigation"):
|
|
||||||
bot_nav.replace_with(soupify(nav_template.replace(
|
|
||||||
"{{left}}", left).replace("{{right}}", right).encode())
|
|
||||||
)
|
|
||||||
|
|
||||||
body = body.prettify()
|
|
||||||
return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name)
|
|
||||||
|
|
||||||
|
|
||||||
class Links(Enum):
|
class Links(Enum):
|
||||||
IGNORE = "ignore"
|
IGNORE = "ignore"
|
||||||
@ -171,24 +102,24 @@ class Links(Enum):
|
|||||||
INTERNET_SHORTCUT = "internet-shortcut"
|
INTERNET_SHORTCUT = "internet-shortcut"
|
||||||
|
|
||||||
def template(self) -> Optional[str]:
|
def template(self) -> Optional[str]:
|
||||||
if self == Links.FANCY:
|
if self == self.FANCY:
|
||||||
return _link_template_fancy
|
return _link_template_fancy
|
||||||
elif self == Links.PLAINTEXT:
|
elif self == self.PLAINTEXT:
|
||||||
return _link_template_plain
|
return _link_template_plain
|
||||||
elif self == Links.INTERNET_SHORTCUT:
|
elif self == self.INTERNET_SHORTCUT:
|
||||||
return _link_template_internet_shortcut
|
return _link_template_internet_shortcut
|
||||||
elif self == Links.IGNORE:
|
elif self == self.IGNORE:
|
||||||
return None
|
return None
|
||||||
raise ValueError("Missing switch case")
|
raise ValueError("Missing switch case")
|
||||||
|
|
||||||
def extension(self) -> Optional[str]:
|
def extension(self) -> Optional[str]:
|
||||||
if self == Links.FANCY:
|
if self == self.FANCY:
|
||||||
return ".html"
|
return ".html"
|
||||||
elif self == Links.PLAINTEXT:
|
elif self == self.PLAINTEXT:
|
||||||
return ".txt"
|
return ".txt"
|
||||||
elif self == Links.INTERNET_SHORTCUT:
|
elif self == self.INTERNET_SHORTCUT:
|
||||||
return ".url"
|
return ".url"
|
||||||
elif self == Links.IGNORE:
|
elif self == self.IGNORE:
|
||||||
return None
|
return None
|
||||||
raise ValueError("Missing switch case")
|
raise ValueError("Missing switch case")
|
||||||
|
|
||||||
|
@ -82,7 +82,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup:
|
|||||||
dummy.decompose()
|
dummy.decompose()
|
||||||
if len(children) > 1:
|
if len(children) > 1:
|
||||||
continue
|
continue
|
||||||
if isinstance(type(children[0]), Comment):
|
if type(children[0]) == Comment:
|
||||||
dummy.decompose()
|
dummy.decompose()
|
||||||
|
|
||||||
for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
|
for hrule_imposter in soup.find_all(class_="ilc_section_Separator"):
|
||||||
|
@ -22,10 +22,8 @@ class IliasElementType(Enum):
|
|||||||
FOLDER = "folder"
|
FOLDER = "folder"
|
||||||
FORUM = "forum"
|
FORUM = "forum"
|
||||||
LINK = "link"
|
LINK = "link"
|
||||||
LEARNING_MODULE = "learning_module"
|
|
||||||
BOOKING = "booking"
|
BOOKING = "booking"
|
||||||
MEETING = "meeting"
|
MEETING = "meeting"
|
||||||
SURVEY = "survey"
|
|
||||||
VIDEO = "video"
|
VIDEO = "video"
|
||||||
VIDEO_PLAYER = "video_player"
|
VIDEO_PLAYER = "video_player"
|
||||||
VIDEO_FOLDER = "video_folder"
|
VIDEO_FOLDER = "video_folder"
|
||||||
@ -72,14 +70,6 @@ class IliasForumThread:
|
|||||||
mtime: Optional[datetime]
|
mtime: Optional[datetime]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class IliasLearningModulePage:
|
|
||||||
title: str
|
|
||||||
content: Tag
|
|
||||||
next_url: Optional[str]
|
|
||||||
previous_url: Optional[str]
|
|
||||||
|
|
||||||
|
|
||||||
class IliasPage:
|
class IliasPage:
|
||||||
|
|
||||||
def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]):
|
def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]):
|
||||||
@ -88,16 +78,6 @@ class IliasPage:
|
|||||||
self._page_type = source_element.type if source_element else None
|
self._page_type = source_element.type if source_element else None
|
||||||
self._source_name = source_element.name if source_element else ""
|
self._source_name = source_element.name if source_element else ""
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def is_root_page(soup: BeautifulSoup) -> bool:
|
|
||||||
permalink = soup.find(id="current_perma_link")
|
|
||||||
if permalink is None:
|
|
||||||
return False
|
|
||||||
value = permalink.attrs.get("value")
|
|
||||||
if value is None:
|
|
||||||
return False
|
|
||||||
return "goto.php?target=root_" in value
|
|
||||||
|
|
||||||
def get_child_elements(self) -> List[IliasPageElement]:
|
def get_child_elements(self) -> List[IliasPageElement]:
|
||||||
"""
|
"""
|
||||||
Return all child page elements you can find here.
|
Return all child page elements you can find here.
|
||||||
@ -145,34 +125,6 @@ class IliasPage:
|
|||||||
|
|
||||||
return BeautifulSoup(raw_html, "html.parser")
|
return BeautifulSoup(raw_html, "html.parser")
|
||||||
|
|
||||||
def get_learning_module_data(self) -> Optional[IliasLearningModulePage]:
|
|
||||||
if not self._is_learning_module_page():
|
|
||||||
return None
|
|
||||||
content = self._soup.select_one("#ilLMPageContent")
|
|
||||||
title = self._soup.select_one(".ilc_page_title_PageTitle").getText().strip()
|
|
||||||
return IliasLearningModulePage(
|
|
||||||
title=title,
|
|
||||||
content=content,
|
|
||||||
next_url=self._find_learning_module_next(),
|
|
||||||
previous_url=self._find_learning_module_prev()
|
|
||||||
)
|
|
||||||
|
|
||||||
def _find_learning_module_next(self) -> Optional[str]:
|
|
||||||
for link in self._soup.select("a.ilc_page_rnavlink_RightNavigationLink"):
|
|
||||||
url = self._abs_url_from_link(link)
|
|
||||||
if "baseClass=ilLMPresentationGUI" not in url:
|
|
||||||
continue
|
|
||||||
return url
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _find_learning_module_prev(self) -> Optional[str]:
|
|
||||||
for link in self._soup.select("a.ilc_page_lnavlink_LeftNavigationLink"):
|
|
||||||
url = self._abs_url_from_link(link)
|
|
||||||
if "baseClass=ilLMPresentationGUI" not in url:
|
|
||||||
continue
|
|
||||||
return url
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_download_forum_data(self) -> Optional[IliasDownloadForumData]:
|
def get_download_forum_data(self) -> Optional[IliasDownloadForumData]:
|
||||||
form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x})
|
form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x})
|
||||||
if not form:
|
if not form:
|
||||||
@ -181,7 +133,7 @@ class IliasPage:
|
|||||||
|
|
||||||
thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]
|
thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]
|
||||||
|
|
||||||
form_data: Dict[str, Union[str, List[str]]] = {
|
form_data: Dict[str, Union[str, List[ſtr]]] = {
|
||||||
"thread_ids[]": thread_ids,
|
"thread_ids[]": thread_ids,
|
||||||
"selected_cmd2": "html",
|
"selected_cmd2": "html",
|
||||||
"select_cmd2": "Ausführen",
|
"select_cmd2": "Ausführen",
|
||||||
@ -205,8 +157,6 @@ class IliasPage:
|
|||||||
if self._contains_collapsed_future_meetings():
|
if self._contains_collapsed_future_meetings():
|
||||||
log.explain("Requesting *all* future meetings")
|
log.explain("Requesting *all* future meetings")
|
||||||
return self._uncollapse_future_meetings_url()
|
return self._uncollapse_future_meetings_url()
|
||||||
if not self._is_content_tab_selected():
|
|
||||||
return self._select_content_page_url()
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _is_forum_page(self) -> bool:
|
def _is_forum_page(self) -> bool:
|
||||||
@ -259,12 +209,6 @@ class IliasPage:
|
|||||||
return False
|
return False
|
||||||
return "target=copa_" in link.get("value")
|
return "target=copa_" in link.get("value")
|
||||||
|
|
||||||
def _is_learning_module_page(self) -> bool:
|
|
||||||
link = self._soup.find(id="current_perma_link")
|
|
||||||
if not link:
|
|
||||||
return False
|
|
||||||
return "target=pg_" in link.get("value")
|
|
||||||
|
|
||||||
def _contains_collapsed_future_meetings(self) -> bool:
|
def _contains_collapsed_future_meetings(self) -> bool:
|
||||||
return self._uncollapse_future_meetings_url() is not None
|
return self._uncollapse_future_meetings_url() is not None
|
||||||
|
|
||||||
@ -275,27 +219,6 @@ class IliasPage:
|
|||||||
link = self._abs_url_from_link(element)
|
link = self._abs_url_from_link(element)
|
||||||
return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings")
|
return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings")
|
||||||
|
|
||||||
def _is_content_tab_selected(self) -> bool:
|
|
||||||
return self._select_content_page_url() is None
|
|
||||||
|
|
||||||
def _select_content_page_url(self) -> Optional[IliasPageElement]:
|
|
||||||
tab = self._soup.find(
|
|
||||||
id="tab_view_content",
|
|
||||||
attrs={"class": lambda x: x is not None and "active" not in x}
|
|
||||||
)
|
|
||||||
# Already selected (or not found)
|
|
||||||
if not tab:
|
|
||||||
return None
|
|
||||||
link = tab.find("a")
|
|
||||||
if link:
|
|
||||||
link = self._abs_url_from_link(link)
|
|
||||||
return IliasPageElement(IliasElementType.FOLDER, link, "select content page")
|
|
||||||
|
|
||||||
_unexpected_html_warning()
|
|
||||||
log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.")
|
|
||||||
log.warn_contd("PFERD might not find content on the course's main page.")
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _player_to_video(self) -> List[IliasPageElement]:
|
def _player_to_video(self) -> List[IliasPageElement]:
|
||||||
# Fetch the actual video page. This is a small wrapper page initializing a javascript
|
# Fetch the actual video page. This is a small wrapper page initializing a javascript
|
||||||
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
|
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
|
||||||
@ -442,7 +365,7 @@ class IliasPage:
|
|||||||
"""
|
"""
|
||||||
# Video start links are marked with an "Abspielen" link
|
# Video start links are marked with an "Abspielen" link
|
||||||
video_links: List[Tag] = self._soup.findAll(
|
video_links: List[Tag] = self._soup.findAll(
|
||||||
name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
|
name="a", text=re.compile(r"\s*Abspielen\s*")
|
||||||
)
|
)
|
||||||
|
|
||||||
results: List[IliasPageElement] = []
|
results: List[IliasPageElement] = []
|
||||||
@ -761,11 +684,7 @@ class IliasPage:
|
|||||||
"div",
|
"div",
|
||||||
attrs={"class": lambda x: x and "caption" in x},
|
attrs={"class": lambda x: x and "caption" in x},
|
||||||
)
|
)
|
||||||
caption_container = caption_parent.find_next_sibling("div")
|
description = caption_parent.find_next_sibling("div").getText().strip()
|
||||||
if caption_container:
|
|
||||||
description = caption_container.getText().strip()
|
|
||||||
else:
|
|
||||||
description = None
|
|
||||||
|
|
||||||
if not type:
|
if not type:
|
||||||
_unexpected_html_warning()
|
_unexpected_html_warning()
|
||||||
@ -795,7 +714,7 @@ class IliasPage:
|
|||||||
|
|
||||||
icon: Tag = card_root.select_one(".il-card-repository-head .icon")
|
icon: Tag = card_root.select_one(".il-card-repository-head .icon")
|
||||||
|
|
||||||
if "opencast" in icon["class"] or "xoct" in icon["class"]:
|
if "opencast" in icon["class"]:
|
||||||
return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED
|
return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED
|
||||||
if "exc" in icon["class"]:
|
if "exc" in icon["class"]:
|
||||||
return IliasElementType.EXERCISE
|
return IliasElementType.EXERCISE
|
||||||
@ -811,12 +730,6 @@ class IliasPage:
|
|||||||
return IliasElementType.TEST
|
return IliasElementType.TEST
|
||||||
if "fold" in icon["class"]:
|
if "fold" in icon["class"]:
|
||||||
return IliasElementType.FOLDER
|
return IliasElementType.FOLDER
|
||||||
if "copa" in icon["class"]:
|
|
||||||
return IliasElementType.FOLDER
|
|
||||||
if "svy" in icon["class"]:
|
|
||||||
return IliasElementType.SURVEY
|
|
||||||
if "file" in icon["class"]:
|
|
||||||
return IliasElementType.FILE
|
|
||||||
|
|
||||||
_unexpected_html_warning()
|
_unexpected_html_warning()
|
||||||
log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")
|
log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")
|
||||||
@ -855,9 +768,6 @@ class IliasPage:
|
|||||||
if "cmdClass=ilobjtestgui" in parsed_url.query:
|
if "cmdClass=ilobjtestgui" in parsed_url.query:
|
||||||
return IliasElementType.TEST
|
return IliasElementType.TEST
|
||||||
|
|
||||||
if "baseClass=ilLMPresentationGUI" in parsed_url.query:
|
|
||||||
return IliasElementType.LEARNING_MODULE
|
|
||||||
|
|
||||||
# Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
|
# Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
|
||||||
# try to guess it from the image.
|
# try to guess it from the image.
|
||||||
|
|
||||||
|
@ -1,11 +1,8 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import base64
|
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
from collections.abc import Awaitable, Coroutine
|
from collections.abc import Awaitable, Coroutine
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from typing import Any, Callable, Dict, List, Literal, Optional, Set, Union, cast
|
from typing import Any, Callable, Dict, List, Optional, Set, Union, cast
|
||||||
from urllib.parse import urljoin
|
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import yarl
|
import yarl
|
||||||
@ -19,10 +16,10 @@ from ...output_dir import FileSink, Redownload
|
|||||||
from ...utils import fmt_path, soupify, url_set_query_param
|
from ...utils import fmt_path, soupify, url_set_query_param
|
||||||
from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
|
from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
|
||||||
from ..http_crawler import HttpCrawler, HttpCrawlerSection
|
from ..http_crawler import HttpCrawler, HttpCrawlerSection
|
||||||
from .file_templates import Links, learning_module_template
|
from .file_templates import Links
|
||||||
from .ilias_html_cleaner import clean, insert_base_markup
|
from .ilias_html_cleaner import clean, insert_base_markup
|
||||||
from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage,
|
from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement,
|
||||||
IliasPageElement, _sanitize_path_name, parse_ilias_forum_export)
|
_sanitize_path_name, parse_ilias_forum_export)
|
||||||
|
|
||||||
TargetType = Union[str, int]
|
TargetType = Union[str, int]
|
||||||
|
|
||||||
@ -197,7 +194,7 @@ instance's greatest bottleneck.
|
|||||||
self._links = section.links()
|
self._links = section.links()
|
||||||
self._videos = section.videos()
|
self._videos = section.videos()
|
||||||
self._forums = section.forums()
|
self._forums = section.forums()
|
||||||
self._visited_urls: Dict[str, PurePath] = dict()
|
self._visited_urls: Set[str] = set()
|
||||||
|
|
||||||
async def _run(self) -> None:
|
async def _run(self) -> None:
|
||||||
if isinstance(self._target, int):
|
if isinstance(self._target, int):
|
||||||
@ -242,7 +239,7 @@ instance's greatest bottleneck.
|
|||||||
|
|
||||||
# Duplicated code, but the root page is special - we want to avoid fetching it twice!
|
# Duplicated code, but the root page is special - we want to avoid fetching it twice!
|
||||||
while next_stage_url:
|
while next_stage_url:
|
||||||
soup = await self._get_page(next_stage_url, root_page_allowed=True)
|
soup = await self._get_page(next_stage_url)
|
||||||
|
|
||||||
if current_parent is None and expected_id is not None:
|
if current_parent is None and expected_id is not None:
|
||||||
perma_link_element: Tag = soup.find(id="current_perma_link")
|
perma_link_element: Tag = soup.find(id="current_perma_link")
|
||||||
@ -351,11 +348,9 @@ instance's greatest bottleneck.
|
|||||||
) -> Optional[Coroutine[Any, Any, None]]:
|
) -> Optional[Coroutine[Any, Any, None]]:
|
||||||
if element.url in self._visited_urls:
|
if element.url in self._visited_urls:
|
||||||
raise CrawlWarning(
|
raise CrawlWarning(
|
||||||
f"Found second path to element {element.name!r} at {element.url!r}. "
|
f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
|
||||||
+ f"First path: {fmt_path(self._visited_urls[element.url])}. "
|
|
||||||
+ f"Second path: {fmt_path(parent_path)}."
|
|
||||||
)
|
)
|
||||||
self._visited_urls[element.url] = parent_path
|
self._visited_urls.add(element.url)
|
||||||
|
|
||||||
element_path = PurePath(parent_path, element.name)
|
element_path = PurePath(parent_path, element.name)
|
||||||
|
|
||||||
@ -382,23 +377,10 @@ instance's greatest bottleneck.
|
|||||||
return None
|
return None
|
||||||
return await self._handle_forum(element, element_path)
|
return await self._handle_forum(element, element_path)
|
||||||
elif element.type == IliasElementType.TEST:
|
elif element.type == IliasElementType.TEST:
|
||||||
log.status(
|
log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
|
||||||
"[bold bright_black]",
|
log.explain("Tests contain no relevant files")
|
||||||
"Ignored",
|
log.explain("Answer: No")
|
||||||
fmt_path(element_path),
|
|
||||||
"[bright_black](tests contain no relevant data)"
|
|
||||||
)
|
|
||||||
return None
|
return None
|
||||||
elif element.type == IliasElementType.SURVEY:
|
|
||||||
log.status(
|
|
||||||
"[bold bright_black]",
|
|
||||||
"Ignored",
|
|
||||||
fmt_path(element_path),
|
|
||||||
"[bright_black](surveys contain no relevant data)"
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
elif element.type == IliasElementType.LEARNING_MODULE:
|
|
||||||
return await self._handle_learning_module(element, element_path)
|
|
||||||
elif element.type == IliasElementType.LINK:
|
elif element.type == IliasElementType.LINK:
|
||||||
return await self._handle_link(element, element_path)
|
return await self._handle_link(element, element_path)
|
||||||
elif element.type == IliasElementType.BOOKING:
|
elif element.type == IliasElementType.BOOKING:
|
||||||
@ -744,141 +726,12 @@ instance's greatest bottleneck.
|
|||||||
sink.file.write(content.encode("utf-8"))
|
sink.file.write(content.encode("utf-8"))
|
||||||
sink.done()
|
sink.done()
|
||||||
|
|
||||||
async def _handle_learning_module(
|
async def _get_page(self, url: str) -> BeautifulSoup:
|
||||||
self,
|
|
||||||
element: IliasPageElement,
|
|
||||||
element_path: PurePath,
|
|
||||||
) -> Optional[Coroutine[Any, Any, None]]:
|
|
||||||
maybe_cl = await self.crawl(element_path)
|
|
||||||
if not maybe_cl:
|
|
||||||
return None
|
|
||||||
return self._crawl_learning_module(element, maybe_cl)
|
|
||||||
|
|
||||||
@_iorepeat(3, "crawling learning module")
@anoncritical
async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None:
    """
    Crawl all pages of a learning module and download each of them.

    The page behind element.url is parsed first. From there the module is
    walked towards both ends by following the previous/next URLs of the
    pages. Every page title is then prefixed with its two-digit position and
    one download task per page is started. Each task receives the (already
    prefixed) titles of its neighbours so the pages can link to each other.
    """
    pages: List[IliasLearningModulePage] = []

    async with cl:
        log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
        log.explain(f"URL: {element.url}")
        soup = await self._get_page(element.url)
        start_page = IliasPage(soup, element.url, None).get_learning_module_data()
        if start_page:
            earlier_pages = await self._crawl_learning_module_direction(
                cl.path, start_page.previous_url, "left"
            )
            # The walk to the left returns the page closest to the start page
            # first. Reverse it so the pages end up in reading order.
            pages.extend(reversed(earlier_pages))
            pages.append(start_page)
            pages.extend(await self._crawl_learning_module_direction(
                cl.path, start_page.next_url, "right"
            ))

    # Reflect their natural ordering in the file names
    for index, page in enumerate(pages):
        page.title = f"{index:02}_{page.title}"

    last_index = len(pages) - 1
    tasks: List[Awaitable[None]] = []
    for index, page in enumerate(pages):
        # Neighbours are identified by title, not by URL: the download step
        # derives the file names of the sibling pages from them.
        prev_title = pages[index - 1].title if index > 0 else None
        next_title = pages[index + 1].title if index < last_index else None
        tasks.append(asyncio.create_task(
            self._download_learning_module_page(cl.path, page, prev_title, next_title)
        ))

    # And execute them
    await self.gather(tasks)
|
|
||||||
|
|
||||||
async def _crawl_learning_module_direction(
    self,
    path: PurePath,
    start_url: Optional[str],
    dir: Union[Literal["left"], Literal["right"]]
) -> List[IliasLearningModulePage]:
    """
    Follow a learning module in one direction and collect its pages.

    Starting at start_url, each page is fetched and parsed. With dir "left"
    the walk continues at the page's previous_url, otherwise at its next_url.
    The walk ends when there is no further URL or when a fetched page yields
    no learning module data.

    Returns the pages in the order they were visited, i.e. the page closest
    to the starting point comes first. An empty list is returned if start_url
    is empty.
    """
    pages: List[IliasLearningModulePage] = []

    current_url: Optional[str] = start_url
    counter = 0
    while current_url:
        log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
        log.explain(f"URL: {current_url}")
        soup = await self._get_page(current_url)
        lm_page = IliasPage(soup, current_url, None).get_learning_module_data()
        if not lm_page:
            # Without page data there is no follow-up URL. Stop here, otherwise
            # the very same URL would be fetched again and again.
            break

        pages.append(lm_page)
        current_url = lm_page.previous_url if dir == "left" else lm_page.next_url
        counter += 1

    return pages
|
|
||||||
|
|
||||||
@anoncritical
@_iorepeat(3, "saving learning module page")
async def _download_learning_module_page(
    self,
    parent_path: PurePath,
    element: IliasLearningModulePage,
    prev: Optional[str],
    next: Optional[str]
) -> None:
    """
    Write a single learning module page as an HTML file below parent_path.

    prev and next are the titles of the neighbouring pages (or None). They are
    turned into links relative to the directory of this page's transformed
    path. A neighbour whose path the transformer rejects gets no link.
    Nothing is written if the download is declined or if the page's own path
    is rejected by the transformer.
    """
    def page_file(title: str) -> PurePath:
        return parent_path / (_sanitize_path_name(title) + ".html")

    maybe_dl = await self.download(page_file(element.title))
    if not maybe_dl:
        return
    my_path = self._transformer.transform(maybe_dl.path)
    if not my_path:
        return

    def link_to(title: Optional[str]) -> Optional[str]:
        # An absent (or empty) title is handed on unchanged
        if not title:
            return title
        target = self._transformer.transform(page_file(title))
        if not target:
            return None
        return os.path.relpath(target, my_path.parent)

    prev_link = link_to(prev)
    next_link = link_to(next)

    async with maybe_dl as (bar, sink):
        content = await self.internalize_images(element.content)
        html = learning_module_template(content, maybe_dl.path.name, prev_link, next_link)
        sink.file.write(html.encode("utf-8"))
        sink.done()
|
|
||||||
|
|
||||||
async def internalize_images(self, tag: Tag) -> Tag:
    """
    Embeds ILIAS images into the given tag as base64 data URLs.

    Every img below tag whose src resolves to a URL starting with _ILIAS_URL
    is fetched and its src is replaced by the encoded image data. In addition,
    iframe sources starting with "//" are prefixed with "https:". The tag is
    modified in place and returned.
    """
    log.explain_topic("Internalizing images")
    for node in tag.find_all(recursive=True):
        if not isinstance(node, Tag):
            continue

        if node.name == "img":
            source = node.attrs.get("src", None)
            if not source:
                continue
            absolute_url = urljoin(_ILIAS_URL, source)
            # Only images served from below _ILIAS_URL are embedded
            if absolute_url.startswith(_ILIAS_URL):
                log.explain(f"Internalizing {absolute_url!r}")
                image = await self._get_authenticated(absolute_url)
                node.attrs["src"] = "data:;base64," + base64.b64encode(image).decode()
        elif node.name == "iframe":
            frame_source = node.attrs.get("src", "")
            if frame_source.startswith("//"):
                # For unknown reasons the protocol seems to be stripped.
                node.attrs["src"] = "https:" + frame_source

    return tag
|
|
||||||
|
|
||||||
async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup:
|
|
||||||
auth_id = await self._current_auth_id()
|
auth_id = await self._current_auth_id()
|
||||||
async with self.session.get(url) as request:
|
async with self.session.get(url) as request:
|
||||||
soup = soupify(await request.read())
|
soup = soupify(await request.read())
|
||||||
if self._is_logged_in(soup):
|
if self._is_logged_in(soup):
|
||||||
return self._verify_page(soup, url, root_page_allowed)
|
return soup
|
||||||
|
|
||||||
# We weren't authenticated, so try to do that
|
# We weren't authenticated, so try to do that
|
||||||
await self.authenticate(auth_id)
|
await self.authenticate(auth_id)
|
||||||
@ -887,26 +740,14 @@ instance's greatest bottleneck.
|
|||||||
async with self.session.get(url) as request:
|
async with self.session.get(url) as request:
|
||||||
soup = soupify(await request.read())
|
soup = soupify(await request.read())
|
||||||
if self._is_logged_in(soup):
|
if self._is_logged_in(soup):
|
||||||
return self._verify_page(soup, url, root_page_allowed)
|
|
||||||
raise CrawlError("get_page failed even after authenticating")
|
|
||||||
|
|
||||||
def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
    """
    Returns soup unchanged unless it is an unexpected ILIAS root page.

    Raises a CrawlError naming url if IliasPage.is_root_page(soup) holds while
    root_page_allowed is False.
    """
    is_root_page = IliasPage.is_root_page(soup)
    if root_page_allowed or not is_root_page:
        return soup

    explanation = (
        "Unexpectedly encountered ILIAS root page. "
        "This usually happens because the ILIAS instance is broken. "
        "If so, wait a day or two and try again. "
        "It could also happen because a crawled element links to the ILIAS root page. "
        "If so, use a transform with a ! as target to ignore the particular element. "
        f"The redirect came from {url}"
    )
    raise CrawlError(explanation)
|
return soup
|
||||||
|
raise CrawlError("get_page failed even after authenticating")
|
||||||
|
|
||||||
async def _post_authenticated(
|
async def _post_authenticated(
|
||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
data: dict[str, Union[str, List[str]]]
|
data: dict[str, Union[str, List[str]]]
|
||||||
) -> bytes:
|
) -> BeautifulSoup:
|
||||||
auth_id = await self._current_auth_id()
|
auth_id = await self._current_auth_id()
|
||||||
|
|
||||||
form_data = aiohttp.FormData()
|
form_data = aiohttp.FormData()
|
||||||
@ -926,22 +767,6 @@ instance's greatest bottleneck.
|
|||||||
return await request.read()
|
return await request.read()
|
||||||
raise CrawlError("post_authenticated failed even after authenticating")
|
raise CrawlError("post_authenticated failed even after authenticating")
|
||||||
|
|
||||||
async def _get_authenticated(self, url: str) -> bytes:
    """
    Fetches url without following redirects and returns the response body.

    Any status other than 200 on the first request is treated as "not logged
    in": authenticate() is called and the request is repeated exactly once.
    Raises a CrawlError if the second request does not answer with 200 either.
    """
    auth_id = await self._current_auth_id()

    async def fetch() -> Optional[bytes]:
        async with self.session.get(url, allow_redirects=False) as response:
            if response.status != 200:
                return None
            return await response.read()

    body = await fetch()
    if body is None:
        # We weren't authenticated, so try to do that
        await self.authenticate(auth_id)
        # Retry once after authenticating. If this fails, we will die.
        body = await fetch()

    if body is None:
        raise CrawlError("get_authenticated failed even after authenticating")
    return body
|
|
||||||
|
|
||||||
# We repeat this as the login method in shibboleth doesn't handle I/O errors.
|
# We repeat this as the login method in shibboleth doesn't handle I/O errors.
|
||||||
# Shibboleth is quite reliable as well, the repeat is likely not critical here.
|
# Shibboleth is quite reliable as well, the repeat is likely not critical here.
|
||||||
@ _iorepeat(3, "Login", failure_is_error=True)
|
@ _iorepeat(3, "Login", failure_is_error=True)
|
||||||
|
@ -2,7 +2,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
|
from typing import Awaitable, List, Optional, Pattern, Set, Union
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
@ -99,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
await self._stream_from_url(file.url, sink, bar)
|
await self._stream_from_url(file.url, sink, bar)
|
||||||
|
|
||||||
async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
|
async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
|
||||||
page, url = await self.get_page()
|
page = await self.get_page()
|
||||||
elements: List[Tag] = self._find_file_links(page)
|
elements: List[Tag] = self._find_file_links(page)
|
||||||
items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
|
items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
|
||||||
|
|
||||||
for element in elements:
|
for element in elements:
|
||||||
folder_label = self._find_folder_label(element)
|
folder_label = self._find_folder_label(element)
|
||||||
if folder_label:
|
if folder_label:
|
||||||
folder = self._extract_folder(folder_label, url)
|
folder = self._extract_folder(folder_label)
|
||||||
if folder not in items:
|
if folder not in items:
|
||||||
items.add(folder)
|
items.add(folder)
|
||||||
folder.explain()
|
folder.explain()
|
||||||
else:
|
else:
|
||||||
file = self._extract_file(element, url)
|
file = self._extract_file(element)
|
||||||
items.add(file)
|
items.add(file)
|
||||||
log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
|
log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
|
||||||
log.explain("Attributing it to root folder")
|
log.explain("Attributing it to root folder")
|
||||||
|
|
||||||
return items
|
return items
|
||||||
|
|
||||||
def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
|
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
|
||||||
files: List[KitIpdFile] = []
|
files: List[KitIpdFile] = []
|
||||||
name = folder_tag.getText().strip()
|
name = folder_tag.getText().strip()
|
||||||
|
|
||||||
container: Tag = folder_tag.findNextSibling(name="table")
|
container: Tag = folder_tag.findNextSibling(name="table")
|
||||||
for link in self._find_file_links(container):
|
for link in self._find_file_links(container):
|
||||||
files.append(self._extract_file(link, url))
|
files.append(self._extract_file(link))
|
||||||
|
|
||||||
return KitIpdFolder(name, files)
|
return KitIpdFolder(name, files)
|
||||||
|
|
||||||
@ -135,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
return None
|
return None
|
||||||
return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
|
return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
|
||||||
|
|
||||||
def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
|
def _extract_file(self, link: Tag) -> KitIpdFile:
|
||||||
url = self._abs_url_from_link(url, link)
|
url = self._abs_url_from_link(link)
|
||||||
name = os.path.basename(url)
|
name = os.path.basename(url)
|
||||||
return KitIpdFile(name, url)
|
return KitIpdFile(name, url)
|
||||||
|
|
||||||
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
||||||
return tag.findAll(name="a", attrs={"href": self._file_regex})
|
return tag.findAll(name="a", attrs={"href": self._file_regex})
|
||||||
|
|
||||||
def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
|
def _abs_url_from_link(self, link_tag: Tag) -> str:
|
||||||
return urljoin(url, link_tag.get("href"))
|
return urljoin(self._url, link_tag.get("href"))
|
||||||
|
|
||||||
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
|
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
|
||||||
async with self.session.get(url, allow_redirects=False) as resp:
|
async with self.session.get(url, allow_redirects=False) as resp:
|
||||||
@ -159,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
|
|
||||||
sink.done()
|
sink.done()
|
||||||
|
|
||||||
async def get_page(self) -> Tuple[BeautifulSoup, str]:
|
async def get_page(self) -> BeautifulSoup:
|
||||||
async with self.session.get(self._url) as request:
|
async with self.session.get(self._url) as request:
|
||||||
# The web page for Algorithmen für Routenplanung contains some
|
# The web page for Algorithmen für Routenplanung contains some
|
||||||
# weird comments that beautifulsoup doesn't parse correctly. This
|
# weird comments that beautifulsoup doesn't parse correctly. This
|
||||||
@ -167,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):
|
|||||||
# cause issues on other pages.
|
# cause issues on other pages.
|
||||||
content = (await request.read()).decode("utf-8")
|
content = (await request.read()).decode("utf-8")
|
||||||
content = re.sub(r"<!--.*?-->", "", content)
|
content = re.sub(r"<!--.*?-->", "", content)
|
||||||
return soupify(content.encode("utf-8")), str(request.url)
|
return soupify(content.encode("utf-8"))
|
||||||
|
@ -59,7 +59,6 @@ class Log:
|
|||||||
# Whether different parts of the output are enabled or disabled
|
# Whether different parts of the output are enabled or disabled
|
||||||
self.output_explain = False
|
self.output_explain = False
|
||||||
self.output_status = True
|
self.output_status = True
|
||||||
self.output_not_deleted = True
|
|
||||||
self.output_report = True
|
self.output_report = True
|
||||||
|
|
||||||
def _update_live(self) -> None:
|
def _update_live(self) -> None:
|
||||||
@ -208,17 +207,6 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
|
|||||||
action = escape(f"{action:<{self.STATUS_WIDTH}}")
|
action = escape(f"{action:<{self.STATUS_WIDTH}}")
|
||||||
self.print(f"{style}{action}[/] {escape(text)} {suffix}")
|
self.print(f"{style}{action}[/] {escape(text)} {suffix}")
|
||||||
|
|
||||||
def not_deleted(self, style: str, action: str, text: str, suffix: str = "") -> None:
    """
    Print a status line for a local-only file that was kept while crawling.

    Nothing is printed unless both output_status and output_not_deleted are
    enabled. The "style" argument may contain markup, which is applied to the
    "action" string. "action" is padded to STATUS_WIDTH; "action" and "text"
    are escaped, "suffix" is printed as given.
    """

    if not (self.output_status and self.output_not_deleted):
        return

    padded_action = escape(f"{action:<{self.STATUS_WIDTH}}")
    self.print(f"{style}{padded_action}[/] {escape(text)} {suffix}")
|
|
||||||
|
|
||||||
def report(self, text: str) -> None:
|
def report(self, text: str) -> None:
|
||||||
"""
|
"""
|
||||||
Print a report after crawling. Allows markup.
|
Print a report after crawling. Allows markup.
|
||||||
@ -227,14 +215,6 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
|
|||||||
if self.output_report:
|
if self.output_report:
|
||||||
self.print(text)
|
self.print(text)
|
||||||
|
|
||||||
def report_not_deleted(self, text: str) -> None:
    """
    Print a report line for a local-only file that was kept after crawling.
    Allows markup. Nothing is printed unless both output_report and
    output_not_deleted are enabled.
    """

    enabled = self.output_report and self.output_not_deleted
    if not enabled:
        return
    self.print(text)
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def _bar(
|
def _bar(
|
||||||
self,
|
self,
|
||||||
|
@ -44,7 +44,6 @@ class OnConflict(Enum):
|
|||||||
LOCAL_FIRST = "local-first"
|
LOCAL_FIRST = "local-first"
|
||||||
REMOTE_FIRST = "remote-first"
|
REMOTE_FIRST = "remote-first"
|
||||||
NO_DELETE = "no-delete"
|
NO_DELETE = "no-delete"
|
||||||
NO_DELETE_PROMPT_OVERWRITE = "no-delete-prompt-overwrite"
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_string(string: str) -> "OnConflict":
|
def from_string(string: str) -> "OnConflict":
|
||||||
@ -52,7 +51,7 @@ class OnConflict(Enum):
|
|||||||
return OnConflict(string)
|
return OnConflict(string)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise ValueError("must be one of 'prompt', 'local-first',"
|
raise ValueError("must be one of 'prompt', 'local-first',"
|
||||||
" 'remote-first', 'no-delete', 'no-delete-prompt-overwrite'")
|
" 'remote-first', 'no-delete'")
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -265,7 +264,7 @@ class OutputDirectory:
|
|||||||
on_conflict: OnConflict,
|
on_conflict: OnConflict,
|
||||||
path: PurePath,
|
path: PurePath,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}:
|
if on_conflict == OnConflict.PROMPT:
|
||||||
async with log.exclusive_output():
|
async with log.exclusive_output():
|
||||||
prompt = f"Replace {fmt_path(path)} with remote file?"
|
prompt = f"Replace {fmt_path(path)} with remote file?"
|
||||||
return await prompt_yes_no(prompt, default=False)
|
return await prompt_yes_no(prompt, default=False)
|
||||||
@ -284,7 +283,7 @@ class OutputDirectory:
|
|||||||
on_conflict: OnConflict,
|
on_conflict: OnConflict,
|
||||||
path: PurePath,
|
path: PurePath,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}:
|
if on_conflict == OnConflict.PROMPT:
|
||||||
async with log.exclusive_output():
|
async with log.exclusive_output():
|
||||||
prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?"
|
prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?"
|
||||||
return await prompt_yes_no(prompt, default=False)
|
return await prompt_yes_no(prompt, default=False)
|
||||||
@ -304,7 +303,7 @@ class OutputDirectory:
|
|||||||
path: PurePath,
|
path: PurePath,
|
||||||
parent: PurePath,
|
parent: PurePath,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}:
|
if on_conflict == OnConflict.PROMPT:
|
||||||
async with log.exclusive_output():
|
async with log.exclusive_output():
|
||||||
prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?"
|
prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?"
|
||||||
return await prompt_yes_no(prompt, default=False)
|
return await prompt_yes_no(prompt, default=False)
|
||||||
@ -331,7 +330,7 @@ class OutputDirectory:
|
|||||||
return False
|
return False
|
||||||
elif on_conflict == OnConflict.REMOTE_FIRST:
|
elif on_conflict == OnConflict.REMOTE_FIRST:
|
||||||
return True
|
return True
|
||||||
elif on_conflict in {OnConflict.NO_DELETE, OnConflict.NO_DELETE_PROMPT_OVERWRITE}:
|
elif on_conflict == OnConflict.NO_DELETE:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# This should never be reached
|
# This should never be reached
|
||||||
@ -496,7 +495,7 @@ class OutputDirectory:
|
|||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
log.not_deleted("[bold bright_magenta]", "Not deleted", fmt_path(pure))
|
log.status("[bold bright_magenta]", "Not deleted", fmt_path(pure))
|
||||||
self._report.not_delete_file(pure)
|
self._report.not_delete_file(pure)
|
||||||
|
|
||||||
def load_prev_report(self) -> None:
|
def load_prev_report(self) -> None:
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Optional, Set
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
from rich.markup import escape
|
from rich.markup import escape
|
||||||
|
|
||||||
@ -43,24 +43,16 @@ class Pferd:
|
|||||||
|
|
||||||
crawl_sections = [name for name, _ in config.crawl_sections()]
|
crawl_sections = [name for name, _ in config.crawl_sections()]
|
||||||
|
|
||||||
crawlers_to_run = set() # With crawl: prefix
|
crawlers_to_run = [] # With crawl: prefix
|
||||||
unknown_names = [] # Without crawl: prefix
|
unknown_names = [] # Without crawl: prefix
|
||||||
|
|
||||||
for name in cli_crawlers:
|
for name in cli_crawlers:
|
||||||
section_name = f"crawl:{name}"
|
section_name = f"crawl:{name}"
|
||||||
if section_name in crawl_sections:
|
if section_name in crawl_sections:
|
||||||
log.explain(f"Crawler section named {section_name!r} exists")
|
log.explain(f"Crawler section named {section_name!r} exists")
|
||||||
crawlers_to_run.add(section_name)
|
crawlers_to_run.append(section_name)
|
||||||
# interprete name as alias of a crawler
|
else:
|
||||||
alias_names = self._find_crawlers_by_alias(name, config)
|
log.explain(f"There's no crawler section named {section_name!r}")
|
||||||
if alias_names:
|
|
||||||
crawlers_to_run.update(alias_names)
|
|
||||||
log.explain_topic(f"Crawler alias {name!r} found corresponding crawler sections:")
|
|
||||||
for alias_name in alias_names:
|
|
||||||
log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
|
|
||||||
|
|
||||||
if not section_name in crawl_sections and not alias_names:
|
|
||||||
log.explain(f"There's neither a crawler section named {section_name!r} nor does a crawler with alias {name!r} exist.")
|
|
||||||
unknown_names.append(name)
|
unknown_names.append(name)
|
||||||
|
|
||||||
if unknown_names:
|
if unknown_names:
|
||||||
@ -73,14 +65,6 @@ class Pferd:
|
|||||||
|
|
||||||
return crawlers_to_run
|
return crawlers_to_run
|
||||||
|
|
||||||
def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
    """
    Return the names of all crawl sections that list the given alias.

    A section matches if alias is contained in the value of its "aliases"
    option. Sections without that option never match.
    """
    # NOTE(review): "in" is applied to whatever section.get("aliases") returns.
    # If that is a raw string rather than a list, this is a substring test and
    # an alias would also match parts of longer aliases - confirm against Config.
    return {
        section_name
        for section_name, section in config.crawl_sections()
        if alias in section.get("aliases", [])
    }
|
|
||||||
|
|
||||||
def _find_crawlers_to_run(
|
def _find_crawlers_to_run(
|
||||||
self,
|
self,
|
||||||
config: Config,
|
config: Config,
|
||||||
@ -196,7 +180,7 @@ class Pferd:
|
|||||||
log.report(f" [bold bright_magenta]Deleted[/] {fmt_path(path)}")
|
log.report(f" [bold bright_magenta]Deleted[/] {fmt_path(path)}")
|
||||||
for path in sorted(crawler.report.not_deleted_files):
|
for path in sorted(crawler.report.not_deleted_files):
|
||||||
something_changed = True
|
something_changed = True
|
||||||
log.report_not_deleted(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}")
|
log.report(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}")
|
||||||
|
|
||||||
for warning in crawler.report.encountered_warnings:
|
for warning in crawler.report.encountered_warnings:
|
||||||
something_changed = True
|
something_changed = True
|
||||||
|
53
PFERD/update.py
Normal file
53
PFERD/update.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
import ssl
|
||||||
|
from typing import Optional
|
||||||
|
import aiohttp
|
||||||
|
import certifi
|
||||||
|
|
||||||
|
from .version import NAME, VERSION
|
||||||
|
from .logging import log
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PferdUpdate:
    """
    Describes a release found on GitHub that differs from the running version.
    """

    # Address of the release page (the "html_url" of the GitHub release)
    release_url: str
    # Version of the release: its tag name without the leading "v"
    version: str
|
||||||
|
|
||||||
|
|
||||||
|
def _build_session() -> aiohttp.ClientSession:
    """
    Create the HTTP session used for the update check.

    The session identifies itself as "<NAME>/<VERSION>", verifies TLS against
    the certifi CA bundle and uses a total timeout of 15 minutes with 10
    seconds for each of connect, sock_connect and sock_read.
    """
    tls_context = ssl.create_default_context(cafile=certifi.where())
    timeout = aiohttp.ClientTimeout(
        total=15 * 60,
        connect=10,
        sock_connect=10,
        sock_read=10,
    )
    user_agent = f"{NAME}/{VERSION}"

    return aiohttp.ClientSession(
        headers={"User-Agent": user_agent},
        connector=aiohttp.TCPConnector(ssl=tls_context),
        timeout=timeout,
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def check_for_updates() -> None:
    """
    Look for a different release on GitHub and tell the user about it.

    Logs a warning with both version numbers and the release URL if
    get_newer_version() reports a release. Otherwise only an explain message
    is logged.
    """
    update = await get_newer_version()
    if not update:
        log.explain("No update found")
        return

    message = (
        f"{NAME} version out of date. "
        f"You are running version {VERSION!r} but {update.version!r} was found on GitHub."
    )
    log.warn(message)
    log.warn_contd(f"You can download it on GitHub: {update.release_url}")
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_version(version: str) -> Optional[tuple[int, ...]]:
    """
    Turn a dotted numeric version like "3.4.1" into a comparable tuple.
    Returns None if any component is not an integer.
    """
    try:
        return tuple(int(part) for part in version.split("."))
    except ValueError:
        return None


async def get_newer_version() -> Optional[PferdUpdate]:
    """
    Ask GitHub for the latest release and return it if it is newer than VERSION.

    Returns None if the latest release is not newer than the running version,
    if GitHub does not answer with status 200 (e.g. when rate limited) or if
    the answer lacks the expected "tag_name"/"html_url" fields. Connection
    errors raised by the HTTP session are not caught here.
    """
    async with _build_session() as session:
        async with session.get(
            "https://api.github.com/repos/Garmelon/Pferd/releases/latest",
            headers={"Accept": "application/vnd.github+json"}
        ) as response:
            if response.status != 200:
                # Error answers carry no release fields, so there is nothing to compare
                log.explain(f"Update check failed: GitHub answered with status {response.status}")
                return None
            release_information = await response.json()

    if not isinstance(release_information, dict):
        log.explain("Update check failed: unexpected answer from GitHub")
        return None

    tag_name = release_information.get("tag_name")
    release_url = release_information.get("html_url")
    if not isinstance(tag_name, str) or not isinstance(release_url, str):
        log.explain("Update check failed: release information is incomplete")
        return None

    tag_name = tag_name.removeprefix("v")

    local_version = _parse_version(VERSION)
    remote_version = _parse_version(tag_name)
    if local_version is not None and remote_version is not None:
        # An older release must not be reported as an update, e.g. when
        # running a version that has not been released yet.
        is_newer = remote_version > local_version
    else:
        # Versions that can not be ordered are only compared for equality
        is_newer = tag_name != VERSION

    if not is_newer:
        return None

    return PferdUpdate(release_url=release_url, version=tag_name)
|
@ -1,2 +1,2 @@
|
|||||||
NAME = "PFERD"
|
NAME = "PFERD"
|
||||||
VERSION = "3.4.3"
|
VERSION = "3.4.1"
|
||||||
|
@ -30,10 +30,7 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
|
|||||||
|
|
||||||
Unofficial packages are available for:
|
Unofficial packages are available for:
|
||||||
- [AUR](https://aur.archlinux.org/packages/pferd)
|
- [AUR](https://aur.archlinux.org/packages/pferd)
|
||||||
- [brew](https://formulae.brew.sh/formula/pferd)
|
|
||||||
- [conda-forge](https://github.com/conda-forge/pferd-feedstock)
|
|
||||||
- [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
|
- [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
|
||||||
- [PyPI](https://pypi.org/project/pferd)
|
|
||||||
|
|
||||||
See also PFERD's [repology page](https://repology.org/project/pferd/versions).
|
See also PFERD's [repology page](https://repology.org/project/pferd/versions).
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user