Mirror of https://github.com/Garmelon/PFERD.git, synced 2023-12-21 10:23:01 +01:00

Compare commits: update-che … v3.4.3 (12 commits)
- 6d44aac278
- 55a2de6b88
- c0d6d8b229
- 635caa765d
- e69b55b349
- 07200bbde5
- c020cccc64
- 259cfc20cc
- 37b51a66d8
- f47d2f11d8
- 1b6be6bd79
- e1430e6298
CHANGELOG.md
@@ -22,9 +22,30 @@ ambiguous situations.
 ## Unreleased
 
+## 3.4.3 - 2022-11-29
+
+### Added
+- Missing documentation for `forums` option
+
+### Changed
+- Clear up error message shown when multiple paths are found to an element
+
 ### Fixed
+- Forum crawling crashing when parsing empty (= 0 messages) threads
+- IPD crawler unnecessarily appending trailing slashes
+- Crawling opencast when ILIAS is set to English
+
+## 3.4.2 - 2022-10-26
+
+### Added
+- Recognize and crawl content pages in cards
+- Recognize and ignore surveys
+
+### Fixed
+- Forum crawling crashing when a thread has no messages at all
 - Forum crawling crashing when a forum has no threads at all
+- Ilias login failing in some cases
+- Crawling of paginated future meetings
+- IPD crawler handling of URLs without trailing slash
 
 ## 3.4.1 - 2022-08-17
@@ -181,6 +181,7 @@ script once per day should be fine.
   redirect to the actual URL. Set to a negative value to disable the automatic
   redirect. (Default: `-1`)
 - `videos`: Whether to download videos. (Default: `no`)
+- `forums`: Whether to download forum threads. (Default: `no`)
 - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
   `20.0`)
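For orientation, here is a minimal sketch of a crawler section that uses the options documented above, together with how such values could be read with the documented defaults. The section name `crawl:my-course`, the `type` value and the parsing code are illustrative assumptions, not PFERD's actual configuration handling.

```python
# Illustrative only: an INI-style section using the options documented above,
# parsed with Python's standard configparser. Defaults mirror the documented ones.
from configparser import ConfigParser

EXAMPLE_CONFIG = """
[crawl:my-course]
type = kit-ilias-web
videos = no
forums = yes
http_timeout = 20.0
"""

parser = ConfigParser()
parser.read_string(EXAMPLE_CONFIG)
section = parser["crawl:my-course"]

videos = section.getboolean("videos", fallback=False)            # documented default: no
forums = section.getboolean("forums", fallback=False)            # documented default: no
http_timeout = section.getfloat("http_timeout", fallback=20.0)   # documented default: 20.0
print(videos, forums, http_timeout)  # False True 20.0
```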
@@ -289,7 +290,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.
 Example: `foo/bar --> baz`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
 - Converts `foo/bar` into `baz`
-- Converts `foo/bar/wargl` into `bar/wargl`
+- Converts `foo/bar/wargl` into `baz/wargl`
 
 Example: `foo/bar --> !`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
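To make the rename-rule semantics above concrete, here is a minimal sketch of the described behaviour in plain Python. `apply_rename_rule` is a hypothetical helper for illustration, not PFERD's transform engine:

```python
from pathlib import PurePosixPath
from typing import Optional

def apply_rename_rule(path: str, source: str, target: str) -> Optional[str]:
    """'SOURCE --> TARGET': replace the SOURCE prefix of a matching path by TARGET."""
    p, src = PurePosixPath(path), PurePosixPath(source)
    if p == src or src in p.parents:
        return str(PurePosixPath(target, *p.parts[len(src.parts):]))
    return None  # rule does not apply

# The examples from the documentation above:
assert apply_rename_rule("foo/bar", "foo/bar", "baz") == "baz"
assert apply_rename_rule("foo/bar/wargl", "foo/bar", "baz") == "baz/wargl"
assert apply_rename_rule("foo/baz", "foo/bar", "baz") is None
assert apply_rename_rule("a/foo/bar", "foo/bar", "baz") is None
```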
@@ -24,6 +24,7 @@ class IliasElementType(Enum):
     LINK = "link"
     BOOKING = "booking"
     MEETING = "meeting"
+    SURVEY = "survey"
     VIDEO = "video"
     VIDEO_PLAYER = "video_player"
     VIDEO_FOLDER = "video_folder"
@@ -133,7 +134,7 @@ class IliasPage:
         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]
 
-        form_data: Dict[str, Union[str, List[ſtr]]] = {
+        form_data: Dict[str, Union[str, List[str]]] = {
             "thread_ids[]": thread_ids,
             "selected_cmd2": "html",
             "select_cmd2": "Ausführen",
@@ -365,7 +366,7 @@ class IliasPage:
         """
         # Video start links are marked with an "Abspielen" link
         video_links: List[Tag] = self._soup.findAll(
-            name="a", text=re.compile(r"\s*Abspielen\s*")
+            name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
         )
 
         results: List[IliasPageElement] = []
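The widened regex above is what makes opencast listings crawlable when ILIAS is set to English (see the 3.4.3 changelog entry). A small self-contained sketch with made-up HTML shows the effect; the markup and URLs are assumptions, not real ILIAS output:

```python
import re
from bs4 import BeautifulSoup

german = '<a href="/play/1">  Abspielen  </a>'   # listing rendered with the German locale
english = '<a href="/play/1">  Play  </a>'       # the same page with ILIAS set to English

old_pattern = re.compile(r"\s*Abspielen\s*")
new_pattern = re.compile(r"\s*(Abspielen|Play)\s*")

for html in (german, english):
    soup = BeautifulSoup(html, "html.parser")
    print(len(soup.find_all(name="a", string=old_pattern)),
          len(soup.find_all(name="a", string=new_pattern)))
# Prints "1 1" for the German page, but "0 1" for the English one.
```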
@@ -730,6 +731,10 @@ class IliasPage:
             return IliasElementType.TEST
         if "fold" in icon["class"]:
             return IliasElementType.FOLDER
+        if "copa" in icon["class"]:
+            return IliasElementType.FOLDER
+        if "svy" in icon["class"]:
+            return IliasElementType.SURVEY
 
         _unexpected_html_warning()
         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")
@@ -194,7 +194,7 @@ instance's greatest bottleneck.
         self._links = section.links()
         self._videos = section.videos()
         self._forums = section.forums()
-        self._visited_urls: Set[str] = set()
+        self._visited_urls: Dict[str, PurePath] = dict()
 
     async def _run(self) -> None:
         if isinstance(self._target, int):
@@ -348,9 +348,11 @@ instance's greatest bottleneck.
     ) -> Optional[Coroutine[Any, Any, None]]:
         if element.url in self._visited_urls:
             raise CrawlWarning(
-                f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
+                f"Found second path to element {element.name!r} at {element.url!r}. "
+                + f"First path: {fmt_path(self._visited_urls[element.url])}. "
+                + f"Second path: {fmt_path(parent_path)}."
             )
-        self._visited_urls.add(element.url)
+        self._visited_urls[element.url] = parent_path
 
         element_path = PurePath(parent_path, element.name)
 
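The two hunks above work together: storing the path under which a URL was first seen (a `Dict` instead of a `Set`) is what lets the warning name both paths, which is the "Clear up error message" entry in the 3.4.3 changelog. A stripped-down sketch of the idea; the function and variable names are illustrative, not PFERD's API:

```python
from pathlib import PurePath
from typing import Dict

visited_urls: Dict[str, PurePath] = {}

def remember_or_warn(url: str, parent_path: PurePath, name: str) -> None:
    # Remember the first path that reached a URL; complain with *both* paths later.
    if url in visited_urls:
        raise RuntimeWarning(
            f"Found second path to element {name!r} at {url!r}. "
            f"First path: {visited_urls[url]}. Second path: {parent_path}."
        )
    visited_urls[url] = parent_path

remember_or_warn("https://ilias.example/goto/123", PurePath("Course/Week 1"), "slides.pdf")
try:
    remember_or_warn("https://ilias.example/goto/123", PurePath("Course/Archive"), "slides.pdf")
except RuntimeWarning as warning:
    print(warning)  # names both the first and the second path
```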
@@ -377,9 +379,20 @@ instance's greatest bottleneck.
                 return None
             return await self._handle_forum(element, element_path)
         elif element.type == IliasElementType.TEST:
-            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
-            log.explain("Tests contain no relevant files")
-            log.explain("Answer: No")
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](tests contain no relevant data)"
+            )
+            return None
+        elif element.type == IliasElementType.SURVEY:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](surveys contain no relevant data)"
+            )
             return None
         elif element.type == IliasElementType.LINK:
             return await self._handle_link(element, element_path)
@@ -2,7 +2,7 @@ import os
 import re
 from dataclasses import dataclass
 from pathlib import PurePath
-from typing import Awaitable, List, Optional, Pattern, Set, Union
+from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Tag
@@ -99,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):
             await self._stream_from_url(file.url, sink, bar)
 
     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
-        page = await self.get_page()
+        page, url = await self.get_page()
         elements: List[Tag] = self._find_file_links(page)
         items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
 
         for element in elements:
            folder_label = self._find_folder_label(element)
            if folder_label:
-               folder = self._extract_folder(folder_label)
+               folder = self._extract_folder(folder_label, url)
                if folder not in items:
                    items.add(folder)
                    folder.explain()
            else:
-               file = self._extract_file(element)
+               file = self._extract_file(element, url)
                items.add(file)
                log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                log.explain("Attributing it to root folder")
 
         return items
 
-    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
+    def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
         files: List[KitIpdFile] = []
         name = folder_tag.getText().strip()
 
         container: Tag = folder_tag.findNextSibling(name="table")
         for link in self._find_file_links(container):
-            files.append(self._extract_file(link))
+            files.append(self._extract_file(link, url))
 
         return KitIpdFolder(name, files)
 
@@ -135,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
             return None
         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
 
-    def _extract_file(self, link: Tag) -> KitIpdFile:
-        url = self._abs_url_from_link(link)
+    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
+        url = self._abs_url_from_link(url, link)
         name = os.path.basename(url)
         return KitIpdFile(name, url)
 
     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
         return tag.findAll(name="a", attrs={"href": self._file_regex})
 
-    def _abs_url_from_link(self, link_tag: Tag) -> str:
-        return urljoin(self._url, link_tag.get("href"))
+    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
+        return urljoin(url, link_tag.get("href"))
 
     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
         async with self.session.get(url, allow_redirects=False) as resp:
@@ -159,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):
 
             sink.done()
 
-    async def get_page(self) -> BeautifulSoup:
+    async def get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
@@ -167,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):
             # cause issues on other pages.
             content = (await request.read()).decode("utf-8")
             content = re.sub(r"<!--.*?-->", "", content)
-            return soupify(content.encode("utf-8"))
+            return soupify(content.encode("utf-8")), str(request.url)
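The kit IPD changes above thread the URL the page was actually served from (taken from `request.url` after any redirect) into `_abs_url_from_link`, instead of always joining against the configured URL. A short sketch with hypothetical URLs shows why the trailing slash of the base matters for `urljoin`, which is what the trailing-slash fixes in the changelog are about:

```python
from urllib.parse import urljoin

configured = "https://example.org/ipd/lectures"    # as written in the config, no trailing slash
redirected = "https://example.org/ipd/lectures/"   # final URL after the server's redirect

print(urljoin(configured, "slides/ex01.pdf"))   # https://example.org/ipd/slides/ex01.pdf  (one level too high)
print(urljoin(redirected, "slides/ex01.pdf"))   # https://example.org/ipd/lectures/slides/ex01.pdf
```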
@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.4.1"
+VERSION = "3.4.3"
@@ -30,7 +30,10 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
 
 Unofficial packages are available for:
 - [AUR](https://aur.archlinux.org/packages/pferd)
+- [brew](https://formulae.brew.sh/formula/pferd)
+- [conda-forge](https://github.com/conda-forge/pferd-feedstock)
 - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
+- [PyPi](https://pypi.org/project/pferd)
 
 See also PFERD's [repology page](https://repology.org/project/pferd/versions).