Compare commits

...

12 Commits

Author           SHA1        Message                                                                      Date
Joscha           6d44aac278  Bump version to 3.4.3                                                        2022-11-29 18:22:19 +01:00
c0derMo          55a2de6b88  Fix crawling English opencast                                                2022-11-29 18:13:56 +01:00
Joscha           c0d6d8b229  Use url after redirect for relative links                                    2022-11-21 18:10:45 +01:00
Joscha           635caa765d  Fix typo (Thanks, burg113)                                                   2022-11-15 17:17:57 +01:00
Pavel Zwerschke  e69b55b349  Add more unofficial package managers (#66)                                   2022-11-04 12:18:26 +01:00
Joscha           07200bbde5  Document ilias web crawler's forums option                                   2022-10-31 14:12:27 +01:00
I-Al-Istannen    c020cccc64  Include found paths in "second path found" warning                           2022-10-29 14:08:29 +02:00
Joscha           259cfc20cc  Bump version to 3.4.2                                                        2022-10-26 18:26:17 +02:00
Joscha           37b51a66d8  Update changelog                                                             2022-10-26 18:22:37 +02:00
I-Al-Istannen    f47d2f11d8  Append trailing slash to kit-ipd links to ensure urljoin works as expected  2022-10-25 20:28:22 +02:00
I-Al-Istannen    1b6be6bd79  Handle content pages in cards                                                2022-10-24 18:37:26 +02:00
I-Al-Istannen    e1430e6298  Handle (and ignore) surveys                                                  2022-10-24 18:37:26 +02:00
7 changed files with 66 additions and 23 deletions

CHANGELOG.md

@@ -22,9 +22,30 @@ ambiguous situations.
 ## Unreleased
 
+## 3.4.3 - 2022-11-29
+### Added
+- Missing documentation for `forums` option
+### Changed
+- Clear up error message shown when multiple paths are found to an element
 ### Fixed
-- Forum crawling crashing when parsing empty (= 0 messages) threads
+- IPD crawler unnecessarily appending trailing slashes
+- Crawling opencast when ILIAS is set to English
+
+## 3.4.2 - 2022-10-26
+### Added
+- Recognize and crawl content pages in cards
+- Recognize and ignore surveys
+### Fixed
+- Forum crawling crashing when a thread has no messages at all
 - Forum crawling crashing when a forum has no threads at all
+- Ilias login failing in some cases
+- Crawling of paginated future meetings
+- IPD crawler handling of URLs without trailing slash
 
 ## 3.4.1 - 2022-08-17

CONFIG.md

@@ -181,6 +181,7 @@ script once per day should be fine.
   redirect to the actual URL. Set to a negative value to disable the automatic
   redirect. (Default: `-1`)
 - `videos`: Whether to download videos. (Default: `no`)
+- `forums`: Whether to download forum threads. (Default: `no`)
 - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
   `20.0`)
@@ -289,7 +290,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.
 Example: `foo/bar --> baz`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
 - Converts `foo/bar` into `baz`
-- Converts `foo/bar/wargl` into `bar/wargl`
+- Converts `foo/bar/wargl` into `baz/wargl`
 
 Example: `foo/bar --> !`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
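The corrected line documents a pure prefix rewrite: `SOURCE` must match whole path components from the start of the path, and everything after the matched prefix is re-rooted under `TARGET`. A minimal sketch of that matching logic (a hypothetical `apply_rule` helper, not PFERD's actual transformer):

```python
from pathlib import PurePosixPath
from typing import Optional


def apply_rule(source: str, target: str, path: str) -> Optional[str]:
    """Rewrite path if source matches it exactly or as a leading directory prefix."""
    src_parts = PurePosixPath(source).parts
    parts = PurePosixPath(path).parts
    # The rule only matches whole components at the start of the path,
    # which is why `a/foo/bar` and `foo/baz` are left alone.
    if parts[:len(src_parts)] != src_parts:
        return None
    return str(PurePosixPath(target, *parts[len(src_parts):]))


assert apply_rule("foo/bar", "baz", "foo/bar") == "baz"
assert apply_rule("foo/bar", "baz", "foo/bar/wargl") == "baz/wargl"
assert apply_rule("foo/bar", "baz", "a/foo/bar") is None
```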

PFERD/crawl/ilias/kit_ilias_html.py

@@ -24,6 +24,7 @@ class IliasElementType(Enum):
     LINK = "link"
     BOOKING = "booking"
     MEETING = "meeting"
+    SURVEY = "survey"
     VIDEO = "video"
     VIDEO_PLAYER = "video_player"
     VIDEO_FOLDER = "video_folder"
@@ -133,7 +134,7 @@ class IliasPage:
         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]
 
-        form_data: Dict[str, Union[str, List[ſtr]]] = {
+        form_data: Dict[str, Union[str, List[str]]] = {
             "thread_ids[]": thread_ids,
             "selected_cmd2": "html",
             "select_cmd2": "Ausführen",
@@ -365,7 +366,7 @@ class IliasPage:
         """
         # Video start links are marked with an "Abspielen" link
         video_links: List[Tag] = self._soup.findAll(
-            name="a", text=re.compile(r"\s*Abspielen\s*")
+            name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
         )
 
         results: List[IliasPageElement] = []
@@ -730,6 +731,10 @@ class IliasPage:
             return IliasElementType.TEST
         if "fold" in icon["class"]:
             return IliasElementType.FOLDER
+        if "copa" in icon["class"]:
+            return IliasElementType.FOLDER
+        if "svy" in icon["class"]:
+            return IliasElementType.SURVEY
 
         _unexpected_html_warning()
         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")

PFERD/crawl/ilias/kit_ilias_web_crawler.py

@@ -194,7 +194,7 @@ instance's greatest bottleneck.
         self._links = section.links()
         self._videos = section.videos()
         self._forums = section.forums()
-        self._visited_urls: Set[str] = set()
+        self._visited_urls: Dict[str, PurePath] = dict()
 
     async def _run(self) -> None:
         if isinstance(self._target, int):
@@ -348,9 +348,11 @@ instance's greatest bottleneck.
     ) -> Optional[Coroutine[Any, Any, None]]:
         if element.url in self._visited_urls:
             raise CrawlWarning(
-                f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
+                f"Found second path to element {element.name!r} at {element.url!r}. "
+                + f"First path: {fmt_path(self._visited_urls[element.url])}. "
+                + f"Second path: {fmt_path(parent_path)}."
             )
-        self._visited_urls.add(element.url)
+        self._visited_urls[element.url] = parent_path
 
         element_path = PurePath(parent_path, element.name)
@@ -377,9 +379,20 @@ instance's greatest bottleneck.
                 return None
             return await self._handle_forum(element, element_path)
         elif element.type == IliasElementType.TEST:
-            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
-            log.explain("Tests contain no relevant files")
-            log.explain("Answer: No")
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](tests contain no relevant data)"
+            )
+            return None
+        elif element.type == IliasElementType.SURVEY:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](surveys contain no relevant data)"
+            )
             return None
         elif element.type == IliasElementType.LINK:
             return await self._handle_link(element, element_path)
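The `Set[str]` to `Dict[str, PurePath]` switch is what enables the sharper warning: the crawler now remembers *where* it first saw each URL rather than merely that it saw it. A simplified sketch of the pattern with a made-up URL, `RuntimeError` standing in for PFERD's `CrawlWarning`:

```python
from pathlib import PurePath
from typing import Dict

visited_urls: Dict[str, PurePath] = {}


def mark_visited(url: str, parent_path: PurePath) -> None:
    # Storing the first path per URL lets the duplicate warning report
    # both the original location and the newly found one.
    if url in visited_urls:
        raise RuntimeError(
            f"Found second path to element at {url!r}. "
            f"First path: {visited_urls[url]}. Second path: {parent_path}."
        )
    visited_urls[url] = parent_path


mark_visited("https://example.com/file_42", PurePath("Course/Slides"))
mark_visited("https://example.com/file_42", PurePath("Course/Misc"))  # raises
```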

PFERD/crawl/kit_ipd_crawler.py

@@ -2,7 +2,7 @@ import os
 import re
 from dataclasses import dataclass
 from pathlib import PurePath
-from typing import Awaitable, List, Optional, Pattern, Set, Union
+from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Tag
@@ -99,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):
             await self._stream_from_url(file.url, sink, bar)
 
     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
-        page = await self.get_page()
+        page, url = await self.get_page()
         elements: List[Tag] = self._find_file_links(page)
         items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
 
         for element in elements:
             folder_label = self._find_folder_label(element)
             if folder_label:
-                folder = self._extract_folder(folder_label)
+                folder = self._extract_folder(folder_label, url)
                 if folder not in items:
                     items.add(folder)
                     folder.explain()
             else:
-                file = self._extract_file(element)
+                file = self._extract_file(element, url)
                 items.add(file)
                 log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                 log.explain("Attributing it to root folder")
 
         return items
 
-    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
+    def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
         files: List[KitIpdFile] = []
         name = folder_tag.getText().strip()
 
         container: Tag = folder_tag.findNextSibling(name="table")
         for link in self._find_file_links(container):
-            files.append(self._extract_file(link))
+            files.append(self._extract_file(link, url))
 
         return KitIpdFolder(name, files)
@@ -135,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
             return None
         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
 
-    def _extract_file(self, link: Tag) -> KitIpdFile:
-        url = self._abs_url_from_link(link)
+    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
+        url = self._abs_url_from_link(url, link)
         name = os.path.basename(url)
         return KitIpdFile(name, url)
 
     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
         return tag.findAll(name="a", attrs={"href": self._file_regex})
 
-    def _abs_url_from_link(self, link_tag: Tag) -> str:
-        return urljoin(self._url, link_tag.get("href"))
+    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
+        return urljoin(url, link_tag.get("href"))
@@ -159,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):
             sink.done()
 
-    async def get_page(self) -> BeautifulSoup:
+    async def get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
@@ -167,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):
             # cause issues on other pages.
             content = (await request.read()).decode("utf-8")
             content = re.sub(r"<!--.*?-->", "", content)
-            return soupify(content.encode("utf-8"))
+            return soupify(content.encode("utf-8")), str(request.url)
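Threading the `url` parameter through exists because `urljoin` resolves a relative href against whatever base it is given: a base without a trailing slash drops its last segment, and a base that later redirects resolves links against the wrong page entirely. Returning `str(request.url)` hands every caller the final, post-redirect URL. A small demonstration with made-up URLs:

```python
from urllib.parse import urljoin

# Without a trailing slash, the last path segment is treated as a file
# name and replaced by the relative href:
print(urljoin("https://example.com/course", "slides.pdf"))
# https://example.com/slides.pdf

# Joining against the final, slash-terminated URL keeps the directory:
print(urljoin("https://example.com/course/", "slides.pdf"))
# https://example.com/course/slides.pdf
```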

PFERD/version.py

@@ -1,2 +1,2 @@
NAME = "PFERD" NAME = "PFERD"
VERSION = "3.4.1" VERSION = "3.4.3"

README.md

@@ -30,7 +30,10 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
 Unofficial packages are available for:
 - [AUR](https://aur.archlinux.org/packages/pferd)
+- [brew](https://formulae.brew.sh/formula/pferd)
+- [conda-forge](https://github.com/conda-forge/pferd-feedstock)
 - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
+- [PyPi](https://pypi.org/project/pferd)
 
 See also PFERD's [repology page](https://repology.org/project/pferd/versions).