Compare commits

...

12 Commits

7 changed files with 66 additions and 23 deletions

CHANGELOG.md

@@ -22,9 +22,30 @@ ambiguous situations.
 ## Unreleased
 
+## 3.4.3 - 2022-11-29
+
+### Added
+- Missing documentation for `forums` option
+
+### Changed
+- Clear up error message shown when multiple paths are found to an element
+
 ### Fixed
-- Forum crawling crashing when parsing empty (= 0 messages) threads
+- IPD crawler unnecessarily appending trailing slashes
+- Crawling opencast when ILIAS is set to English
+
+## 3.4.2 - 2022-10-26
+
+### Added
+- Recognize and crawl content pages in cards
+- Recognize and ignore surveys
+
+### Fixed
+- Forum crawling crashing when a thread has no messages at all
 - Forum crawling crashing when a forum has no threads at all
+- Ilias login failing in some cases
+- Crawling of paginated future meetings
+- IPD crawler handling of URLs without trailing slash
 
 ## 3.4.1 - 2022-08-17

CONFIG.md

@@ -181,6 +181,7 @@ script once per day should be fine.
   redirect to the actual URL. Set to a negative value to disable the automatic
   redirect. (Default: `-1`)
 - `videos`: Whether to download videos. (Default: `no`)
+- `forums`: Whether to download forum threads. (Default: `no`)
 - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
   `20.0`)
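
For orientation, the options above slot into a crawler section of the config file. A minimal sketch (the section name, `type`, and `target` values are illustrative assumptions; only `videos`, `forums`, and `http_timeout` with their defaults come from the documented list):

    [crawl:my-course]
    type = kit-ilias-web
    target = 1234567
    videos = no
    forums = yes
    http_timeout = 20.0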
@@ -289,7 +290,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.
 
 Example: `foo/bar --> baz`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
 - Converts `foo/bar` into `baz`
-- Converts `foo/bar/wargl` into `bar/wargl`
+- Converts `foo/bar/wargl` into `baz/wargl`
 
 Example: `foo/bar --> !`
 - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
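
As the corrected line shows, `TARGET` replaces the whole matched `SOURCE` prefix and the remainder of the path is kept. A minimal Python sketch of that prefix-rename semantics (an illustration of the documented rules, not PFERD's actual transformer; the `!` delete-target is omitted):

    from pathlib import PurePath
    from typing import Optional

    def apply_rename(path: PurePath, source: PurePath, target: PurePath) -> Optional[PurePath]:
        # The rule only matches if `source` is a whole-component prefix of `path`.
        n = len(source.parts)
        if path.parts[:n] != source.parts:
            return None  # rule does not apply
        # Replace the matched prefix with `target` and keep the rest of the path.
        return PurePath(target, *path.parts[n:])

    print(apply_rename(PurePath("foo/bar"), PurePath("foo/bar"), PurePath("baz")))        # baz
    print(apply_rename(PurePath("foo/bar/wargl"), PurePath("foo/bar"), PurePath("baz")))  # baz/wargl
    print(apply_rename(PurePath("foo/baz"), PurePath("foo/bar"), PurePath("baz")))        # None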

PFERD/crawl/ilias/kit_ilias_html.py

@@ -24,6 +24,7 @@ class IliasElementType(Enum):
     LINK = "link"
     BOOKING = "booking"
     MEETING = "meeting"
+    SURVEY = "survey"
     VIDEO = "video"
     VIDEO_PLAYER = "video_player"
     VIDEO_FOLDER = "video_folder"
@@ -133,7 +134,7 @@ class IliasPage:
         thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]
 
-        form_data: Dict[str, Union[str, List[ſtr]]] = {
+        form_data: Dict[str, Union[str, List[str]]] = {
             "thread_ids[]": thread_ids,
             "selected_cmd2": "html",
             "select_cmd2": "Ausführen",
@@ -365,7 +366,7 @@ class IliasPage:
         """
         # Video start links are marked with an "Abspielen" link
         video_links: List[Tag] = self._soup.findAll(
-            name="a", text=re.compile(r"\s*Abspielen\s*")
+            name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
         )
 
         results: List[IliasPageElement] = []
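
This widened pattern is the fix for crawling opencast when ILIAS is set to English: BeautifulSoup runs the regex against each link's text, so accepting both the German and the English player label is enough. A quick self-contained check (hypothetical markup, not a real ILIAS page):

    import re

    from bs4 import BeautifulSoup

    html = '<a href="#">Abspielen</a> <a href="#">Play</a> <a href="#">Download</a>'
    soup = BeautifulSoup(html, "html.parser")

    # bs4 matches the pattern against each tag's string, so both labels are found.
    links = soup.findAll(name="a", text=re.compile(r"\s*(Abspielen|Play)\s*"))
    print([link.getText() for link in links])  # ['Abspielen', 'Play']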
@@ -730,6 +731,10 @@ class IliasPage:
             return IliasElementType.TEST
         if "fold" in icon["class"]:
             return IliasElementType.FOLDER
+        if "copa" in icon["class"]:
+            return IliasElementType.FOLDER
+        if "svy" in icon["class"]:
+            return IliasElementType.SURVEY
 
         _unexpected_html_warning()
         log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")

PFERD/crawl/ilias/kit_ilias_web_crawler.py

@@ -194,7 +194,7 @@ instance's greatest bottleneck.
         self._links = section.links()
         self._videos = section.videos()
         self._forums = section.forums()
-        self._visited_urls: Set[str] = set()
+        self._visited_urls: Dict[str, PurePath] = dict()
 
     async def _run(self) -> None:
         if isinstance(self._target, int):
@@ -348,9 +348,11 @@
     ) -> Optional[Coroutine[Any, Any, None]]:
         if element.url in self._visited_urls:
             raise CrawlWarning(
-                f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
+                f"Found second path to element {element.name!r} at {element.url!r}. "
+                + f"First path: {fmt_path(self._visited_urls[element.url])}. "
+                + f"Second path: {fmt_path(parent_path)}."
             )
-        self._visited_urls.add(element.url)
+        self._visited_urls[element.url] = parent_path
 
         element_path = PurePath(parent_path, element.name)
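
The switch from `Set[str]` to `Dict[str, PurePath]` is what enables the clearer message: the crawler now remembers where each URL was first seen. A stripped-down sketch of the idea (simplified names; `RuntimeError` stands in for PFERD's `CrawlWarning`):

    from pathlib import PurePath
    from typing import Dict

    visited_urls: Dict[str, PurePath] = {}

    def check_and_mark(url: str, parent_path: PurePath) -> None:
        if url in visited_urls:
            # Both the first and the second path are available for the message.
            raise RuntimeError(
                f"Found second path to element at {url!r}. "
                f"First path: {visited_urls[url]}. Second path: {parent_path}."
            )
        visited_urls[url] = parent_path

    check_and_mark("https://ilias.example/goto.php?target=file_123", PurePath("Course/Week 1"))
    check_and_mark("https://ilias.example/goto.php?target=file_123", PurePath("Course/Week 2"))  # raises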
@@ -377,9 +379,20 @@
                 return None
             return await self._handle_forum(element, element_path)
         elif element.type == IliasElementType.TEST:
-            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
-            log.explain("Tests contain no relevant files")
-            log.explain("Answer: No")
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](tests contain no relevant data)"
+            )
+            return None
+        elif element.type == IliasElementType.SURVEY:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](surveys contain no relevant data)"
+            )
             return None
         elif element.type == IliasElementType.LINK:
             return await self._handle_link(element, element_path)

PFERD/crawl/kit_ipd_crawler.py

@@ -2,7 +2,7 @@ import os
 import re
 from dataclasses import dataclass
 from pathlib import PurePath
-from typing import Awaitable, List, Optional, Pattern, Set, Union
+from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Tag
@@ -99,32 +99,32 @@ class KitIpdCrawler(HttpCrawler):
                     await self._stream_from_url(file.url, sink, bar)
 
     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
-        page = await self.get_page()
+        page, url = await self.get_page()
         elements: List[Tag] = self._find_file_links(page)
         items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
 
         for element in elements:
             folder_label = self._find_folder_label(element)
             if folder_label:
-                folder = self._extract_folder(folder_label)
+                folder = self._extract_folder(folder_label, url)
                 if folder not in items:
                     items.add(folder)
                     folder.explain()
             else:
-                file = self._extract_file(element)
+                file = self._extract_file(element, url)
                 items.add(file)
                 log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
                 log.explain("Attributing it to root folder")
 
         return items
 
-    def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
+    def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
         files: List[KitIpdFile] = []
         name = folder_tag.getText().strip()
 
         container: Tag = folder_tag.findNextSibling(name="table")
         for link in self._find_file_links(container):
-            files.append(self._extract_file(link))
+            files.append(self._extract_file(link, url))
 
         return KitIpdFolder(name, files)
@@ -135,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
             return None
         return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
 
-    def _extract_file(self, link: Tag) -> KitIpdFile:
-        url = self._abs_url_from_link(link)
+    def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
+        url = self._abs_url_from_link(url, link)
         name = os.path.basename(url)
         return KitIpdFile(name, url)
 
     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
         return tag.findAll(name="a", attrs={"href": self._file_regex})
 
-    def _abs_url_from_link(self, link_tag: Tag) -> str:
-        return urljoin(self._url, link_tag.get("href"))
+    def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
+        return urljoin(url, link_tag.get("href"))
 
     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
         async with self.session.get(url, allow_redirects=False) as resp:
@@ -159,7 +159,7 @@ class KitIpdCrawler(HttpCrawler):
             sink.done()
 
-    async def get_page(self) -> BeautifulSoup:
+    async def get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
@@ -167,4 +167,4 @@ class KitIpdCrawler(HttpCrawler):
             # cause issues on other pages.
             content = (await request.read()).decode("utf-8")
             content = re.sub(r"<!--.*?-->", "", content)
-            return soupify(content.encode("utf-8"))
+            return soupify(content.encode("utf-8")), str(request.url)
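
Returning `str(request.url)` matters because aiohttp follows redirects by default, so this is the final URL including any trailing slash, and `urljoin` treats that slash as significant when resolving relative hrefs. A small demonstration of the behavior the change relies on (example URLs are made up):

    from urllib.parse import urljoin

    # Without a trailing slash, the last segment is treated as a file and replaced:
    print(urljoin("https://example.com/courses/algo", "slides.pdf"))
    # https://example.com/courses/slides.pdf

    # With the trailing slash of the post-redirect URL, the directory is kept:
    print(urljoin("https://example.com/courses/algo/", "slides.pdf"))
    # https://example.com/courses/algo/slides.pdf

Resolving links against the redirect target rather than the configured URL is why this change accompanies the trailing-slash fixes listed in the changelog.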

PFERD/version.py

@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.4.1"
+VERSION = "3.4.3"

README.md

@@ -30,7 +30,10 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
 Unofficial packages are available for:
 - [AUR](https://aur.archlinux.org/packages/pferd)
+- [brew](https://formulae.brew.sh/formula/pferd)
+- [conda-forge](https://github.com/conda-forge/pferd-feedstock)
 - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
+- [PyPi](https://pypi.org/project/pferd)
 
 See also PFERD's [repology page](https://repology.org/project/pferd/versions).