Compare commits

..

8 Commits

8 changed files with 105 additions and 24 deletions

1
.gitignore vendored
View File

@ -3,6 +3,7 @@
/PFERD.egg-info/
__pycache__/
/.vscode/
/.idea/
# pyinstaller
/pferd.spec

View File

@ -22,20 +22,28 @@ ambiguous situations.
## Unreleased
## 3.5.0 - 2023-09-13
### Added
- `no-delete-prompt-override` conflict resolution strategy
- Support for ILIAS learning modules
- `show_not_deleted` option to stop printing the "Not Deleted" status or report
message. This combines nicely with the `no-delete-prompt-override` strategy,
causing PFERD to mostly ignore local-only files.
- Support for mediacast video listings
- Crawling of files in info tab
### Changed
- Remove size suffix for files in content pages
### Fixed
- Crawling of courses with the timeline view as the default tab
- Crawling of file and custom opencast cards
- Crawling of button cards without descriptions
- Abort crawling when encountering an unexpected ilias root page redirect
- Remove size suffix for files in content pages
### Added
- `no-delete-prompt-override` conflict resolution strategy
- support for ILIAS learning modules
- `show_not_deleted` option to stop printing the "Not Deleted" status or report
message. This combines nicely with the `no-delete-prompt-override` strategy,
causing PFERD to mostly ignore local-only files.
- support for mediacast video listings
- Sanitize ascii control characters on Windows
- Crawling of paginated past meetings
- Ignore SCORM learning modules
## 3.4.3 - 2022-11-29

View File

@ -22,10 +22,12 @@ class IliasElementType(Enum):
FOLDER = "folder"
FORUM = "forum"
LINK = "link"
INFO_TAB = "info_tab"
LEARNING_MODULE = "learning_module"
BOOKING = "booking"
MEETING = "meeting"
SURVEY = "survey"
SCORM_LEARNING_MODULE = "scorm_learning_module"
MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
MEDIACAST_VIDEO = "mediacast_video"
OPENCAST_VIDEO = "opencast_video"
@ -120,9 +122,25 @@ class IliasPage:
if self._is_content_page():
log.explain("Page is a content page, searching for elements")
return self._find_copa_entries()
if self._is_info_tab():
log.explain("Page is info tab, searching for elements")
return self._find_info_tab_entries()
log.explain("Page is a normal folder, searching for elements")
return self._find_normal_entries()
def get_info_tab(self) -> Optional[IliasPageElement]:
tab: Optional[Tag] = self._soup.find(
name="a",
attrs={"href": lambda x: x and "cmdClass=ilinfoscreengui" in x}
)
if tab is not None:
return IliasPageElement(
IliasElementType.INFO_TAB,
self._abs_url_from_link(tab),
"infos"
)
return None
def get_description(self) -> Optional[BeautifulSoup]:
def is_interesting_class(name: str) -> bool:
return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
@ -209,7 +227,11 @@ class IliasPage:
log.explain("Requesting *all* future meetings")
return self._uncollapse_future_meetings_url()
if not self._is_content_tab_selected():
return self._select_content_page_url()
if self._page_type != IliasElementType.INFO_TAB:
log.explain("Selecting content tab")
return self._select_content_page_url()
else:
log.explain("Crawling info tab, skipping content select")
return None
def _is_forum_page(self) -> bool:
@ -272,7 +294,10 @@ class IliasPage:
return self._uncollapse_future_meetings_url() is not None
def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]:
element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x})
element = self._soup.find(
"a",
attrs={"href": lambda x: x and ("crs_next_sess=1" in x or "crs_prev_sess=1" in x)}
)
if not element:
return None
link = self._abs_url_from_link(element)
@ -281,6 +306,10 @@ class IliasPage:
def _is_content_tab_selected(self) -> bool:
return self._select_content_page_url() is None
def _is_info_tab(self) -> bool:
might_be_info = self._soup.find("form", attrs={"name": lambda x: x == "formInfoScreen"}) is not None
return self._page_type == IliasElementType.INFO_TAB and might_be_info
def _select_content_page_url(self) -> Optional[IliasPageElement]:
tab = self._soup.find(
id="tab_view_content",
@ -389,6 +418,23 @@ class IliasPage:
return items
def _find_info_tab_entries(self) -> List[IliasPageElement]:
items = []
links: List[Tag] = self._soup.select("a.il_ContainerItemCommand")
for link in links:
if "cmdClass=ilobjcoursegui" not in link["href"]:
continue
if "cmd=sendfile" not in link["href"]:
continue
items.append(IliasPageElement(
IliasElementType.FILE,
self._abs_url_from_link(link),
_sanitize_path_name(link.getText())
))
return items
def _find_opencast_video_entries(self) -> List[IliasPageElement]:
# ILIAS has three stages for video pages
# 1. The initial dummy page without any videos. This page contains the link to the listing
@ -468,8 +514,11 @@ class IliasPage:
f"td.std:nth-child({index})"
).getText().strip()
if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
log.explain(f"Converting {modification_string!r}")
modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
break
else:
log.explain(f"Date has wrong format: {modification_string!r}")
if modification_time is None:
log.warn(f"Could not determine upload time for {link}")
@ -908,6 +957,9 @@ class IliasPage:
if "baseClass=ilMediaCastHandlerGUI" in parsed_url.query:
return IliasElementType.MEDIACAST_VIDEO_FOLDER
if "baseClass=ilSAHSPresentationGUI" in parsed_url.query:
return IliasElementType.SCORM_LEARNING_MODULE
# Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
# try to guess it from the image.
@ -949,7 +1001,11 @@ class IliasPage:
if img_tag is None:
img_tag = found_parent.select_one("img.icon")
if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}):
is_session_expansion_button = found_parent.find(
"a",
attrs={"href": lambda x: x and ("crs_next_sess=" in x or "crs_prev_sess=" in x)}
)
if img_tag is None and is_session_expansion_button:
log.explain("Found session expansion button, skipping it as it has no content")
return None
@ -982,6 +1038,9 @@ class IliasPage:
if str(img_tag["src"]).endswith("icon_mcst.svg"):
return IliasElementType.MEDIACAST_VIDEO_FOLDER
if str(img_tag["src"]).endswith("icon_sahs.svg"):
return IliasElementType.SCORM_LEARNING_MODULE
return IliasElementType.FOLDER
@staticmethod

View File

@ -85,6 +85,7 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([
IliasElementType.EXERCISE,
IliasElementType.EXERCISE_FILES,
IliasElementType.FOLDER,
IliasElementType.INFO_TAB,
IliasElementType.MEETING,
IliasElementType.MEDIACAST_VIDEO_FOLDER,
IliasElementType.OPENCAST_VIDEO_FOLDER,
@ -262,6 +263,8 @@ instance's greatest bottleneck.
next_stage_url = None
elements.extend(page.get_child_elements())
if info_tab := page.get_info_tab():
elements.append(info_tab)
if description_string := page.get_description():
description.append(description_string)
@ -400,6 +403,14 @@ instance's greatest bottleneck.
"[bright_black](surveys contain no relevant data)"
)
return None
elif element.type == IliasElementType.SCORM_LEARNING_MODULE:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](scorm learning modules are not supported)"
)
return None
elif element.type == IliasElementType.LEARNING_MODULE:
return await self._handle_learning_module(element, element_path)
elif element.type == IliasElementType.LINK:
@ -705,7 +716,7 @@ instance's greatest bottleneck.
log.explain(f"URL: {next_stage_url}")
soup = await self._get_page(next_stage_url)
page = IliasPage(soup, next_stage_url, None)
page = IliasPage(soup, next_stage_url, element)
if next := page.get_next_stage_element():
next_stage_url = next.url
@ -768,14 +779,14 @@ instance's greatest bottleneck.
log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {element.url}")
soup = await self._get_page(element.url)
page = IliasPage(soup, element.url, None)
page = IliasPage(soup, element.url, element)
if next := page.get_learning_module_data():
elements.extend(await self._crawl_learning_module_direction(
cl.path, next.previous_url, "left"
cl.path, next.previous_url, "left", element
))
elements.append(next)
elements.extend(await self._crawl_learning_module_direction(
cl.path, next.next_url, "right"
cl.path, next.next_url, "right", element
))
# Reflect their natural ordering in the file names
@ -797,7 +808,8 @@ instance's greatest bottleneck.
self,
path: PurePath,
start_url: Optional[str],
dir: Union[Literal["left"], Literal["right"]]
dir: Union[Literal["left"], Literal["right"]],
parent_element: IliasPageElement
) -> List[IliasLearningModulePage]:
elements: List[IliasLearningModulePage] = []
@ -810,7 +822,7 @@ instance's greatest bottleneck.
log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
log.explain(f"URL: {next_element_url}")
soup = await self._get_page(next_element_url)
page = IliasPage(soup, next_element_url, None)
page = IliasPage(soup, next_element_url, parent_element)
if next := page.get_learning_module_data():
elements.append(next)
if dir == "left":
@ -893,7 +905,7 @@ instance's greatest bottleneck.
soup = soupify(await request.read())
if self._is_logged_in(soup):
return self._verify_page(soup, url, root_page_allowed)
raise CrawlError("get_page failed even after authenticating")
raise CrawlError(f"get_page failed even after authenticating on {url!r}")
def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
if IliasPage.is_root_page(soup) and not root_page_allowed:

View File

@ -14,7 +14,7 @@ def name_variants(path: PurePath) -> Iterator[PurePath]:
class Deduplicator:
FORBIDDEN_CHARS = '<>:"/\\|?*'
FORBIDDEN_CHARS = '<>:"/\\|?*' + "".join([chr(i) for i in range(0, 32)])
FORBIDDEN_NAMES = {
"CON", "PRN", "AUX", "NUL",
"COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",

View File

@ -415,6 +415,7 @@ class OutputDirectory:
def _update_metadata(self, info: DownloadInfo) -> None:
if mtime := info.heuristics.mtime:
log.explain(f"Setting mtime to {mtime}")
mtimestamp = mtime.timestamp()
os.utime(info.local_path, times=(mtimestamp, mtimestamp))

View File

@ -1,2 +1,2 @@
NAME = "PFERD"
VERSION = "3.4.3"
VERSION = "3.5.0"

6
flake.lock generated
View File

@ -2,11 +2,11 @@
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1692986144,
"narHash": "sha256-M4VFpy7Av9j+33HF5nIGm0k2+DXXW4qSSKdidIKg5jY=",
"lastModified": 1694499547,
"narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "74e5bdc5478ebbe7ba5849f0d765f92757bb9dbf",
"rev": "e5f018cf150e29aac26c61dac0790ea023c46b24",
"type": "github"
},
"original": {