mirror of https://github.com/Garmelon/PFERD.git
Compare commits
4 Commits
b54b3b979c
...
50b50513c6
Author | SHA1 | Date |
---|---|---|
I-Al-Istannen | 50b50513c6 | |
I-Al-Istannen | df3514cd03 | |
I-Al-Istannen | ad53185247 | |
I-Al-Istannen | 87b67e9271 | |
|
@@ -28,6 +28,9 @@ ambiguous situations.
|
||||||
- Crawling of button cards without descriptions
|
- Crawling of button cards without descriptions
|
||||||
- Abort crawling when encountering an unexpected ilias root page redirect
|
- Abort crawling when encountering an unexpected ilias root page redirect
|
||||||
- Remove size suffix for files in content pages
|
- Remove size suffix for files in content pages
|
||||||
|
- Sanitize ascii control characters on Windows
|
||||||
|
- Crawling of paginated past meetings
|
||||||
|
- Ignore SCORM learning modules
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- `no-delete-prompt-override` conflict resolution strategy
|
- `no-delete-prompt-override` conflict resolution strategy
|
||||||
|
@@ -36,6 +39,7 @@ ambiguous situations.
|
||||||
message. This combines nicely with the `no-delete-prompt-override` strategy,
|
message. This combines nicely with the `no-delete-prompt-override` strategy,
|
||||||
causing PFERD to mostly ignore local-only files.
|
causing PFERD to mostly ignore local-only files.
|
||||||
- support for mediacast video listings
|
- support for mediacast video listings
|
||||||
|
- crawling of files in info tab
|
||||||
|
|
||||||
## 3.4.3 - 2022-11-29
|
## 3.4.3 - 2022-11-29
|
||||||
|
|
||||||
|
|
|
@@ -22,10 +22,12 @@ class IliasElementType(Enum):
|
||||||
FOLDER = "folder"
|
FOLDER = "folder"
|
||||||
FORUM = "forum"
|
FORUM = "forum"
|
||||||
LINK = "link"
|
LINK = "link"
|
||||||
|
INFO_TAB = "info_tab"
|
||||||
LEARNING_MODULE = "learning_module"
|
LEARNING_MODULE = "learning_module"
|
||||||
BOOKING = "booking"
|
BOOKING = "booking"
|
||||||
MEETING = "meeting"
|
MEETING = "meeting"
|
||||||
SURVEY = "survey"
|
SURVEY = "survey"
|
||||||
|
SCORM_LEARNING_MODULE = "scorm_learning_module"
|
||||||
MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
|
MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder"
|
||||||
MEDIACAST_VIDEO = "mediacast_video"
|
MEDIACAST_VIDEO = "mediacast_video"
|
||||||
OPENCAST_VIDEO = "opencast_video"
|
OPENCAST_VIDEO = "opencast_video"
|
||||||
|
@@ -120,9 +122,25 @@ class IliasPage:
|
||||||
if self._is_content_page():
|
if self._is_content_page():
|
||||||
log.explain("Page is a content page, searching for elements")
|
log.explain("Page is a content page, searching for elements")
|
||||||
return self._find_copa_entries()
|
return self._find_copa_entries()
|
||||||
|
if self._is_info_tab():
|
||||||
|
log.explain("Page is info tab, searching for elements")
|
||||||
|
return self._find_info_tab_entries()
|
||||||
log.explain("Page is a normal folder, searching for elements")
|
log.explain("Page is a normal folder, searching for elements")
|
||||||
return self._find_normal_entries()
|
return self._find_normal_entries()
|
||||||
|
|
||||||
|
def get_info_tab(self) -> Optional[IliasPageElement]:
|
||||||
|
tab: Optional[Tag] = self._soup.find(
|
||||||
|
name="a",
|
||||||
|
attrs={"href": lambda x: x and "cmdClass=ilinfoscreengui" in x}
|
||||||
|
)
|
||||||
|
if tab is not None:
|
||||||
|
return IliasPageElement(
|
||||||
|
IliasElementType.INFO_TAB,
|
||||||
|
self._abs_url_from_link(tab),
|
||||||
|
"infos"
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
def get_description(self) -> Optional[BeautifulSoup]:
|
def get_description(self) -> Optional[BeautifulSoup]:
|
||||||
def is_interesting_class(name: str) -> bool:
|
def is_interesting_class(name: str) -> bool:
|
||||||
return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
|
return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"]
|
||||||
|
@@ -209,7 +227,11 @@ class IliasPage:
|
||||||
log.explain("Requesting *all* future meetings")
|
log.explain("Requesting *all* future meetings")
|
||||||
return self._uncollapse_future_meetings_url()
|
return self._uncollapse_future_meetings_url()
|
||||||
if not self._is_content_tab_selected():
|
if not self._is_content_tab_selected():
|
||||||
return self._select_content_page_url()
|
if self._page_type != IliasElementType.INFO_TAB:
|
||||||
|
log.explain("Selecting content tab")
|
||||||
|
return self._select_content_page_url()
|
||||||
|
else:
|
||||||
|
log.explain("Crawling info tab, skipping content select")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _is_forum_page(self) -> bool:
|
def _is_forum_page(self) -> bool:
|
||||||
|
@@ -272,7 +294,10 @@ class IliasPage:
|
||||||
return self._uncollapse_future_meetings_url() is not None
|
return self._uncollapse_future_meetings_url() is not None
|
||||||
|
|
||||||
def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]:
|
def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]:
|
||||||
element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x})
|
element = self._soup.find(
|
||||||
|
"a",
|
||||||
|
attrs={"href": lambda x: x and ("crs_next_sess=1" in x or "crs_prev_sess=1" in x)}
|
||||||
|
)
|
||||||
if not element:
|
if not element:
|
||||||
return None
|
return None
|
||||||
link = self._abs_url_from_link(element)
|
link = self._abs_url_from_link(element)
|
||||||
|
@@ -281,6 +306,10 @@ class IliasPage:
|
||||||
def _is_content_tab_selected(self) -> bool:
|
def _is_content_tab_selected(self) -> bool:
|
||||||
return self._select_content_page_url() is None
|
return self._select_content_page_url() is None
|
||||||
|
|
||||||
|
def _is_info_tab(self) -> bool:
|
||||||
|
might_be_info = self._soup.find("form", attrs={"name": lambda x: x == "formInfoScreen"}) is not None
|
||||||
|
return self._page_type == IliasElementType.INFO_TAB and might_be_info
|
||||||
|
|
||||||
def _select_content_page_url(self) -> Optional[IliasPageElement]:
|
def _select_content_page_url(self) -> Optional[IliasPageElement]:
|
||||||
tab = self._soup.find(
|
tab = self._soup.find(
|
||||||
id="tab_view_content",
|
id="tab_view_content",
|
||||||
|
@@ -389,6 +418,23 @@ class IliasPage:
|
||||||
|
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
def _find_info_tab_entries(self) -> List[IliasPageElement]:
|
||||||
|
items = []
|
||||||
|
links: List[Tag] = self._soup.select("a.il_ContainerItemCommand")
|
||||||
|
|
||||||
|
for link in links:
|
||||||
|
if "cmdClass=ilobjcoursegui" not in link["href"]:
|
||||||
|
continue
|
||||||
|
if "cmd=sendfile" not in link["href"]:
|
||||||
|
continue
|
||||||
|
items.append(IliasPageElement(
|
||||||
|
IliasElementType.FILE,
|
||||||
|
self._abs_url_from_link(link),
|
||||||
|
_sanitize_path_name(link.getText())
|
||||||
|
))
|
||||||
|
|
||||||
|
return items
|
||||||
|
|
||||||
def _find_opencast_video_entries(self) -> List[IliasPageElement]:
|
def _find_opencast_video_entries(self) -> List[IliasPageElement]:
|
||||||
# ILIAS has three stages for video pages
|
# ILIAS has three stages for video pages
|
||||||
# 1. The initial dummy page without any videos. This page contains the link to the listing
|
# 1. The initial dummy page without any videos. This page contains the link to the listing
|
||||||
|
@@ -908,6 +954,9 @@ class IliasPage:
|
||||||
if "baseClass=ilMediaCastHandlerGUI" in parsed_url.query:
|
if "baseClass=ilMediaCastHandlerGUI" in parsed_url.query:
|
||||||
return IliasElementType.MEDIACAST_VIDEO_FOLDER
|
return IliasElementType.MEDIACAST_VIDEO_FOLDER
|
||||||
|
|
||||||
|
if "baseClass=ilSAHSPresentationGUI" in parsed_url.query:
|
||||||
|
return IliasElementType.SCORM_LEARNING_MODULE
|
||||||
|
|
||||||
# Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
|
# Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
|
||||||
# try to guess it from the image.
|
# try to guess it from the image.
|
||||||
|
|
||||||
|
@@ -949,7 +998,11 @@ class IliasPage:
|
||||||
if img_tag is None:
|
if img_tag is None:
|
||||||
img_tag = found_parent.select_one("img.icon")
|
img_tag = found_parent.select_one("img.icon")
|
||||||
|
|
||||||
if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}):
|
is_session_expansion_button = found_parent.find(
|
||||||
|
"a",
|
||||||
|
attrs={"href": lambda x: x and ("crs_next_sess=" in x or "crs_prev_sess=" in x)}
|
||||||
|
)
|
||||||
|
if img_tag is None and is_session_expansion_button:
|
||||||
log.explain("Found session expansion button, skipping it as it has no content")
|
log.explain("Found session expansion button, skipping it as it has no content")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@@ -982,6 +1035,9 @@ class IliasPage:
|
||||||
if str(img_tag["src"]).endswith("icon_mcst.svg"):
|
if str(img_tag["src"]).endswith("icon_mcst.svg"):
|
||||||
return IliasElementType.MEDIACAST_VIDEO_FOLDER
|
return IliasElementType.MEDIACAST_VIDEO_FOLDER
|
||||||
|
|
||||||
|
if str(img_tag["src"]).endswith("icon_sahs.svg"):
|
||||||
|
return IliasElementType.SCORM_LEARNING_MODULE
|
||||||
|
|
||||||
return IliasElementType.FOLDER
|
return IliasElementType.FOLDER
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
|
@@ -85,6 +85,7 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([
|
||||||
IliasElementType.EXERCISE,
|
IliasElementType.EXERCISE,
|
||||||
IliasElementType.EXERCISE_FILES,
|
IliasElementType.EXERCISE_FILES,
|
||||||
IliasElementType.FOLDER,
|
IliasElementType.FOLDER,
|
||||||
|
IliasElementType.INFO_TAB,
|
||||||
IliasElementType.MEETING,
|
IliasElementType.MEETING,
|
||||||
IliasElementType.MEDIACAST_VIDEO_FOLDER,
|
IliasElementType.MEDIACAST_VIDEO_FOLDER,
|
||||||
IliasElementType.OPENCAST_VIDEO_FOLDER,
|
IliasElementType.OPENCAST_VIDEO_FOLDER,
|
||||||
|
@@ -262,6 +263,8 @@ instance's greatest bottleneck.
|
||||||
next_stage_url = None
|
next_stage_url = None
|
||||||
|
|
||||||
elements.extend(page.get_child_elements())
|
elements.extend(page.get_child_elements())
|
||||||
|
if info_tab := page.get_info_tab():
|
||||||
|
elements.append(info_tab)
|
||||||
if description_string := page.get_description():
|
if description_string := page.get_description():
|
||||||
description.append(description_string)
|
description.append(description_string)
|
||||||
|
|
||||||
|
@@ -400,6 +403,14 @@ instance's greatest bottleneck.
|
||||||
"[bright_black](surveys contain no relevant data)"
|
"[bright_black](surveys contain no relevant data)"
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
elif element.type == IliasElementType.SCORM_LEARNING_MODULE:
|
||||||
|
log.status(
|
||||||
|
"[bold bright_black]",
|
||||||
|
"Ignored",
|
||||||
|
fmt_path(element_path),
|
||||||
|
"[bright_black](scorm learning modules are not supported)"
|
||||||
|
)
|
||||||
|
return None
|
||||||
elif element.type == IliasElementType.LEARNING_MODULE:
|
elif element.type == IliasElementType.LEARNING_MODULE:
|
||||||
return await self._handle_learning_module(element, element_path)
|
return await self._handle_learning_module(element, element_path)
|
||||||
elif element.type == IliasElementType.LINK:
|
elif element.type == IliasElementType.LINK:
|
||||||
|
@@ -705,7 +716,7 @@ instance's greatest bottleneck.
|
||||||
log.explain(f"URL: {next_stage_url}")
|
log.explain(f"URL: {next_stage_url}")
|
||||||
|
|
||||||
soup = await self._get_page(next_stage_url)
|
soup = await self._get_page(next_stage_url)
|
||||||
page = IliasPage(soup, next_stage_url, None)
|
page = IliasPage(soup, next_stage_url, element)
|
||||||
|
|
||||||
if next := page.get_next_stage_element():
|
if next := page.get_next_stage_element():
|
||||||
next_stage_url = next.url
|
next_stage_url = next.url
|
||||||
|
@@ -768,14 +779,14 @@ instance's greatest bottleneck.
|
||||||
log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
|
log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
|
||||||
log.explain(f"URL: {element.url}")
|
log.explain(f"URL: {element.url}")
|
||||||
soup = await self._get_page(element.url)
|
soup = await self._get_page(element.url)
|
||||||
page = IliasPage(soup, element.url, None)
|
page = IliasPage(soup, element.url, element)
|
||||||
if next := page.get_learning_module_data():
|
if next := page.get_learning_module_data():
|
||||||
elements.extend(await self._crawl_learning_module_direction(
|
elements.extend(await self._crawl_learning_module_direction(
|
||||||
cl.path, next.previous_url, "left"
|
cl.path, next.previous_url, "left", element
|
||||||
))
|
))
|
||||||
elements.append(next)
|
elements.append(next)
|
||||||
elements.extend(await self._crawl_learning_module_direction(
|
elements.extend(await self._crawl_learning_module_direction(
|
||||||
cl.path, next.next_url, "right"
|
cl.path, next.next_url, "right", element
|
||||||
))
|
))
|
||||||
|
|
||||||
# Reflect their natural ordering in the file names
|
# Reflect their natural ordering in the file names
|
||||||
|
@@ -797,7 +808,8 @@ instance's greatest bottleneck.
|
||||||
self,
|
self,
|
||||||
path: PurePath,
|
path: PurePath,
|
||||||
start_url: Optional[str],
|
start_url: Optional[str],
|
||||||
dir: Union[Literal["left"], Literal["right"]]
|
dir: Union[Literal["left"], Literal["right"]],
|
||||||
|
parent_element: IliasPageElement
|
||||||
) -> List[IliasLearningModulePage]:
|
) -> List[IliasLearningModulePage]:
|
||||||
elements: List[IliasLearningModulePage] = []
|
elements: List[IliasLearningModulePage] = []
|
||||||
|
|
||||||
|
@@ -810,7 +822,7 @@ instance's greatest bottleneck.
|
||||||
log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
|
log.explain_topic(f"Parsing HTML page for {fmt_path(path)} ({dir}-{counter})")
|
||||||
log.explain(f"URL: {next_element_url}")
|
log.explain(f"URL: {next_element_url}")
|
||||||
soup = await self._get_page(next_element_url)
|
soup = await self._get_page(next_element_url)
|
||||||
page = IliasPage(soup, next_element_url, None)
|
page = IliasPage(soup, next_element_url, parent_element)
|
||||||
if next := page.get_learning_module_data():
|
if next := page.get_learning_module_data():
|
||||||
elements.append(next)
|
elements.append(next)
|
||||||
if dir == "left":
|
if dir == "left":
|
||||||
|
@@ -893,7 +905,7 @@ instance's greatest bottleneck.
|
||||||
soup = soupify(await request.read())
|
soup = soupify(await request.read())
|
||||||
if self._is_logged_in(soup):
|
if self._is_logged_in(soup):
|
||||||
return self._verify_page(soup, url, root_page_allowed)
|
return self._verify_page(soup, url, root_page_allowed)
|
||||||
raise CrawlError("get_page failed even after authenticating")
|
raise CrawlError(f"get_page failed even after authenticating on {url!r}")
|
||||||
|
|
||||||
def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
|
def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
|
||||||
if IliasPage.is_root_page(soup) and not root_page_allowed:
|
if IliasPage.is_root_page(soup) and not root_page_allowed:
|
||||||
|
|
|
@@ -14,7 +14,7 @@ def name_variants(path: PurePath) -> Iterator[PurePath]:
|
||||||
|
|
||||||
|
|
||||||
class Deduplicator:
|
class Deduplicator:
|
||||||
FORBIDDEN_CHARS = '<>:"/\\|?*'
|
FORBIDDEN_CHARS = '<>:"/\\|?*' + "".join([chr(i) for i in range(0, 32)])
|
||||||
FORBIDDEN_NAMES = {
|
FORBIDDEN_NAMES = {
|
||||||
"CON", "PRN", "AUX", "NUL",
|
"CON", "PRN", "AUX", "NUL",
|
||||||
"COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
|
"COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
|
||||||
|
|
Loading…
Reference in New Issue