Compare commits

..

3 Commits

Author SHA1 Message Date
Julius Rüberg
42098dc3a5 Merge 77c1f1516c into a117126389 2023-12-09 22:34:12 -07:00
Julius Rüberg
77c1f1516c Used proper plural 2021-11-02 12:41:40 +01:00
Julius Rüberg
9e12e96d90 Added alias functionality 2021-11-02 03:42:08 +01:00
8 changed files with 70 additions and 94 deletions

View File

@@ -22,17 +22,6 @@ ambiguous situations.
## Unreleased
## 3.5.2 - 2024-04-14
### Fixed
- Crawling of personal desktop with ILIAS 8
- Crawling of empty personal desktops
## 3.5.1 - 2024-04-09
### Added
- Support for ILIAS 8
### Fixed
- Video name deduplication

View File

@@ -92,6 +92,9 @@ common to all crawlers:
load for the crawl target. (Default: `0.0`)
- `windows_paths`: Whether PFERD should find alternative names for paths that
are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
- `aliases`: List of strings that are considered as an alias when invoking with
the `--crawler` or `-C` flag. If there is more than one crawl section with
the same aliases all are selected. Thereby, you can group different crawlers.
Some crawlers may also require credentials for authentication. To configure how
the crawler obtains its credentials, the `auth` option is used. It is set to the
@@ -106,6 +109,7 @@ username = foo
password = bar
[crawl:something]
aliases = [sth, some]
type = some-complex-crawler
auth = auth:example
on_conflict = no-delete

View File

@@ -95,9 +95,13 @@ class IliasPage:
@staticmethod
def is_root_page(soup: BeautifulSoup) -> bool:
if permalink := IliasPage.get_soup_permalink(soup):
return "goto.php?target=root_" in permalink
permalink = soup.find(id="current_perma_link")
if permalink is None:
return False
value = permalink.attrs.get("value")
if value is None:
return False
return "goto.php?target=root_" in value
def get_child_elements(self) -> List[IliasPageElement]:
"""
@@ -275,14 +279,16 @@ class IliasPage:
return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
def _is_content_page(self) -> bool:
if link := self.get_permalink():
return "target=copa_" in link
link = self._soup.find(id="current_perma_link")
if not link:
return False
return "target=copa_" in link.get("value")
def _is_learning_module_page(self) -> bool:
if link := self.get_permalink():
return "target=pg_" in link
link = self._soup.find(id="current_perma_link")
if not link:
return False
return "target=pg_" in link.get("value")
def _contains_collapsed_future_meetings(self) -> bool:
return self._uncollapse_future_meetings_url() is not None
@@ -378,10 +384,6 @@ class IliasPage:
name = _sanitize_path_name(link.text.strip())
url = self._abs_url_from_link(link)
if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
# Configure button/link does not have anything interesting
continue
type = self._find_type_from_link(name, link, url)
if not type:
_unexpected_html_warning()
@@ -511,8 +513,8 @@ class IliasPage:
modification_string = link.parent.parent.parent.select_one(
f"td.std:nth-child({index})"
).getText().strip()
if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
break
if modification_time is None:
@@ -611,7 +613,7 @@ class IliasPage:
file_listings: List[Tag] = container.findAll(
name="a",
# download links contain the given command class
attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
)
# Add each listing as a new
@@ -1078,14 +1080,6 @@ class IliasPage:
if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
return True
# Empty personal desktop has zero (0) markers. Match on the text...
if alert := soup.select_one(".alert-info"):
text = alert.getText().lower()
if "you have not yet selected any favourites" in text:
return True
if "sie haben aktuell noch keine favoriten ausgewählt" in text:
return True
# Video listing embeds do not have complete ILIAS html. Try to match them by
# their video listing table
video_table = soup.find(
@@ -1101,9 +1095,6 @@ class IliasPage:
return True
return False
def get_permalink(self) -> Optional[str]:
return IliasPage.get_soup_permalink(self._soup)
def _abs_url_from_link(self, link_tag: Tag) -> str:
"""
Create an absolute url from an <a> tag.
@@ -1116,13 +1107,6 @@ class IliasPage:
"""
return urljoin(self._page_url, relative_url)
@staticmethod
def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
if not perma_link_element or not perma_link_element.get("href"):
return None
return perma_link_element.get("href")
def _unexpected_html_warning() -> None:
log.warn("Encountered unexpected HTML structure, ignoring element.")

View File

@@ -130,7 +130,6 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
raise CrawlError("Impossible return in ilias _iorepeat")
return wrapper # type: ignore
return decorator
@@ -228,7 +227,7 @@ instance's greatest bottleneck.
await self._crawl_url(root_url, expected_id=course_id)
async def _crawl_desktop(self) -> None:
appendix = r"ILIAS\Repository\Provider\RepositoryMainBarProvider|mm_pd_sel_items"
appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items"
appendix = appendix.encode("ASCII").hex()
await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix)
@@ -254,8 +253,8 @@ instance's greatest bottleneck.
soup = await self._get_page(next_stage_url, root_page_allowed=True)
if current_parent is None and expected_id is not None:
perma_link = IliasPage.get_soup_permalink(soup)
if not perma_link or "crs_" not in perma_link:
perma_link_element: Tag = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value"):
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
@@ -675,28 +674,12 @@ instance's greatest bottleneck.
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
async def try_stream() -> bool:
next_url = url
# Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
# we can not match on the content type here. Instead, we disallow redirects and inspect the
# new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
# our authentication expired.
async with self.session.get(url, allow_redirects=is_video) as resp:
if not is_video:
async with self.session.get(url, allow_redirects=False) as resp:
# Redirect to anything except a "sendfile" means we weren't authenticated
# Redirect means we weren't authenticated
if hdrs.LOCATION in resp.headers:
if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
return False
# Directly follow the redirect to not make a second, unnecessary request
next_url = resp.headers[hdrs.LOCATION]
# Let's try this again and follow redirects
return await fetch_follow_redirects(next_url)
async def fetch_follow_redirects(file_url: str) -> bool:
async with self.session.get(file_url) as resp:
# We wanted a video but got HTML => Forbidden, auth expired. Logging in won't really
# solve that depending on the setup, but it is better than nothing.
# we wanted a video but got HTML
if is_video and "html" in resp.content_type:
return False

View File

@@ -1,5 +1,5 @@
from pathlib import Path
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Set
from rich.markup import escape
@@ -43,16 +43,24 @@ class Pferd:
crawl_sections = [name for name, _ in config.crawl_sections()]
crawlers_to_run = [] # With crawl: prefix
crawlers_to_run = set() # With crawl: prefix
unknown_names = [] # Without crawl: prefix
for name in cli_crawlers:
section_name = f"crawl:{name}"
if section_name in crawl_sections:
log.explain(f"Crawler section named {section_name!r} exists")
crawlers_to_run.append(section_name)
else:
log.explain(f"There's no crawler section named {section_name!r}")
crawlers_to_run.add(section_name)
# interprete name as alias of a crawler
alias_names = self._find_crawlers_by_alias(name, config)
if alias_names:
crawlers_to_run.update(alias_names)
log.explain_topic(f"Crawler alias {name!r} found corresponding crawler sections:")
for alias_name in alias_names:
log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
if not section_name in crawl_sections and not alias_names:
log.explain(f"There's neither a crawler section named {section_name!r} nor does a crawler with alias {name!r} exist.")
unknown_names.append(name)
if unknown_names:
@@ -65,6 +73,14 @@ class Pferd:
return crawlers_to_run
def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
alias_names = set()
for (section_name, section) in config.crawl_sections():
section_aliases = section.get("aliases", [])
if alias in section_aliases:
alias_names.add(section_name)
return alias_names
def _find_crawlers_to_run(
self,
config: Config,

View File

@@ -1,2 +1,2 @@
NAME = "PFERD"
VERSION = "3.5.2"
VERSION = "3.5.0"

8
flake.lock generated
View File

@@ -2,16 +2,16 @@
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1708979614,
"narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
"lastModified": 1694499547,
"narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
"rev": "e5f018cf150e29aac26c61dac0790ea023c46b24",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-23.11",
"ref": "nixos-23.05",
"repo": "nixpkgs",
"type": "github"
}

View File

@@ -2,7 +2,7 @@
description = "Tool for downloading course-related files from ILIAS";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05";
};
outputs = { self, nixpkgs }: