Mirror of https://github.com/Garmelon/PFERD.git, synced 2025-07-15 15:32:36 +02:00

Compare commits: 42098dc3a5...v3.5.2 (8 commits)

Commits:

- eb01aa86cb
- 3db186a978
- 4a5959fd58
- 1cbc2b717a
- da627ff929
- c1b592ac29
- eb0c956d32
- ab0cb2d956

CHANGELOG.md (11 changed lines)

```diff
@@ -22,6 +22,17 @@ ambiguous situations.
 ## Unreleased
 
+## 3.5.2 - 2024-04-14
+
+### Fixed
+- Crawling of personal desktop with ILIAS 8
+- Crawling of empty personal desktops
+
+## 3.5.1 - 2024-04-09
+
+### Added
+- Support for ILIAS 8
+
+### Fixed
+- Video name deduplication
```

```diff
@@ -92,9 +92,6 @@ common to all crawlers:
   load for the crawl target. (Default: `0.0`)
 - `windows_paths`: Whether PFERD should find alternative names for paths that
   are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
-- `aliases`: List of strings that are considered as an alias when invoking with
-  the `--crawler` or `-C` flag. If there is more than one crawl section with
-  the same aliases all are selected. Thereby, you can group different crawlers.
 
 Some crawlers may also require credentials for authentication. To configure how
 the crawler obtains its credentials, the `auth` option is used. It is set to the
@@ -109,7 +106,6 @@ username = foo
 password = bar
 
 [crawl:something]
-aliases = [sth, some]
 type = some-complex-crawler
 auth = auth:example
 on_conflict = no-delete
```
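
For context, this is roughly what the now-removed option looked like in use (the section names and the `uni` alias below are illustrative):

```ini
[crawl:lectures]
type = some-complex-crawler
aliases = [uni]

[crawl:exercises]
type = some-complex-crawler
aliases = [uni]
```

Invoking `pferd --crawler uni` (or `-C uni`) would have selected both sections at once; after this revert, `-C` matches crawl section names only.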

```diff
@@ -95,13 +95,9 @@ class IliasPage:
 
     @staticmethod
     def is_root_page(soup: BeautifulSoup) -> bool:
-        permalink = soup.find(id="current_perma_link")
-        if permalink is None:
-            return False
-        value = permalink.attrs.get("value")
-        if value is None:
-            return False
-        return "goto.php?target=root_" in value
+        if permalink := IliasPage.get_soup_permalink(soup):
+            return "goto.php?target=root_" in permalink
+        return False
 
     def get_child_elements(self) -> List[IliasPageElement]:
         """
@@ -279,16 +275,14 @@ class IliasPage:
         return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x})
 
     def _is_content_page(self) -> bool:
-        link = self._soup.find(id="current_perma_link")
-        if not link:
-            return False
-        return "target=copa_" in link.get("value")
+        if link := self.get_permalink():
+            return "target=copa_" in link
+        return False
 
     def _is_learning_module_page(self) -> bool:
-        link = self._soup.find(id="current_perma_link")
-        if not link:
-            return False
-        return "target=pg_" in link.get("value")
+        if link := self.get_permalink():
+            return "target=pg_" in link
+        return False
 
     def _contains_collapsed_future_meetings(self) -> bool:
         return self._uncollapse_future_meetings_url() is not None
```

```diff
@@ -384,6 +378,10 @@ class IliasPage:
             name = _sanitize_path_name(link.text.strip())
             url = self._abs_url_from_link(link)
 
+            if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url:
+                # Configure button/link does not have anything interesting
+                continue
+
             type = self._find_type_from_link(name, link, url)
             if not type:
                 _unexpected_html_warning()
```

```diff
@@ -513,8 +511,8 @@ class IliasPage:
             modification_string = link.parent.parent.parent.select_one(
                 f"td.std:nth-child({index})"
             ).getText().strip()
-            if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
-                modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
+            if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
+                modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
                 break
 
         if modification_time is None:
```
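
The practical effect of the regex change, as a standalone sketch: the old code passed the entire cell text to `strptime` and required a literal ` - ` separator, so any surrounding text or a missing dash raised a `ValueError`; the new code extracts just the matching substring first (the cell text below is illustrative):

```python
import re
from datetime import datetime

modification_string = "Submitted: 14.04.2024 18:05"  # illustrative cell text

# Old approach: strptime on the whole string fails because of the prefix
# datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")  -> ValueError

# New approach: search for the date/time pattern and parse only the match
if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string):
    modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M")
    print(modification_time)  # 2024-04-14 18:05:00
```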

```diff
@@ -613,7 +611,7 @@ class IliasPage:
         file_listings: List[Tag] = container.findAll(
             name="a",
             # download links contain the given command class
-            attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
+            attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()}
         )
 
         # Add each listing as a new
```
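
A quick illustration of why the lowered comparison matters (the href value is made up): ILIAS emits the command class in varying capitalization, so an exact substring test silently misses some download links.

```python
href = "ilias.php?cmd=download&cmdclass=ilExSubmissionFileGUI"  # illustrative

print("cmdClass=ilexsubmissionfilegui" in href)          # False: case mismatch
print("cmdclass=ilexsubmissionfilegui" in href.lower())  # True
```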

```diff
@@ -1080,6 +1078,14 @@ class IliasPage:
         if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}):
             return True
 
+        # Empty personal desktop has zero (0) markers. Match on the text...
+        if alert := soup.select_one(".alert-info"):
+            text = alert.getText().lower()
+            if "you have not yet selected any favourites" in text:
+                return True
+            if "sie haben aktuell noch keine favoriten ausgewählt" in text:
+                return True
+
         # Video listing embeds do not have complete ILIAS html. Try to match them by
         # their video listing table
         video_table = soup.find(
```
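
A self-contained sketch of the new empty-desktop detection (the alert markup is hand-written; the selector and matched phrases come from the diff):

```python
from bs4 import BeautifulSoup

html = '<div class="alert alert-info">You have not yet selected any favourites.</div>'
soup = BeautifulSoup(html, "html.parser")

# Same pattern as the new code: find the info alert and match on its text
if alert := soup.select_one(".alert-info"):
    text = alert.getText().lower()
    print("you have not yet selected any favourites" in text)  # True
```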

```diff
@@ -1095,6 +1101,9 @@ class IliasPage:
             return True
         return False
 
+    def get_permalink(self) -> Optional[str]:
+        return IliasPage.get_soup_permalink(self._soup)
+
     def _abs_url_from_link(self, link_tag: Tag) -> str:
         """
         Create an absolute url from an <a> tag.
@@ -1107,6 +1116,13 @@ class IliasPage:
         """
         return urljoin(self._page_url, relative_url)
 
+    @staticmethod
+    def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
+        perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
+        if not perma_link_element or not perma_link_element.get("href"):
+            return None
+        return perma_link_element.get("href")
+
 
 def _unexpected_html_warning() -> None:
     log.warn("Encountered unexpected HTML structure, ignoring element.")
```
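
For illustration, a standalone sketch of how the new helper fits together: ILIAS 8 renders the permanent link as an anchor in the page footer, so the selector finds it and the various `_is_*_page` checks become substring tests on its href. The HTML below is hand-written to mimic such a footer; only the selector and the helper's logic come from the diff.

```python
from typing import Optional

from bs4 import BeautifulSoup, Tag

html = """
<footer>
  <div class="il-footer-permanent-url">
    <a href="https://ilias.example.com/goto.php?target=copa_1234">Permanent link</a>
  </div>
</footer>
"""

def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]:
    # Same selector as the new static method on IliasPage
    perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a")
    if not perma_link_element or not perma_link_element.get("href"):
        return None
    return perma_link_element.get("href")

soup = BeautifulSoup(html, "html.parser")
if link := get_soup_permalink(soup):
    print("target=copa_" in link)  # True: detected as a content page
```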

```diff
@@ -130,6 +130,7 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
             raise CrawlError("Impossible return in ilias _iorepeat")
 
         return wrapper  # type: ignore
 
     return decorator
 
```

```diff
@@ -227,7 +228,7 @@ instance's greatest bottleneck.
         await self._crawl_url(root_url, expected_id=course_id)
 
     async def _crawl_desktop(self) -> None:
-        appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items"
+        appendix = r"ILIAS\Repository\Provider\RepositoryMainBarProvider|mm_pd_sel_items"
         appendix = appendix.encode("ASCII").hex()
         await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix)
 
```
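
As a worked example of what that encoding produces: the provider string becomes the ASCII-hex `item` parameter for `gs_content.php`, so the change only swaps which main-bar provider is named before encoding (the base URL below is hypothetical):

```python
appendix = r"ILIAS\Repository\Provider\RepositoryMainBarProvider|mm_pd_sel_items"
appendix = appendix.encode("ASCII").hex()

# Hypothetical base URL, just to show the final shape of the request
base_url = "https://ilias.example.com"
print(base_url + "/gs_content.php?item=" + appendix)
```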

```diff
@@ -253,8 +254,8 @@ instance's greatest bottleneck.
         soup = await self._get_page(next_stage_url, root_page_allowed=True)
 
         if current_parent is None and expected_id is not None:
-            perma_link_element: Tag = soup.find(id="current_perma_link")
-            if not perma_link_element or "crs_" not in perma_link_element.get("value"):
+            perma_link = IliasPage.get_soup_permalink(soup)
+            if not perma_link or "crs_" not in perma_link:
                 raise CrawlError("Invalid course id? Didn't find anything looking like a course")
 
         log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
```

```diff
@@ -674,12 +675,28 @@ instance's greatest bottleneck.
 
     async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
         async def try_stream() -> bool:
-            async with self.session.get(url, allow_redirects=is_video) as resp:
-                if not is_video:
-                    # Redirect means we weren't authenticated
-                    if hdrs.LOCATION in resp.headers:
-                        return False
-                # we wanted a video but got HTML
-                if is_video and "html" in resp.content_type:
-                    return False
+            next_url = url
+
+            # Normal files redirect to the magazine if we are not authenticated. As files could be HTML,
+            # we can not match on the content type here. Instead, we disallow redirects and inspect the
+            # new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume
+            # our authentication expired.
+            if not is_video:
+                async with self.session.get(url, allow_redirects=False) as resp:
+                    # Redirect to anything except a "sendfile" means we weren't authenticated
+                    if hdrs.LOCATION in resp.headers:
+                        if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
+                            return False
+                        # Directly follow the redirect to not make a second, unnecessary request
+                        next_url = resp.headers[hdrs.LOCATION]
+
+            # Let's try this again and follow redirects
+            return await fetch_follow_redirects(next_url)
+
+        async def fetch_follow_redirects(file_url: str) -> bool:
+            async with self.session.get(file_url) as resp:
+                # We wanted a video but got HTML => Forbidden, auth expired. Logging in won't really
+                # solve that depending on the setup, but it is better than nothing.
+                if is_video and "html" in resp.content_type:
+                    return False
 
```
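
The gist of the new two-step download flow, as a hedged standalone sketch (the function name and session handling are illustrative; the real code additionally streams the body into a sink and is wrapped in retry logic via `_iorepeat`):

```python
import aiohttp
from aiohttp import hdrs

async def fetch_if_authenticated(session: aiohttp.ClientSession, url: str) -> bool:
    # Step 1: disallow redirects and inspect where ILIAS wants to send us.
    next_url = url
    async with session.get(url, allow_redirects=False) as resp:
        if hdrs.LOCATION in resp.headers:
            # Anything other than the ILIAS 8 "sendfile" command means the
            # session expired and we were bounced to the magazine/login.
            if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]:
                return False
            # Follow the redirect directly instead of re-requesting the page
            next_url = resp.headers[hdrs.LOCATION]

    # Step 2: fetch the actual file, now following redirects normally.
    async with session.get(next_url) as resp:
        return "html" not in resp.content_type
```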

```diff
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional
 
 from rich.markup import escape
 
```

```diff
@@ -43,24 +43,16 @@ class Pferd:
 
         crawl_sections = [name for name, _ in config.crawl_sections()]
 
-        crawlers_to_run = set()  # With crawl: prefix
+        crawlers_to_run = []  # With crawl: prefix
         unknown_names = []  # Without crawl: prefix
 
         for name in cli_crawlers:
             section_name = f"crawl:{name}"
             if section_name in crawl_sections:
                 log.explain(f"Crawler section named {section_name!r} exists")
-                crawlers_to_run.add(section_name)
-            # interprete name as alias of a crawler
-            alias_names = self._find_crawlers_by_alias(name, config)
-            if alias_names:
-                crawlers_to_run.update(alias_names)
-                log.explain_topic(f"Crawler alias {name!r} found corresponding crawler sections:")
-                for alias_name in alias_names:
-                    log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
-
-            if not section_name in crawl_sections and not alias_names:
-                log.explain(f"There's neither a crawler section named {section_name!r} nor does a crawler with alias {name!r} exist.")
+                crawlers_to_run.append(section_name)
+            else:
+                log.explain(f"There's no crawler section named {section_name!r}")
                 unknown_names.append(name)
 
         if unknown_names:
```

```diff
@@ -73,14 +65,6 @@ class Pferd:
 
         return crawlers_to_run
 
-    def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
-        alias_names = set()
-        for (section_name, section) in config.crawl_sections():
-            section_aliases = section.get("aliases", [])
-            if alias in section_aliases:
-                alias_names.add(section_name)
-        return alias_names
-
     def _find_crawlers_to_run(
         self,
         config: Config,
```

```diff
@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.5.0"
+VERSION = "3.5.2"
```

flake.lock (8 changed lines, generated)

```diff
@@ -2,16 +2,16 @@
   "nodes": {
     "nixpkgs": {
       "locked": {
-        "lastModified": 1694499547,
-        "narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=",
+        "lastModified": 1708979614,
+        "narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e5f018cf150e29aac26c61dac0790ea023c46b24",
+        "rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a",
         "type": "github"
       },
       "original": {
         "owner": "NixOS",
-        "ref": "nixos-23.05",
+        "ref": "nixos-23.11",
         "repo": "nixpkgs",
         "type": "github"
       }
```