Compare commits

...

4 Commits

Author SHA1 Message Date
Julius Rüberg 42098dc3a5
Merge 77c1f1516c into a117126389 2023-12-09 22:34:12 -07:00
I-Al-Istannen a117126389 Fix video name deduplication 2023-12-09 23:08:42 +01:00
Julius Rüberg 77c1f1516c Used proper plural 2021-11-02 12:41:40 +01:00
Julius Rüberg 9e12e96d90 Added alias functionality 2021-11-02 03:42:08 +01:00
4 changed files with 90 additions and 60 deletions

View File

@ -22,6 +22,9 @@ ambiguous situations.
## Unreleased ## Unreleased
### Fixed
- Video name deduplication
## 3.5.0 - 2023-09-13 ## 3.5.0 - 2023-09-13
### Added ### Added

View File

@ -92,6 +92,9 @@ common to all crawlers:
load for the crawl target. (Default: `0.0`) load for the crawl target. (Default: `0.0`)
- `windows_paths`: Whether PFERD should find alternative names for paths that - `windows_paths`: Whether PFERD should find alternative names for paths that
are invalid on Windows. (Default: `yes` on Windows, `no` otherwise) are invalid on Windows. (Default: `yes` on Windows, `no` otherwise)
- `aliases`: List of alternative names for this crawler that can be passed to
the `--crawler` or `-C` flag. If several crawl sections share an alias, all of
them are selected, so an alias can be used to group different crawlers.
Some crawlers may also require credentials for authentication. To configure how Some crawlers may also require credentials for authentication. To configure how
the crawler obtains its credentials, the `auth` option is used. It is set to the the crawler obtains its credentials, the `auth` option is used. It is set to the
@ -106,6 +109,7 @@ username = foo
password = bar password = bar
[crawl:something] [crawl:something]
aliases = [sth, some]
type = some-complex-crawler type = some-complex-crawler
auth = auth:example auth = auth:example
on_conflict = no-delete on_conflict = no-delete

View File

@ -140,6 +140,10 @@ def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]:
return _iorepeat(1, name) return _iorepeat(1, name)
def _get_video_cache_key(element: IliasPageElement) -> str:
    """
    Build the key under which video information for ``element`` is stored as
    a custom value in the report.

    The key is derived from the value returned by ``element.id()``.
    """
    element_id = element.id()
    return f"ilias-video-cache-{element_id}"
# Crawler control flow: # Crawler control flow:
# #
# crawl_desktop -+ # crawl_desktop -+
@ -547,8 +551,8 @@ instance's greatest bottleneck.
# Copy old mapping as it is likely still relevant # Copy old mapping as it is likely still relevant
if self.prev_report: if self.prev_report:
self.report.add_custom_value( self.report.add_custom_value(
str(element_path), _get_video_cache_key(element),
self.prev_report.get_custom_value(str(element_path)) self.prev_report.get_custom_value(_get_video_cache_key(element))
) )
# A video might contain other videos, so let's "crawl" the video first # A video might contain other videos, so let's "crawl" the video first
@ -558,58 +562,69 @@ instance's greatest bottleneck.
# to ensure backwards compatibility. # to ensure backwards compatibility.
maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.ALWAYS) maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.ALWAYS)
# If we do not want to crawl it (user filter) or we have every file # If we do not want to crawl it (user filter), we can move on
# from the cached mapping already, we can ignore this and bail if not maybe_dl:
if not maybe_dl or self._all_opencast_videos_locally_present(element_path): return None
# Mark all existing videos as known so they do not get deleted
# during cleanup. We "downloaded" them, just without actually making # If we have every file from the cached mapping already, we can ignore this and bail
# a network request as we assumed they did not change. if self._all_opencast_videos_locally_present(element, maybe_dl.path):
for video in self._previous_contained_opencast_videos(element_path): # Mark all existing videos as known to ensure they do not get deleted during cleanup.
await self.download(video) # We "downloaded" them, just without actually making a network request as we assumed
# they did not change.
contained = self._previous_contained_opencast_videos(element, maybe_dl.path)
if len(contained) > 1:
# Only do this if we threw away the original dl token,
# to not download single-stream videos twice
for video in contained:
await self.download(video)
return None return None
return self._download_opencast_video(element_path, element, maybe_dl) return self._download_opencast_video(element, maybe_dl)
def _previous_contained_opencast_videos(self, video_path: PurePath) -> List[PurePath]: def _previous_contained_opencast_videos(
self, element: IliasPageElement, element_path: PurePath
) -> List[PurePath]:
if not self.prev_report: if not self.prev_report:
return [] return []
custom_value = self.prev_report.get_custom_value(str(video_path)) custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element))
if not custom_value: if not custom_value:
return [] return []
names = cast(List[str], custom_value) cached_value = cast(dict[str, Any], custom_value)
folder = video_path.parent if "known_paths" not in cached_value or "own_path" not in cached_value:
return [PurePath(folder, name) for name in names] log.explain(f"'known_paths' or 'own_path' missing from cached value: {cached_value}")
return []
transformed_own_path = self._transformer.transform(element_path)
if cached_value["own_path"] != str(transformed_own_path):
log.explain(
f"own_path '{transformed_own_path}' does not match cached value: '{cached_value['own_path']}"
)
return []
return [PurePath(name) for name in cached_value["known_paths"]]
def _all_opencast_videos_locally_present(self, video_path: PurePath) -> bool: def _all_opencast_videos_locally_present(self, element: IliasPageElement, element_path: PurePath) -> bool:
if contained_videos := self._previous_contained_opencast_videos(video_path): log.explain_topic(f"Checking local cache for video {fmt_path(element_path)}")
log.explain_topic(f"Checking local cache for video {video_path.name}") if contained_videos := self._previous_contained_opencast_videos(element, element_path):
all_found_locally = True log.explain(
for video in contained_videos: f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}"
transformed_path = self._to_local_opencast_video_path(video) )
if transformed_path: if all(self._output_dir.resolve(path).exists() for path in contained_videos):
exists_locally = self._output_dir.resolve(transformed_path).exists() log.explain("Found all known videos locally, skipping enumeration request")
all_found_locally = all_found_locally and exists_locally
if all_found_locally:
log.explain("Found all videos locally, skipping enumeration request")
return True return True
log.explain("Missing at least one video, continuing with requests!") log.explain("Missing at least one video, continuing with requests!")
else:
log.explain("No local cache present")
return False return False
def _to_local_opencast_video_path(self, path: PurePath) -> Optional[PurePath]:
if transformed := self._transformer.transform(path):
return self._deduplicator.fixup_path(transformed)
return None
@anoncritical @anoncritical
@_iorepeat(3, "downloading video") @_iorepeat(3, "downloading video")
async def _download_opencast_video( async def _download_opencast_video(self, element: IliasPageElement, dl: DownloadToken) -> None:
self, def add_to_report(paths: list[str]) -> None:
original_path: PurePath, self.report.add_custom_value(
element: IliasPageElement, _get_video_cache_key(element),
dl: DownloadToken {"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))}
) -> None: )
stream_elements: List[IliasPageElement] = []
async with dl as (bar, sink): async with dl as (bar, sink):
page = IliasPage(await self._get_page(element.url), element.url, element) page = IliasPage(await self._get_page(element.url), element.url, element)
stream_elements = page.get_child_elements() stream_elements = page.get_child_elements()
@ -620,32 +635,25 @@ instance's greatest bottleneck.
log.explain(f"Using single video mode for {element.name}") log.explain(f"Using single video mode for {element.name}")
stream_element = stream_elements[0] stream_element = stream_elements[0]
transformed_path = self._to_local_opencast_video_path(original_path)
if not transformed_path:
raise CrawlError(f"Download returned a path but transform did not for {original_path}")
# We do not have a local cache yet # We do not have a local cache yet
if self._output_dir.resolve(transformed_path).exists(): await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
log.explain(f"Video for {element.name} existed locally") add_to_report([str(self._transformer.transform(dl.path))])
else:
await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
self.report.add_custom_value(str(original_path), [original_path.name])
return return
contained_video_paths: List[str] = [] contained_video_paths: List[str] = []
for stream_element in stream_elements: for stream_element in stream_elements:
video_path = original_path.parent / stream_element.name video_path = dl.path.parent / stream_element.name
contained_video_paths.append(str(video_path))
maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER) maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER)
if not maybe_dl: if not maybe_dl:
continue continue
async with maybe_dl as (bar, sink): async with maybe_dl as (bar, sink):
log.explain(f"Streaming video from real url {stream_element.url}") log.explain(f"Streaming video from real url {stream_element.url}")
contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
await self._stream_from_url(stream_element.url, sink, bar, is_video=True) await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
self.report.add_custom_value(str(original_path), contained_video_paths) add_to_report(contained_video_paths)
async def _handle_file( async def _handle_file(
self, self,
@ -657,8 +665,8 @@ instance's greatest bottleneck.
return None return None
return self._download_file(element, maybe_dl) return self._download_file(element, maybe_dl)
@anoncritical
@_iorepeat(3, "downloading file") @_iorepeat(3, "downloading file")
@anoncritical
async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None:
assert dl # The function is only reached when dl is not None assert dl # The function is only reached when dl is not None
async with dl as (bar, sink): async with dl as (bar, sink):
@ -728,7 +736,6 @@ instance's greatest bottleneck.
raise CrawlWarning("Failed to extract forum data") raise CrawlWarning("Failed to extract forum data")
if download_data.empty: if download_data.empty:
log.explain("Forum had no threads") log.explain("Forum had no threads")
elements = []
return return
html = await self._post_authenticated(download_data.url, download_data.form_data) html = await self._post_authenticated(download_data.url, download_data.form_data)
elements = parse_ilias_forum_export(soupify(html)) elements = parse_ilias_forum_export(soupify(html))
@ -962,7 +969,7 @@ instance's greatest bottleneck.
# We repeat this as the login method in shibboleth doesn't handle I/O errors. # We repeat this as the login method in shibboleth doesn't handle I/O errors.
# Shibboleth is quite reliable as well, the repeat is likely not critical here. # Shibboleth is quite reliable as well, the repeat is likely not critical here.
@ _iorepeat(3, "Login", failure_is_error=True) @_iorepeat(3, "Login", failure_is_error=True)
async def _authenticate(self) -> None: async def _authenticate(self) -> None:
await self._shibboleth_login.login(self.session) await self._shibboleth_login.login(self.session)
@ -1112,7 +1119,7 @@ async def _shib_post(
async with session.get(correct_url, allow_redirects=False) as response: async with session.get(correct_url, allow_redirects=False) as response:
location = response.headers.get("location") location = response.headers.get("location")
log.explain(f"Redirected to {location!r} with status {response.status}") log.explain(f"Redirected to {location!r} with status {response.status}")
# If shib still still has a valid session, it will directly respond to the request # If shib still has a valid session, it will directly respond to the request
if location is None: if location is None:
log.explain("Shib recognized us, returning its response directly") log.explain("Shib recognized us, returning its response directly")
return soupify(await response.read()) return soupify(await response.read())

View File

@ -1,5 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional from typing import Dict, List, Optional, Set
from rich.markup import escape from rich.markup import escape
@ -43,16 +43,24 @@ class Pferd:
crawl_sections = [name for name, _ in config.crawl_sections()] crawl_sections = [name for name, _ in config.crawl_sections()]
crawlers_to_run = [] # With crawl: prefix crawlers_to_run = set() # With crawl: prefix
unknown_names = [] # Without crawl: prefix unknown_names = [] # Without crawl: prefix
for name in cli_crawlers: for name in cli_crawlers:
section_name = f"crawl:{name}" section_name = f"crawl:{name}"
if section_name in crawl_sections: if section_name in crawl_sections:
log.explain(f"Crawler section named {section_name!r} exists") log.explain(f"Crawler section named {section_name!r} exists")
crawlers_to_run.append(section_name) crawlers_to_run.add(section_name)
else: # interprete name as alias of a crawler
log.explain(f"There's no crawler section named {section_name!r}") alias_names = self._find_crawlers_by_alias(name, config)
if alias_names:
crawlers_to_run.update(alias_names)
log.explain_topic(f"Crawler alias {name!r} found corresponding crawler sections:")
for alias_name in alias_names:
log.explain(f"Crawler section named {alias_name!r} with alias {name!r} exists")
if not section_name in crawl_sections and not alias_names:
log.explain(f"There's neither a crawler section named {section_name!r} nor does a crawler with alias {name!r} exist.")
unknown_names.append(name) unknown_names.append(name)
if unknown_names: if unknown_names:
@ -65,6 +73,14 @@ class Pferd:
return crawlers_to_run return crawlers_to_run
def _find_crawlers_by_alias(self, alias: str, config: Config) -> Set[str]:
    """
    Return the names of all crawl sections that list ``alias`` among their
    ``aliases`` option.

    ``config.crawl_sections()`` is expected to yield ``(section_name, section)``
    pairs where ``section`` offers a mapping-style ``get``.

    NOTE(review): the option value is presumably a raw string such as
    ``[sth, some]`` when it comes from an INI-style config file - confirm
    against the type returned by ``Config.crawl_sections``. A string value is
    split into individual names here; any other collection is used as is.
    """
    alias_names: Set[str] = set()
    for section_name, section in config.crawl_sections():
        section_aliases = section.get("aliases", [])
        if not section_aliases:
            # Option missing or empty: this section has no aliases.
            continue
        if isinstance(section_aliases, str):
            # On a plain string, `in` is a substring test, so "s" (or even "")
            # would match "[sth, some]". Split the value into separate names
            # first so that only exact alias names match.
            raw = section_aliases.strip()
            if raw.startswith("[") and raw.endswith("]"):
                raw = raw[1:-1]
            candidates = (part.strip().strip("'\"") for part in raw.split(","))
            section_aliases = [name for name in candidates if name]
        if alias in section_aliases:
            alias_names.add(section_name)
    return alias_names
def _find_crawlers_to_run( def _find_crawlers_to_run(
self, self,
config: Config, config: Config,