From 70ec64a48ba8a56a819dfdbacba974f108d1206e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 13 Jun 2021 15:39:22 +0200 Subject: [PATCH] Fix wrong base URL for multi-stage pages --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 2 +- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6049d2..c09f921 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,9 @@ path separators to `/` in your regex rules. - Use the label to the left for exercises instead of the button name to determine the folder name +### Fixed +- Video pagination handling in ILIAS crawler + ## 3.0.1 - 2021-06-01 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index db9a303..384f0de 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -480,7 +480,7 @@ class IliasPage: return None if "opencast" in str(img_tag["alt"]).lower(): - return IliasElementType.VIDEO_FOLDER + return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED if str(img_tag["src"]).endswith("icon_exc.svg"): return IliasElementType.EXERCISE diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 78428e0..6495da9 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -253,7 +253,7 @@ instance's greatest bottleneck. soup = await self._get_page(next_stage_url) log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") log.explain(f"URL: {next_stage_url}") - page = IliasPage(soup, url, parent) + page = IliasPage(soup, next_stage_url, parent) next_stage_url = page.get_next_stage_url() elements.extend(page.get_child_elements())