mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Correctly crawl video pages with multiple pages
This commit is contained in:
parent
cfe4a8fc0a
commit
c0056e5669
@ -272,6 +272,71 @@ class IliasCrawler:
|
|||||||
{"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
|
{"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# If we find a page selected, we probably need to respect pagination
|
||||||
|
if self._is_paginated_video_page(video_list_soup):
|
||||||
|
second_stage_url = self._abs_url_from_link(content_link)
|
||||||
|
|
||||||
|
return self._crawl_paginated_video_directory(
|
||||||
|
video_dir_path, video_list_soup, second_stage_url
|
||||||
|
)
|
||||||
|
|
||||||
|
return self._crawl_video_directory_second_stage(video_dir_path, video_list_soup)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _is_paginated_video_page(soup: bs4.BeautifulSoup) -> bool:
|
||||||
|
return soup.find(id=re.compile(r"tab_page_sel.+")) is not None
|
||||||
|
|
||||||
|
def _crawl_paginated_video_directory(
|
||||||
|
self,
|
||||||
|
video_dir_path: Path,
|
||||||
|
paged_video_list_soup: bs4.BeautifulSoup,
|
||||||
|
second_stage_url: str
|
||||||
|
) -> List[IliasDownloadInfo]:
|
||||||
|
LOGGER.info("Found paginated video page, trying 800 elements")
|
||||||
|
|
||||||
|
# Try to find the table id. This can be used to build the query parameter indicating
|
||||||
|
# you want 800 elements
|
||||||
|
|
||||||
|
table_element: bs4.Tag = paged_video_list_soup.find(
|
||||||
|
name="table", id=re.compile(r"tbl_xoct_.+")
|
||||||
|
)
|
||||||
|
if table_element is None:
|
||||||
|
PRETTY.warning(
|
||||||
|
"Could not increase elements per page (table not found)."
|
||||||
|
" Some might not be crawled!"
|
||||||
|
)
|
||||||
|
return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup)
|
||||||
|
|
||||||
|
match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"])
|
||||||
|
if match is None:
|
||||||
|
PRETTY.warning(
|
||||||
|
"Could not increase elements per page (table id not found)."
|
||||||
|
" Some might not be crawled!"
|
||||||
|
)
|
||||||
|
return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup)
|
||||||
|
table_id = match.group(1)
|
||||||
|
|
||||||
|
extended_video_page = self._get_page(
|
||||||
|
second_stage_url,
|
||||||
|
{f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
|
||||||
|
)
|
||||||
|
|
||||||
|
if self._is_paginated_video_page(extended_video_page):
|
||||||
|
PRETTY.warning(
|
||||||
|
"800 elements do not seem to be enough (or I failed to fetch that many)."
|
||||||
|
" I will miss elements."
|
||||||
|
)
|
||||||
|
|
||||||
|
return self._crawl_video_directory_second_stage(video_dir_path, extended_video_page)
|
||||||
|
|
||||||
|
def _crawl_video_directory_second_stage(
|
||||||
|
self,
|
||||||
|
video_dir_path: Path,
|
||||||
|
video_list_soup: bs4.BeautifulSoup
|
||||||
|
) -> List[IliasDownloadInfo]:
|
||||||
|
"""
|
||||||
|
Crawls the "second stage" video page. This page contains the actual video urls.
|
||||||
|
"""
|
||||||
direct_download_links: List[bs4.Tag] = video_list_soup.findAll(
|
direct_download_links: List[bs4.Tag] = video_list_soup.findAll(
|
||||||
name="a", text=re.compile(r"\s*Download\s*")
|
name="a", text=re.compile(r"\s*Download\s*")
|
||||||
)
|
)
|
||||||
@ -416,8 +481,10 @@ class IliasCrawler:
|
|||||||
content_type = response.headers["content-type"]
|
content_type = response.headers["content-type"]
|
||||||
|
|
||||||
if not content_type.startswith("text/html"):
|
if not content_type.startswith("text/html"):
|
||||||
# TODO: Correct exception type
|
raise FatalException(
|
||||||
raise Exception(f"Invalid content type {content_type}")
|
f"Invalid content type {content_type} when crawling ilias page"
|
||||||
|
" {url!r} with {params!r}"
|
||||||
|
)
|
||||||
|
|
||||||
soup = soupify(response)
|
soup = soupify(response)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user