Implement video downloads in ilias crawler

I-Al-Istannen 2021-05-15 21:29:43 +02:00
parent c7494e32ce
commit 7d323ec62b

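The noteworthy part of the diff below is how VIDEO_PLAYER elements are handled: their URL only points at ILIAS's video player page, not at the file itself, so the crawler first parses that page and downloads the first child element it finds there; the download is also requested with Redownload.NEVER because videos are large and effectively immutable. A condensed sketch of that flow, using only names that appear in the diff (the free-standing helper and its parameters are illustrative, not part of the commit):

# Illustrative condensation of the _download_video logic from the diff below.
async def resolve_and_register_video(crawler, element, element_path):
    # Large, effectively immutable files: never redownload.
    dl = await crawler.download(element_path, mtime=element.mtime,
                                redownload=Redownload.NEVER)
    if not dl:
        return None

    # The element URL is only the player page; parse it to locate the real video.
    page = IliasPage(await crawler._get_page(element.url), element.url, element)
    real_element = page.get_child_elements()[0]
    return dl, real_element.url  # the caller streams this URL into the sink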

@@ -6,12 +6,14 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
 from pathlib import PurePath
-from typing import Any, Dict, List, Optional, Set, Union
+# TODO In Python 3.9 and above, AsyncContextManager is deprecated
+from typing import Any, AsyncContextManager, Dict, List, Optional, Set, Union
 from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
                           urlunsplit)
 
 import aiohttp
 from bs4 import BeautifulSoup, Tag
+from PFERD.output_dir import Redownload
 from PFERD.utils import soupify
 
 from ..authenticators import Authenticator
@@ -19,6 +21,7 @@ from ..conductor import TerminalConductor
 from ..config import Config
 from ..crawler import (Crawler, CrawlerSection, HttpCrawler, anoncritical,
                        arepeat)
+from ..output_dir import FileSink
 
 TargetType = Union[str, int]
 
@@ -438,6 +441,9 @@ class IliasCrawler(HttpCrawler):
         else:
             await self._crawl_url(self._target)
 
+        if self.error_free:
+            await self.cleanup()
+
     async def _crawl_course(self, course_id: int) -> None:
         # Start crawling at the given course
         root_url = _url_set_query_param(
@@ -483,7 +489,7 @@ class IliasCrawler(HttpCrawler):
         element_path = PurePath(parent_path, element.name)
 
         if element.type == IliasElementType.FILE:
-            await self._download_element(element, element_path)
+            await self._download_file(element, element_path)
         elif element.type == IliasElementType.FORUM:
             # TODO: Delete
             self.print(f"Skipping forum [green]{element_path}[/]")
@@ -491,33 +497,50 @@ class IliasCrawler(HttpCrawler):
             # TODO: Write in meta-redirect file
             self.print(f"Skipping link [green]{element_path}[/]")
         elif element.type == IliasElementType.VIDEO:
-            await self._download_element(element, element_path)
+            await self._download_file(element, element_path)
         elif element.type == IliasElementType.VIDEO_PLAYER:
-            # FIXME: Check if we should look at this and if not bail out already!
-            # This saves us a request for each video, if we skip them anyways
-            raise RuntimeError("IMPLEMENT ME")
+            await self._download_video(element, element_path)
         elif element.type in _DIRECTORY_PAGES:
             await self._handle_ilias_page(element.url, element, element_path)
         else:
             # TODO: Proper exception
             raise RuntimeError(f"Unknown type: {element.type!r}")
 
-    async def _download_element(self, element: IliasPageElement, element_path: PurePath) -> None:
+    async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None:
+        # Videos will NOT be redownloaded - their content doesn't really change and they are chunky
+        dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER)
+        if not dl:
+            return
+
+        async with self.download_bar(element_path) as bar:
+            page = IliasPage(await self._get_page(element.url), element.url, element)
+            real_element = page.get_child_elements()[0]
+
+            async with dl as sink, self.session.get(real_element.url) as resp:
+                if resp.content_length:
+                    bar.set_total(resp.content_length)
+
+                async for data in resp.content.iter_chunked(1024):
+                    sink.file.write(data)
+                    bar.advance(len(data))
+
+                sink.done()
+
+    async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
         dl = await self.download(element_path, mtime=element.mtime)
         if not dl:
             return
 
-        async with self.download_bar(element_path) as bar, dl as sink,\
-                self.session.get(element.url) as resp:
-
-            if resp.content_length:
-                bar.set_total(resp.content_length)
-
-            async for data in resp.content.iter_chunked(1024):
-                sink.file.write(data)
-                bar.advance(len(data))
-
-            sink.done()
+        async with self.download_bar(element_path) as bar:
+            async with dl as sink, self.session.get(element.url) as resp:
+                if resp.content_length:
+                    bar.set_total(resp.content_length)
+
+                async for data in resp.content.iter_chunked(1024):
+                    sink.file.write(data)
+                    bar.advance(len(data))
+
+                sink.done()
 
     async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup:
         if retries_left < 0:
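
For readers unfamiliar with the streaming pattern both download methods use: aiohttp exposes the response body as a stream, so the file is written in fixed-size chunks and the progress bar advances per chunk instead of buffering the whole video in memory. A minimal standalone sketch of that pattern outside of PFERD (the download_to_file helper and the example URL are illustrative, not part of this commit):

# Standalone illustration (not part of the commit): the chunked streaming
# pattern used by _download_file and _download_video, with a plain local file
# in place of PFERD's FileSink and a print() in place of the progress bar.
import asyncio

import aiohttp


async def download_to_file(url: str, path: str, chunk_size: int = 1024) -> None:
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            total = resp.content_length  # may be None if the server sends no length
            written = 0
            with open(path, "wb") as file:
                async for chunk in resp.content.iter_chunked(chunk_size):
                    file.write(chunk)
                    written += len(chunk)
                    if total:
                        print(f"\r{written}/{total} bytes", end="", flush=True)
            print()


if __name__ == "__main__":
    # Example invocation; replace the URL with a real video link.
    asyncio.run(download_to_file("https://example.com/video.mp4", "video.mp4"))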