Mirror of https://github.com/Garmelon/PFERD.git, synced 2025-10-20 00:32:33 +02:00
Fix ruff errors
@@ -1,5 +1,5 @@
+from collections.abc import Callable
 from configparser import SectionProxy
-from typing import Callable, Dict

 from ..auth import Authenticator
 from ..config import Config
@@ -13,12 +13,12 @@ CrawlerConstructor = Callable[
         str,  # Name (without the "crawl:" prefix)
         SectionProxy,  # Crawler's section of global config
         Config,  # Global config
-        Dict[str, Authenticator],  # Loaded authenticators by name
+        dict[str, Authenticator],  # Loaded authenticators by name
     ],
     Crawler,
 ]

-CRAWLERS: Dict[str, CrawlerConstructor] = {
+CRAWLERS: dict[str, CrawlerConstructor] = {
     "local": lambda n, s, c, a: LocalCrawler(n, LocalCrawlerSection(s), c),
     "ilias-web": lambda n, s, c, a: IliasWebCrawler(n, IliasWebCrawlerSection(s), c, a),
     "kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a),

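The pattern above repeats throughout the commit: ruff's pyupgrade (UP) rules replace the deprecated typing aliases (Dict, List, Set, Tuple) with the PEP 585 builtin generics and import Callable from collections.abc. A minimal sketch of the style being enforced, with illustrative names that are not part of the repository:

    from collections.abc import Callable

    # before (the style ruff flags):
    #   from typing import Callable, Dict
    #   handlers: Dict[str, Callable[[str], None]] = {}

    # after: builtin generics, Callable imported from collections.abc
    handlers: dict[str, Callable[[str], None]] = {}

    def register(name: str, handler: Callable[[str], None]) -> None:
        handlers[name] = handler
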
@@ -1,10 +1,10 @@
 import asyncio
 import os
 from abc import ABC, abstractmethod
-from collections.abc import Awaitable, Coroutine
+from collections.abc import Awaitable, Callable, Coroutine, Sequence
 from datetime import datetime
 from pathlib import Path, PurePath
-from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar
+from typing import Any, Optional, TypeVar

 from ..auth import Authenticator
 from ..config import Config, Section
@@ -116,7 +116,7 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
         return bar


-class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
+class DownloadToken(ReusableAsyncContextManager[tuple[ProgressBar, FileSink]]):
     def __init__(self, limiter: Limiter, fs_token: FileSinkToken, path: PurePath):
         super().__init__()

@@ -128,7 +128,7 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
     def path(self) -> PurePath:
         return self._path

-    async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
+    async def _on_aenter(self) -> tuple[ProgressBar, FileSink]:
         await self._stack.enter_async_context(self._limiter.limit_download())
         sink = await self._stack.enter_async_context(self._fs_token)
         # The "Downloaded ..." message is printed in the output dir, not here
@@ -205,7 +205,7 @@ class CrawlerSection(Section):
         on_windows = os.name == "nt"
         return self.s.getboolean("windows_paths", fallback=on_windows)

-    def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator:
+    def auth(self, authenticators: dict[str, Authenticator]) -> Authenticator:
         value = self.s.get("auth")
         if value is None:
             self.missing_value("auth")
@@ -262,7 +262,7 @@ class Crawler(ABC):
         return self._output_dir

     @staticmethod
-    async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
+    async def gather(awaitables: Sequence[Awaitable[Any]]) -> list[Any]:
         """
         Similar to asyncio.gather. However, in the case of an exception, all
         still running tasks are cancelled and the exception is rethrown.
@@ -394,7 +394,7 @@ class Crawler(ABC):
             log.warn("Couldn't find or load old report")
             return

-        seen: Set[PurePath] = set()
+        seen: set[PurePath] = set()
         for known in sorted(self.prev_report.found_paths):
             looking_at = list(reversed(known.parents)) + [known]
             for path in looking_at:

@@ -3,7 +3,7 @@ import http.cookies
 import ssl
 from datetime import datetime
 from pathlib import Path, PurePath
-from typing import Any, Dict, List, Optional, Tuple, cast
+from typing import Any, Optional, cast

 import aiohttp
 import certifi
@@ -43,7 +43,7 @@ class HttpCrawler(Crawler):
         self._http_timeout = section.http_timeout()

         self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
-        self._shared_cookie_jar_paths: Optional[List[Path]] = None
+        self._shared_cookie_jar_paths: Optional[list[Path]] = None
         self._shared_auth = shared_auth

         self._output_dir.register_reserved(self.COOKIE_FILE)
@@ -98,7 +98,7 @@ class HttpCrawler(Crawler):
         """
         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")

-    def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None:
+    def share_cookies(self, shared: dict[Authenticator, list[Path]]) -> None:
         if not self._shared_auth:
             return

@@ -219,7 +219,7 @@ class HttpCrawler(Crawler):
             etags[str(path)] = etag
         self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags)

-    async def _request_resource_version(self, resource_url: str) -> Tuple[Optional[str], Optional[datetime]]:
+    async def _request_resource_version(self, resource_url: str) -> tuple[Optional[str], Optional[datetime]]:
         """
         Requests the ETag and Last-Modified headers of a resource via a HEAD request.
         If no entity tag / modification date can be obtained, the according value will be None.

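The docstring above describes the general approach: a HEAD request whose ETag and Last-Modified headers are read, with None for anything that cannot be obtained. A hedged sketch of that pattern with aiohttp, not the repository's implementation and with illustrative names:

    import aiohttp
    from datetime import datetime
    from email.utils import parsedate_to_datetime
    from typing import Optional

    async def head_version(session: aiohttp.ClientSession, url: str) -> tuple[Optional[str], Optional[datetime]]:
        # Issue a HEAD request and read the two caching-related headers.
        async with session.head(url) as resp:
            etag = resp.headers.get("ETag")
            last_modified = resp.headers.get("Last-Modified")
            mtime = parsedate_to_datetime(last_modified) if last_modified else None
            return etag, mtime
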
@@ -1,5 +1,6 @@
 import asyncio
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any, Optional

 import aiohttp

@@ -15,9 +16,9 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
                 try:
                     return await f(*args, **kwargs)
                 except aiohttp.ContentTypeError:  # invalid content type
-                    raise CrawlWarning("ILIAS returned an invalid content type")
+                    raise CrawlWarning("ILIAS returned an invalid content type") from None
                 except aiohttp.TooManyRedirects:
-                    raise CrawlWarning("Got stuck in a redirect loop")
+                    raise CrawlWarning("Got stuck in a redirect loop") from None
                 except aiohttp.ClientPayloadError as e:  # encoding or not enough bytes
                     last_exception = e
                 except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc.

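The `from None` added to the two raise statements above suppresses implicit exception chaining when re-raising inside an except block: without it, the new CrawlWarning would carry the original aiohttp error as its context and the traceback would print "During handling of the above exception, another exception occurred". A small illustration of the effect, not taken from the repository:

    def parse_port(raw: str) -> int:
        try:
            return int(raw)
        except ValueError:
            # "from None" drops the implicit chain to the ValueError,
            # so only the domain-level error reaches the user.
            raise RuntimeError(f"not a valid port: {raw!r}") from None
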
@@ -297,9 +297,7 @@ class Links(Enum):
         raise ValueError("Missing switch case")

     def collection_as_one(self) -> bool:
-        if self == Links.FANCY:
-            return True
-        return False
+        return self == Links.FANCY

     def extension(self) -> Optional[str]:
         if self == Links.FANCY:
@@ -355,4 +353,4 @@ class Links(Enum):
             return Links(string)
         except ValueError:
             options = [f"'{option.value}'" for option in Links]
-            raise ValueError(f"must be one of {', '.join(options)}")
+            raise ValueError(f"must be one of {', '.join(options)}") from None

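The first hunk returns the comparison directly instead of branching to return True or False; the second is another chaining suppression. The same rewrite pattern on an illustrative function, not code from the repository:

    def is_fancy(mode: str) -> bool:
        # before: if mode == "fancy": return True, else return False
        return mode == "fancy"
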
@@ -4,7 +4,7 @@ import os
 import re
 from collections.abc import Awaitable, Coroutine
 from pathlib import PurePath
-from typing import Any, Dict, List, Literal, Optional, Set, Union, cast
+from typing import Any, Literal, Optional, cast
 from urllib.parse import urljoin

 import aiohttp
@@ -33,7 +33,7 @@ from .kit_ilias_html import (
 )
 from .shibboleth_login import ShibbolethLogin

-TargetType = Union[str, int]
+TargetType = str | int


 class LoginTypeLocal:
@@ -49,7 +49,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):

         return base_url

-    def login(self) -> Union[Literal["shibboleth"], LoginTypeLocal]:
+    def login(self) -> Literal["shibboleth"] | LoginTypeLocal:
         login_type = self.s.get("login_type")
         if not login_type:
             self.missing_value("login_type")
@@ -63,7 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):

         self.invalid_value("login_type", login_type, "Should be <shibboleth | local>")

-    def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]:
+    def tfa_auth(self, authenticators: dict[str, Authenticator]) -> Optional[Authenticator]:
         value: Optional[str] = self.s.get("tfa_auth")
         if value is None:
             return None
@@ -110,7 +110,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
         return self.s.getboolean("forums", fallback=False)


-_DIRECTORY_PAGES: Set[IliasElementType] = {
+_DIRECTORY_PAGES: set[IliasElementType] = {
     IliasElementType.EXERCISE,
     IliasElementType.EXERCISE_FILES,
     IliasElementType.EXERCISE_OVERVIEW,
@@ -122,7 +122,7 @@ _DIRECTORY_PAGES: Set[IliasElementType] = {
     IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED,
 }

-_VIDEO_ELEMENTS: Set[IliasElementType] = {
+_VIDEO_ELEMENTS: set[IliasElementType] = {
     IliasElementType.MEDIACAST_VIDEO,
     IliasElementType.MEDIACAST_VIDEO_FOLDER,
     IliasElementType.OPENCAST_VIDEO,
@@ -172,7 +172,7 @@ class IliasWebCrawler(HttpCrawler):
         name: str,
         section: IliasWebCrawlerSection,
         config: Config,
-        authenticators: Dict[str, Authenticator],
+        authenticators: dict[str, Authenticator],
     ):
         # Setting a main authenticator for cookie sharing
         auth = section.auth(authenticators)
@@ -201,7 +201,7 @@ instance's greatest bottleneck.
         self._links = section.links()
         self._videos = section.videos()
         self._forums = section.forums()
-        self._visited_urls: Dict[str, PurePath] = dict()
+        self._visited_urls: dict[str, PurePath] = dict()

     async def _run(self) -> None:
         if isinstance(self._target, int):
@@ -264,9 +264,9 @@ instance's greatest bottleneck.
         expected_course_id: Optional[int] = None,
         crawl_nested_courses: bool = False,
     ) -> None:
-        elements: List[IliasPageElement] = []
+        elements: list[IliasPageElement] = []
         # A list as variable redefinitions are not propagated to outer scopes
-        description: List[BeautifulSoup] = []
+        description: list[BeautifulSoup] = []

         @_iorepeat(3, "crawling folder")
         async def gather_elements() -> None:
@@ -309,7 +309,7 @@ instance's greatest bottleneck.

         elements.sort(key=lambda e: e.id())

-        tasks: List[Awaitable[None]] = []
+        tasks: list[Awaitable[None]] = []
         for element in elements:
             if handle := await self._handle_ilias_element(cl.path, element, crawl_nested_courses):
                 tasks.append(asyncio.create_task(handle))
@@ -340,15 +340,14 @@ instance's greatest bottleneck.
             )
             return None

-        if element.type in _VIDEO_ELEMENTS:
-            if not self._videos:
-                log.status(
-                    "[bold bright_black]",
-                    "Ignored",
-                    fmt_path(element_path),
-                    "[bright_black](enable with option 'videos')",
-                )
-                return None
+        if element.type in _VIDEO_ELEMENTS and not self._videos:
+            log.status(
+                "[bold bright_black]",
+                "Ignored",
+                fmt_path(element_path),
+                "[bright_black](enable with option 'videos')",
+            )
+            return None

         if element.type == IliasElementType.FILE:
             return await self._handle_file(element, element_path)
@@ -522,8 +521,8 @@ instance's greatest bottleneck.
             sink.file.write(rendered.encode("utf-8"))
             sink.done()

-    async def _resolve_link_target(self, export_url: str) -> Union[BeautifulSoup, Literal["none"]]:
-        async def impl() -> Optional[Union[BeautifulSoup, Literal["none"]]]:
+    async def _resolve_link_target(self, export_url: str) -> BeautifulSoup | Literal["none"]:
+        async def impl() -> Optional[BeautifulSoup | Literal["none"]]:
             async with self.session.get(export_url, allow_redirects=False) as resp:
                 # No redirect means we were authenticated
                 if hdrs.LOCATION not in resp.headers:
@@ -658,7 +657,7 @@ instance's greatest bottleneck.

     def _previous_contained_opencast_videos(
         self, element: IliasPageElement, element_path: PurePath
-    ) -> List[PurePath]:
+    ) -> list[PurePath]:
         if not self.prev_report:
             return []
         custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element))
@@ -714,7 +713,7 @@ instance's greatest bottleneck.
             add_to_report([str(self._transformer.transform(dl.path))])
             return

-        contained_video_paths: List[str] = []
+        contained_video_paths: list[str] = []

         for stream_element in stream_elements:
             video_path = dl.path.parent / stream_element.name
@@ -832,7 +831,7 @@ instance's greatest bottleneck.

         elements = parse_ilias_forum_export(soupify(export))

-        tasks: List[Awaitable[None]] = []
+        tasks: list[Awaitable[None]] = []
         for thread in elements:
             tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, thread, element.url)))

@@ -842,7 +841,7 @@ instance's greatest bottleneck.
     @anoncritical
     @_iorepeat(3, "saving forum thread")
     async def _download_forum_thread(
-        self, parent_path: PurePath, thread: Union[IliasForumThread, IliasPageElement], forum_url: str
+        self, parent_path: PurePath, thread: IliasForumThread | IliasPageElement, forum_url: str
     ) -> None:
         path = parent_path / (_sanitize_path_name(thread.name) + ".html")
         maybe_dl = await self.download(path, mtime=thread.mtime)
@@ -871,7 +870,7 @@ instance's greatest bottleneck.
     @_iorepeat(3, "crawling learning module")
     @anoncritical
     async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None:
-        elements: List[IliasLearningModulePage] = []
+        elements: list[IliasLearningModulePage] = []

         async with cl:
             log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}")
@@ -891,7 +890,7 @@ instance's greatest bottleneck.
         for index, lm_element in enumerate(elements):
             lm_element.title = f"{index:02}_{lm_element.title}"

-        tasks: List[Awaitable[None]] = []
+        tasks: list[Awaitable[None]] = []
         for index, elem in enumerate(elements):
             prev_url = elements[index - 1].title if index > 0 else None
             next_url = elements[index + 1].title if index < len(elements) - 1 else None
@@ -906,10 +905,10 @@ instance's greatest bottleneck.
         self,
         path: PurePath,
         start_url: Optional[str],
-        dir: Union[Literal["left"], Literal["right"]],
+        dir: Literal["left"] | Literal["right"],
         parent_element: IliasPageElement,
-    ) -> List[IliasLearningModulePage]:
-        elements: List[IliasLearningModulePage] = []
+    ) -> list[IliasLearningModulePage]:
+        elements: list[IliasLearningModulePage] = []

         if not start_url:
             return elements
@@ -923,10 +922,7 @@ instance's greatest bottleneck.
             page = IliasPage(soup, parent_element)
             if next := page.get_learning_module_data():
                 elements.append(next)
-                if dir == "left":
-                    next_element_url = next.previous_url
-                else:
-                    next_element_url = next.next_url
+                next_element_url = next.previous_url if dir == "left" else next.next_url
             counter += 1

         return elements
@@ -950,16 +946,10 @@ instance's greatest bottleneck.

         if prev:
             prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
-            if prev_p:
-                prev = cast(str, os.path.relpath(prev_p, my_path.parent))
-            else:
-                prev = None
+            prev = cast(str, os.path.relpath(prev_p, my_path.parent)) if prev_p else None
         if next:
             next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
-            if next_p:
-                next = cast(str, os.path.relpath(next_p, my_path.parent))
-            else:
-                next = None
+            next = cast(str, os.path.relpath(next_p, my_path.parent)) if next_p else None

         async with maybe_dl as (bar, sink):
             content = element.content
@@ -973,14 +963,13 @@ instance's greatest bottleneck.
         """
         log.explain_topic("Internalizing images")
         for elem in tag.find_all(recursive=True):
-            if elem.name == "img":
-                if src := elem.attrs.get("src", None):
-                    url = urljoin(self._base_url, cast(str, src))
-                    if not url.startswith(self._base_url):
-                        continue
-                    log.explain(f"Internalizing {url!r}")
-                    img = await self._get_authenticated(url)
-                    elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
+            if elem.name == "img" and (src := elem.attrs.get("src", None)):
+                url = urljoin(self._base_url, cast(str, src))
+                if not url.startswith(self._base_url):
+                    continue
+                log.explain(f"Internalizing {url!r}")
+                img = await self._get_authenticated(url)
+                elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
             if elem.name == "iframe" and cast(str, elem.attrs.get("src", "")).startswith("//"):
                 # For unknown reasons the protocol seems to be stripped.
                 elem.attrs["src"] = "https:" + cast(str, elem.attrs["src"])
@@ -1025,7 +1014,7 @@ instance's greatest bottleneck.
         )
         return soup

-    async def _post(self, url: str, data: dict[str, Union[str, List[str]]]) -> bytes:
+    async def _post(self, url: str, data: dict[str, str | list[str]]) -> bytes:
         form_data = aiohttp.FormData()
         for key, val in data.items():
             form_data.add_field(key, val)

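Several hunks above collapse nested ifs into a single combined condition and turn if/else assignments into conditional expressions. The same rewrites on a small, self-contained example that is not part of the crawler:

    import os.path

    def relpath_or_none(target: str | None, base: str) -> str | None:
        # conditional expression instead of an if/else assignment
        p = os.path.abspath(target) if target else None
        # one combined condition instead of two nested ifs
        if p is not None and p.startswith(base):
            return os.path.relpath(p, base)
        return None
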
@@ -1,9 +1,10 @@
 import json
 import re
+from collections.abc import Callable
 from dataclasses import dataclass
 from datetime import date, datetime, timedelta
 from enum import Enum
-from typing import Callable, Dict, Optional, Union, cast
+from typing import Optional, cast
 from urllib.parse import urljoin, urlparse

 from bs4 import BeautifulSoup, Tag
@@ -13,7 +14,7 @@ from PFERD.crawl.crawler import CrawlWarning
 from PFERD.logging import log
 from PFERD.utils import url_set_query_params

-TargetType = Union[str, int]
+TargetType = str | int


 class TypeMatcher:
@@ -308,7 +309,7 @@ class IliasPageElement:
         """

         # This checks whether we can reach a `:` without passing a `-`
-        if re.search(r"^[^-]+: ", meeting_name):
+        if re.search(r"^[^-]+: ", meeting_name):  # noqa: SIM108
             # Meeting name only contains date: "05. Jan 2000:"
             split_delimiter = ":"
         else:
@@ -331,7 +332,7 @@ class IliasPageElement:
 @dataclass
 class IliasDownloadForumData:
     url: str
-    form_data: Dict[str, Union[str, list[str]]]
+    form_data: dict[str, str | list[str]]
     empty: bool


@@ -433,21 +434,20 @@ class IliasPage:
         for p in paragraphs:
             if p.find_parent(class_=is_interesting_class):
                 continue
-            if "ilc_media_cont_MediaContainer" in p["class"]:
+            if "ilc_media_cont_MediaContainer" in p["class"] and (video := p.select_one("video")):
                 # We have an embedded video which should be downloaded by _find_mob_videos
-                if video := p.select_one("video"):
-                    url, title = self._find_mob_video_url_title(video, p)
-                    raw_html += '<div style="min-width: 100px; min-height: 100px; border: 1px solid black;'
-                    raw_html += "display: flex; justify-content: center; align-items: center;"
-                    raw_html += ' margin: 0.5rem;">'
-                    if url is not None and urlparse(url).hostname != urlparse(self._page_url).hostname:
-                        if url.startswith("//"):
-                            url = "https:" + url
-                        raw_html += f'<a href="{url}" target="_blank">External Video: {title}</a>'
-                    else:
-                        raw_html += f"Video elided. Filename: '{title}'."
-                    raw_html += "</div>\n"
-                    continue
+                url, title = self._find_mob_video_url_title(video, p)
+                raw_html += '<div style="min-width: 100px; min-height: 100px; border: 1px solid black;'
+                raw_html += "display: flex; justify-content: center; align-items: center;"
+                raw_html += ' margin: 0.5rem;">'
+                if url is not None and urlparse(url).hostname != urlparse(self._page_url).hostname:
+                    if url.startswith("//"):
+                        url = "https:" + url
+                    raw_html += f'<a href="{url}" target="_blank">External Video: {title}</a>'
+                else:
+                    raw_html += f"Video elided. Filename: '{title}'."
+                raw_html += "</div>\n"
+                continue

             # Ignore special listings (like folder groupings)
             if "ilc_section_Special" in p["class"]:
@@ -794,7 +794,7 @@ class IliasPage:

         is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None

-        if is_paginated and not self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER:
+        if is_paginated and self._page_type != IliasElementType.OPENCAST_VIDEO_FOLDER:
             # We are in stage 2 - try to break pagination
             return self._find_opencast_video_entries_paginated()

@@ -1164,6 +1164,9 @@ class IliasPage:
         """
         found_titles = []

+        if None == "hey":
+            pass
+
         outer_accordion_content: Optional[Tag] = None

         parents: list[Tag] = list(tag.parents)
@@ -1302,10 +1305,7 @@ class IliasPage:
             ),
         )
         caption_container = caption_parent.find_next_sibling("div")
-        if caption_container:
-            description = caption_container.get_text().strip()
-        else:
-            description = None
+        description = caption_container.get_text().strip() if caption_container else None

         if not typ:
             _unexpected_html_warning()
@@ -1444,9 +1444,7 @@ class IliasPage:
             return True
         # The individual video player wrapper page has nothing of the above.
         # Match it by its playerContainer.
-        if soup.select_one("#playerContainer") is not None:
-            return True
-        return False
+        return soup.select_one("#playerContainer") is not None

     @staticmethod
     def _find_date_in_text(text: str) -> Optional[datetime]:
@@ -1505,11 +1503,11 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti
         # Normalize whitespace because users
         date_str = re.sub(r"\s+", " ", date_str)

-        date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I)
-        date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I)
-        date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I)
+        date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, flags=re.I)
+        date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, flags=re.I)
+        date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, flags=re.I)
         date_str = date_str.strip()
-        for german, english in zip(german_months, english_months):
+        for german, english in zip(german_months, english_months, strict=True):
             date_str = date_str.replace(german, english)
             # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020"
             date_str = date_str.replace(english + ".", english)

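The flags=re.I change above is more than style: in re.sub(pattern, repl, string, count, flags) the fourth positional argument is count, so passing re.I positionally capped each substitution at two replacements (re.I has the integer value 2) and left the match case-sensitive. The strict=True on zip makes a length mismatch between the month lists raise instead of silently truncating. A quick stand-alone demonstration of the re.sub pitfall, not taken from the repository:

    import re

    text = "yesterday we met; YESTERDAY it rained"
    # re.I passed positionally lands in the count parameter, so matching stays case-sensitive:
    print(re.sub("yesterday", "today", text, re.I))        # replaces only the lowercase occurrence
    # the keyword form applies the flag as intended:
    print(re.sub("yesterday", "today", text, flags=re.I))  # replaces both occurrences
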
@@ -1,4 +1,4 @@
-from typing import Dict, Literal
+from typing import Literal

 from ...auth import Authenticator
 from ...config import Config
@@ -26,7 +26,7 @@ class KitIliasWebCrawler(IliasWebCrawler):
         name: str,
         section: KitIliasWebCrawlerSection,
         config: Config,
-        authenticators: Dict[str, Authenticator],
+        authenticators: dict[str, Authenticator],
     ):
         super().__init__(name, section, config, authenticators)

@@ -1,9 +1,11 @@
 import os
 import re
+from collections.abc import Awaitable, Generator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import PurePath
-from typing import Any, Awaitable, Generator, Iterable, List, Optional, Pattern, Tuple, Union, cast
+from re import Pattern
+from typing import Any, Optional, Union, cast
 from urllib.parse import urljoin

 from bs4 import BeautifulSoup, Tag
@@ -44,7 +46,7 @@ class KitIpdFile:
 @dataclass
 class KitIpdFolder:
     name: str
-    entries: List[Union[KitIpdFile, "KitIpdFolder"]]
+    entries: list[Union[KitIpdFile, "KitIpdFolder"]]

     def explain(self) -> None:
         log.explain_topic(f"Folder {self.name!r}")
@@ -68,7 +70,7 @@ class KitIpdCrawler(HttpCrawler):
         if not maybe_cl:
             return

-        tasks: List[Awaitable[None]] = []
+        tasks: list[Awaitable[None]] = []

         async with maybe_cl:
             for item in await self._fetch_items():
@@ -120,9 +122,9 @@ class KitIpdCrawler(HttpCrawler):
         async with maybe_dl as (bar, sink):
             await self._stream_from_url(file.url, element_path, sink, bar)

-    async def _fetch_items(self) -> Iterable[Union[KitIpdFile, KitIpdFolder]]:
+    async def _fetch_items(self) -> Iterable[KitIpdFile | KitIpdFolder]:
         page, url = await self.get_page()
-        elements: List[Tag] = self._find_file_links(page)
+        elements: list[Tag] = self._find_file_links(page)

         # do not add unnecessary nesting for a single <h1> heading
         drop_h1: bool = len(page.find_all(name="h1")) <= 1
@@ -151,7 +153,7 @@ class KitIpdCrawler(HttpCrawler):
         name = os.path.basename(url)
         return KitIpdFile(name, url)

-    def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> list[Tag]:
+    def _find_file_links(self, tag: Tag | BeautifulSoup) -> list[Tag]:
         return cast(list[Tag], tag.find_all(name="a", attrs={"href": self._file_regex}))

     def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
@@ -172,7 +174,7 @@ class KitIpdCrawler(HttpCrawler):

         self._add_etag_to_report(path, resp.headers.get("ETag"))

-    async def get_page(self) -> Tuple[BeautifulSoup, str]:
+    async def get_page(self) -> tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
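This file also shows the PEP 604 union spelling (Tag | BeautifulSoup instead of Union[...]) and the move from the deprecated typing.Pattern to re.Pattern. A compact sketch of the target style with illustrative names, not code from the crawler:

    from re import Pattern

    TargetType = str | int

    def describe(target: TargetType, pattern: Pattern[str] | None = None) -> tuple[str, bool]:
        matched = bool(pattern and pattern.search(str(target)))
        return str(target), matched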