Compare commits


1 Commit

Author: I-Al-Istannen
SHA1: 2d145e7c94
Message: Check for new versions at startup
Date: 2022-10-24 17:31:34 +02:00

20 changed files with 406 additions and 220 deletions


@@ -17,9 +17,9 @@ jobs:
python: ["3.9"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
- uses: actions/setup-python@v4
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python }}
@@ -45,7 +45,7 @@ jobs:
run: mv dist/pferd* dist/pferd-${{ matrix.os }}
- name: Upload binary
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v2
with:
name: Binaries
path: dist/pferd-${{ matrix.os }}
@@ -57,7 +57,7 @@ jobs:
steps:
- name: Download binaries
uses: actions/download-artifact@v3
uses: actions/download-artifact@v2
with:
name: Binaries

.gitignore (1 line changed)

@@ -2,6 +2,7 @@
/.venv/
/PFERD.egg-info/
__pycache__/
/.vscode/
# pyinstaller
/pferd.spec


@@ -1,8 +0,0 @@
{
"files.insertFinalNewline": true,
"files.trimFinalNewlines": true,
"python.formatting.provider": "autopep8",
"python.linting.enabled": true,
"python.linting.flake8Enabled": true,
"python.linting.mypyEnabled": true,
}


@@ -23,34 +23,8 @@ ambiguous situations.
## Unreleased
### Fixed
- Crawling of courses with the timeline view as the default tab
- Crawling of file and custom opencast cards
- Crawling of button cards without descriptions
## 3.4.3 - 2022-11-29
### Added
- Missing documentation for `forums` option
### Changed
- Clear up error message shown when multiple paths are found to an element
### Fixed
- IPD crawler unnecessarily appending trailing slashes
- Crawling opencast when ILIAS is set to English
## 3.4.2 - 2022-10-26
### Added
- Recognize and crawl content pages in cards
- Recognize and ignore surveys
### Fixed
- Forum crawling crashing when a thread has no messages at all
- Forum crawling crashing when parsing empty (= 0 messages) threads
- Forum crawling crashing when a forum has no threads at all
- Ilias login failing in some cases
- Crawling of paginated future meetings
- IPD crawler handling of URLs without trailing slash
## 3.4.1 - 2022-08-17


@@ -181,7 +181,6 @@ script once per day should be fine.
redirect to the actual URL. Set to a negative value to disable the automatic
redirect. (Default: `-1`)
- `videos`: Whether to download videos. (Default: `no`)
- `forums`: Whether to download forum threads. (Default: `no`)
- `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
`20.0`)
@@ -290,7 +289,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.
Example: `foo/bar --> baz`
- Doesn't match `foo`, `a/foo/bar` or `foo/baz`
- Converts `foo/bar` into `baz`
- Converts `foo/bar/wargl` into `baz/wargl`
- Converts `foo/bar/wargl` into `bar/wargl`
Example: `foo/bar --> !`
- Doesn't match `foo`, `a/foo/bar` or `foo/baz`


@@ -5,6 +5,8 @@ import os
import sys
from pathlib import Path
from PFERD.update import check_for_updates
from .auth import AuthLoadError
from .cli import PARSER, ParserLoadError, load_default_section
from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
@@ -134,6 +136,11 @@ def main() -> None:
loop.run_until_complete(asyncio.sleep(1))
loop.close()
else:
log.explain_topic("Checking for updates")
if not args.skip_update_check:
asyncio.run(check_for_updates())
else:
log.explain("Update check skipped due to configuration option")
asyncio.run(pferd.run(args.debug_transforms))
except (ConfigOptionError, AuthLoadError) as e:
log.unlock()


@@ -151,6 +151,11 @@ PARSER.add_argument(
action="version",
version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
)
PARSER.add_argument(
"--skip-update-check",
action="store_true",
help="disable automatic update checks at startup"
)
PARSER.add_argument(
"--config", "-c",
type=Path,


@@ -9,6 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, Ty
from ..auth import Authenticator
from ..config import Config, Section
from ..deduplicator import Deduplicator
from ..limiter import Limiter
from ..logging import ProgressBar, log
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
from ..report import MarkConflictError, MarkDuplicateError, Report
@@ -97,9 +98,10 @@ def anoncritical(f: AWrapped) -> AWrapped:
class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
def __init__(self, path: PurePath):
def __init__(self, limiter: Limiter, path: PurePath):
super().__init__()
self._limiter = limiter
self._path = path
@property
@@ -108,15 +110,17 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
async def _on_aenter(self) -> ProgressBar:
self._stack.callback(lambda: log.status("[bold cyan]", "Crawled", fmt_path(self._path)))
await self._stack.enter_async_context(self._limiter.limit_crawl())
bar = self._stack.enter_context(log.crawl_bar("[bold bright_cyan]", "Crawling", fmt_path(self._path)))
return bar
class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
def __init__(self, fs_token: FileSinkToken, path: PurePath):
def __init__(self, limiter: Limiter, fs_token: FileSinkToken, path: PurePath):
super().__init__()
self._limiter = limiter
self._fs_token = fs_token
self._path = path
@@ -125,6 +129,7 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
return self._path
async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
await self._stack.enter_async_context(self._limiter.limit_download())
sink = await self._stack.enter_async_context(self._fs_token)
# The "Downloaded ..." message is printed in the output dir, not here
bar = self._stack.enter_context(log.download_bar("[bold bright_cyan]", "Downloading",
@@ -230,6 +235,12 @@ class Crawler(ABC):
self.name = name
self.error_free = True
self._limiter = Limiter(
task_limit=section.tasks(),
download_limit=section.downloads(),
task_delay=section.task_delay(),
)
self._deduplicator = Deduplicator(section.windows_paths())
self._transformer = Transformer(section.transform())
@@ -277,7 +288,7 @@ class Crawler(ABC):
return None
log.explain("Answer: Yes")
return CrawlToken(path)
return CrawlToken(self._limiter, path)
async def download(
self,
@@ -302,7 +313,7 @@ class Crawler(ABC):
return None
log.explain("Answer: Yes")
return DownloadToken(fs_token, path)
return DownloadToken(self._limiter, fs_token, path)
async def _cleanup(self) -> None:
log.explain_topic("Decision: Clean up files")


@@ -1,9 +1,12 @@
import asyncio
from http.cookiejar import LWPCookieJar
import http.cookies
import ssl
from pathlib import Path, PurePath
from typing import Dict, List, Optional
from typing import Any, Dict, List, Optional
import requests
import aiohttp
import certifi
from aiohttp.client import ClientTimeout
from ..auth import Authenticator
from ..config import Config
@@ -32,9 +35,9 @@ class HttpCrawler(Crawler):
self._authentication_id = 0
self._authentication_lock = asyncio.Lock()
self._http_timeout = section.http_timeout() # TODO Use or remove
self._request_count = 0
self._http_timeout = section.http_timeout()
self._cookie_jar = LWPCookieJar()
self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
self._shared_cookie_jar_paths: Optional[List[Path]] = None
self._shared_auth = shared_auth
@@ -54,6 +57,7 @@ class HttpCrawler(Crawler):
# This should reduce the amount of requests we make: If an authentication is in progress
# all future requests wait for authentication to complete.
async with self._authentication_lock:
self._request_count += 1
return self._authentication_id
async def authenticate(self, caller_auth_id: int) -> None:
@@ -102,13 +106,32 @@ class HttpCrawler(Crawler):
self._shared_cookie_jar_paths.append(self._cookie_jar_path)
def _load_cookies_from_file(self, path: Path) -> None:
jar: Any = http.cookies.SimpleCookie()
with open(path, encoding="utf-8") as f:
for i, line in enumerate(f):
# Names of headers are case insensitive
if line[:11].lower() == "set-cookie:":
jar.load(line[11:])
else:
log.explain(f"Line {i} doesn't start with 'Set-Cookie:', ignoring it")
self._cookie_jar.update_cookies(jar)
def _save_cookies_to_file(self, path: Path) -> None:
jar: Any = http.cookies.SimpleCookie()
for morsel in self._cookie_jar:
jar[morsel.key] = morsel
with open(path, "w", encoding="utf-8") as f:
f.write(jar.output(sep="\n"))
f.write("\n") # A trailing newline is just common courtesy
def _load_cookies(self) -> None:
log.explain_topic("Loading cookies")
cookie_jar_path: Optional[Path] = None
if self._shared_cookie_jar_paths is None:
log.explain("Not sharing cookies")
log.explain("Not sharing any cookies")
cookie_jar_path = self._cookie_jar_path
else:
log.explain("Sharing cookies")
@@ -131,38 +154,46 @@ class HttpCrawler(Crawler):
log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
try:
self._cookie_jar.load(filename=str(cookie_jar_path))
self._load_cookies_from_file(cookie_jar_path)
except Exception as e:
log.explain(f"Failed to load cookies: {e}")
log.explain("Proceeding without cookies")
log.explain("Failed to load cookies")
log.explain(str(e))
def _save_cookies(self) -> None:
log.explain_topic("Saving cookies")
try:
log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
self._cookie_jar.save(filename=str(self._cookie_jar_path))
self._save_cookies_to_file(self._cookie_jar_path)
except Exception as e:
log.warn(f"Failed to save cookies: {e}")
log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
log.warn(str(e))
async def run(self) -> None:
self._request_count = 0
self._cookie_jar = aiohttp.CookieJar()
self._load_cookies()
self.session = requests.Session()
self.session.headers["User-Agent"] = f"{NAME}/{VERSION}"
# From the requests docs: "All requests code should work out of the box
# with externally provided instances of CookieJar, e.g. LWPCookieJar and
# FileCookieJar."
# https://requests.readthedocs.io/en/latest/api/#requests.cookies.RequestsCookieJar
self.session.cookies = self._cookie_jar # type: ignore
with self.session:
async with aiohttp.ClientSession(
headers={"User-Agent": f"{NAME}/{VERSION}"},
cookie_jar=self._cookie_jar,
connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())),
timeout=ClientTimeout(
# 30 minutes. No download in the history of downloads was longer than 30 minutes.
# This is enough to transfer a 600 MB file over a 3 Mib/s connection.
# Allowing an arbitrary value could be annoying for overnight batch jobs
total=15 * 60,
connect=self._http_timeout,
sock_connect=self._http_timeout,
sock_read=self._http_timeout,
)
) as session:
self.session = session
try:
await super().run()
finally:
del self.session
log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
# They are saved in authenticate, but a final save won't hurt
self._save_cookies()
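
For context, a standalone sketch (not part of the diff) of the Set-Cookie round-trip that the new _load_cookies_from_file and _save_cookies_to_file helpers perform with http.cookies.SimpleCookie; the file name and cookie value below are made up:

import http.cookies

# Saving: one "Set-Cookie: key=value" line per cookie, plus a trailing newline.
jar = http.cookies.SimpleCookie()
jar["PHPSESSID"] = "made-up-session-id"  # hypothetical cookie
with open("cookies.txt", "w", encoding="utf-8") as f:
    f.write(jar.output(sep="\n"))
    f.write("\n")

# Loading: only lines starting with "Set-Cookie:" (case-insensitive) are parsed.
loaded = http.cookies.SimpleCookie()
with open("cookies.txt", encoding="utf-8") as f:
    for line in f:
        if line[:11].lower() == "set-cookie:":
            loaded.load(line[11:])

print(loaded["PHPSESSID"].value)  # made-up-session-id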


@@ -24,7 +24,6 @@ class IliasElementType(Enum):
LINK = "link"
BOOKING = "booking"
MEETING = "meeting"
SURVEY = "survey"
VIDEO = "video"
VIDEO_PLAYER = "video_player"
VIDEO_FOLDER = "video_folder"
@@ -134,7 +133,7 @@ class IliasPage:
thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]
form_data: Dict[str, Union[str, List[str]]] = {
form_data: Dict[str, Union[str, List[str]]] = {
"thread_ids[]": thread_ids,
"selected_cmd2": "html",
"select_cmd2": "Ausführen",
@@ -158,8 +157,6 @@ class IliasPage:
if self._contains_collapsed_future_meetings():
log.explain("Requesting *all* future meetings")
return self._uncollapse_future_meetings_url()
if not self._is_content_tab_selected():
return self._select_content_page_url()
return None
def _is_forum_page(self) -> bool:
@@ -222,27 +219,6 @@ class IliasPage:
link = self._abs_url_from_link(element)
return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings")
def _is_content_tab_selected(self) -> bool:
return self._select_content_page_url() is None
def _select_content_page_url(self) -> Optional[IliasPageElement]:
tab = self._soup.find(
id="tab_view_content",
attrs={"class": lambda x: x is not None and "active" not in x}
)
# Already selected (or not found)
if not tab:
return None
link = tab.find("a")
if link:
link = self._abs_url_from_link(link)
return IliasPageElement(IliasElementType.FOLDER, link, "select content page")
_unexpected_html_warning()
log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.")
log.warn_contd("PFERD might not find content on the course's main page.")
return None
def _player_to_video(self) -> List[IliasPageElement]:
# Fetch the actual video page. This is a small wrapper page initializing a javascript
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
@@ -389,7 +365,7 @@ class IliasPage:
"""
# Video start links are marked with an "Abspielen" link
video_links: List[Tag] = self._soup.findAll(
name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
name="a", text=re.compile(r"\s*Abspielen\s*")
)
results: List[IliasPageElement] = []
@@ -708,11 +684,7 @@ class IliasPage:
"div",
attrs={"class": lambda x: x and "caption" in x},
)
caption_container = caption_parent.find_next_sibling("div")
if caption_container:
description = caption_container.getText().strip()
else:
description = None
description = caption_parent.find_next_sibling("div").getText().strip()
if not type:
_unexpected_html_warning()
@@ -742,7 +714,7 @@ class IliasPage:
icon: Tag = card_root.select_one(".il-card-repository-head .icon")
if "opencast" in icon["class"] or "xoct" in icon["class"]:
if "opencast" in icon["class"]:
return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED
if "exc" in icon["class"]:
return IliasElementType.EXERCISE
@@ -758,12 +730,6 @@ class IliasPage:
return IliasElementType.TEST
if "fold" in icon["class"]:
return IliasElementType.FOLDER
if "copa" in icon["class"]:
return IliasElementType.FOLDER
if "svy" in icon["class"]:
return IliasElementType.SURVEY
if "file" in icon["class"]:
return IliasElementType.FILE
_unexpected_html_warning()
log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")


@@ -126,6 +126,13 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
return decorator
def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]:
"""
Wraps any I/O exception in a CrawlWarning.
"""
return _iorepeat(1, name)
# Crawler control flow:
#
# crawl_desktop -+
@@ -187,7 +194,7 @@ instance's greatest bottleneck.
self._links = section.links()
self._videos = section.videos()
self._forums = section.forums()
self._visited_urls: Dict[str, PurePath] = dict()
self._visited_urls: Set[str] = set()
async def _run(self) -> None:
if isinstance(self._target, int):
@@ -219,45 +226,114 @@ instance's greatest bottleneck.
return
cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
def ensure_is_valid_course_id(parent: Optional[IliasPageElement], soup: BeautifulSoup) -> None:
if parent is None and expected_id is not None:
perma_link_element: Tag = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value"):
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
elements: List[IliasPageElement] = []
# A list as variable redefinitions are not propagated to outer scopes
description: List[BeautifulSoup] = []
await self._crawl_ilias_page(url, None, cl, ensure_is_valid_course_id)
@_iorepeat(3, "crawling url")
async def gather_elements() -> None:
elements.clear()
async with cl:
next_stage_url: Optional[str] = url
current_parent = None
# Duplicated code, but the root page is special - we want to avoid fetching it twice!
while next_stage_url:
soup = await self._get_page(next_stage_url)
if current_parent is None and expected_id is not None:
perma_link_element: Tag = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value"):
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {next_stage_url}")
page = IliasPage(soup, next_stage_url, current_parent)
if next_element := page.get_next_stage_element():
current_parent = next_element
next_stage_url = next_element.url
else:
next_stage_url = None
elements.extend(page.get_child_elements())
if description_string := page.get_description():
description.append(description_string)
# Fill up our task list with the found elements
await gather_elements()
if description:
await self._download_description(PurePath("."), description[0])
elements.sort(key=lambda e: e.id())
tasks: List[Awaitable[None]] = []
for element in elements:
if handle := await self._handle_ilias_element(PurePath("."), element):
tasks.append(asyncio.create_task(handle))
# And execute them
await self.gather(tasks)
async def _handle_ilias_page(
self,
url: str,
parent: IliasPageElement,
path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]:
maybe_cl = await self.crawl(path)
if not maybe_cl:
return None
return self._crawl_ilias_page(url, parent, maybe_cl)
@anoncritical
async def _crawl_ilias_page(
self,
url: str,
parent: Optional[IliasPageElement],
parent: IliasPageElement,
cl: CrawlToken,
next_stage_hook: Callable[[Optional[IliasPageElement], BeautifulSoup], None] = lambda a, b: None
) -> None:
async with cl:
next_stage_url: Optional[str] = url
current_parent = parent
elements: List[IliasPageElement] = []
# A list as variable redefinitions are not propagated to outer scopes
description: List[BeautifulSoup] = []
while next_stage_url:
soup = await self._get_page(next_stage_url)
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {next_stage_url}")
@_iorepeat(3, "crawling folder")
async def gather_elements() -> None:
elements.clear()
async with cl:
next_stage_url: Optional[str] = url
current_parent = parent
next_stage_hook(current_parent, soup)
while next_stage_url:
soup = await self._get_page(next_stage_url)
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {next_stage_url}")
page = IliasPage(soup, next_stage_url, current_parent)
if next_element := page.get_next_stage_element():
current_parent = next_element
next_stage_url = next_element.url
else:
next_stage_url = None
page = IliasPage(soup, next_stage_url, current_parent)
if next_element := page.get_next_stage_element():
current_parent = next_element
next_stage_url = next_element.url
else:
next_stage_url = None
elements.extend(page.get_child_elements())
if description_string := page.get_description():
description.append(description_string)
for element in sorted(page.get_child_elements(), key=lambda e: e.id()):
await self._handle_ilias_element(cl.path, element)
# Fill up our task list with the found elements
await gather_elements()
if description_string := page.get_description():
await self._download_description(cl.path, description_string)
if description:
await self._download_description(cl.path, description[0])
elements.sort(key=lambda e: e.id())
tasks: List[Awaitable[None]] = []
for element in elements:
if handle := await self._handle_ilias_element(cl.path, element):
tasks.append(asyncio.create_task(handle))
# And execute them
await self.gather(tasks)
# These decorators only apply *to this method* and *NOT* to the returned
# awaitables!
@@ -269,14 +345,12 @@ instance's greatest bottleneck.
self,
parent_path: PurePath,
element: IliasPageElement,
) -> None:
) -> Optional[Coroutine[Any, Any, None]]:
if element.url in self._visited_urls:
raise CrawlWarning(
f"Found second path to element {element.name!r} at {element.url!r}. "
+ f"First path: {fmt_path(self._visited_urls[element.url])}. "
+ f"Second path: {fmt_path(parent_path)}."
f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
)
self._visited_urls[element.url] = parent_path
self._visited_urls.add(element.url)
element_path = PurePath(parent_path, element.name)
@@ -291,7 +365,7 @@ instance's greatest bottleneck.
return None
if element.type == IliasElementType.FILE:
await self._handle_file(element, element_path)
return await self._handle_file(element, element_path)
elif element.type == IliasElementType.FORUM:
if not self._forums:
log.status(
@@ -301,36 +375,22 @@ instance's greatest bottleneck.
"[bright_black](enable with option 'forums')"
)
return None
await self._handle_forum(element, element_path)
return await self._handle_forum(element, element_path)
elif element.type == IliasElementType.TEST:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](tests contain no relevant data)"
)
return None
elif element.type == IliasElementType.SURVEY:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](surveys contain no relevant data)"
)
log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
log.explain("Tests contain no relevant files")
log.explain("Answer: No")
return None
elif element.type == IliasElementType.LINK:
await self._handle_link(element, element_path)
return await self._handle_link(element, element_path)
elif element.type == IliasElementType.BOOKING:
await self._handle_booking(element, element_path)
return await self._handle_booking(element, element_path)
elif element.type == IliasElementType.VIDEO:
await self._handle_file(element, element_path)
return await self._handle_file(element, element_path)
elif element.type == IliasElementType.VIDEO_PLAYER:
await self._handle_video(element, element_path)
return await self._handle_video(element, element_path)
elif element.type in _DIRECTORY_PAGES:
maybe_cl = await self.crawl(element_path)
if not maybe_cl:
return None
await self._crawl_ilias_page(element.url, element, maybe_cl)
return await self._handle_ilias_page(element.url, element, element_path)
else:
# This will retry it a few times, failing every time. It doesn't make any network
# requests, so that's fine.
@@ -340,7 +400,7 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> None:
) -> Optional[Coroutine[Any, Any, None]]:
log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
log.explain(f"Links type is {self._links}")
@@ -357,7 +417,7 @@ instance's greatest bottleneck.
if not maybe_dl:
return None
await self._download_link(element, link_template_maybe, maybe_dl)
return self._download_link(element, link_template_maybe, maybe_dl)
@anoncritical
@_iorepeat(3, "resolving link")
@@ -449,7 +509,7 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> None:
) -> Optional[Coroutine[Any, Any, None]]:
# Copy old mapping as it is likely still relevant
if self.prev_report:
self.report.add_custom_value(
@@ -475,7 +535,7 @@ instance's greatest bottleneck.
return None
await self._download_video(element_path, element, maybe_dl)
return self._download_video(element_path, element, maybe_dl)
def _previous_contained_videos(self, video_path: PurePath) -> List[PurePath]:
if not self.prev_report:
@@ -557,11 +617,11 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> None:
) -> Optional[Coroutine[Any, Any, None]]:
maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl:
return None
await self._download_file(element, maybe_dl)
return self._download_file(element, maybe_dl)
@anoncritical
@_iorepeat(3, "downloading file")
@@ -604,11 +664,11 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> None:
) -> Optional[Coroutine[Any, Any, None]]:
maybe_cl = await self.crawl(element_path)
if not maybe_cl:
return None
await self._crawl_forum(element, maybe_cl)
return self._crawl_forum(element, maybe_cl)
@_iorepeat(3, "crawling forum")
@anoncritical


@@ -2,7 +2,7 @@ import os
import re
from dataclasses import dataclass
from pathlib import PurePath
from typing import List, Optional, Pattern, Set, Tuple, Union
from typing import Awaitable, List, Optional, Pattern, Set, Union
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag
@@ -64,62 +64,67 @@ class KitIpdCrawler(HttpCrawler):
self._file_regex = section.link_regex()
async def _run(self) -> None:
cl = await self.crawl(PurePath("."))
if not cl:
maybe_cl = await self.crawl(PurePath("."))
if not maybe_cl:
return
async with cl:
tasks: List[Awaitable[None]] = []
async with maybe_cl:
for item in await self._fetch_items():
if isinstance(item, KitIpdFolder):
await self._crawl_folder(item)
tasks.append(self._crawl_folder(item))
else:
# Orphan files are placed in the root folder
await self._download_file(PurePath("."), item)
tasks.append(self._download_file(PurePath("."), item))
await self.gather(tasks)
async def _crawl_folder(self, folder: KitIpdFolder) -> None:
path = PurePath(folder.name)
if not await self.crawl(path):
return
for file in folder.files:
await self._download_file(path, file)
tasks = [self._download_file(path, file) for file in folder.files]
await self.gather(tasks)
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
element_path = parent / file.name
dl = await self.download(element_path)
if not dl:
maybe_dl = await self.download(element_path)
if not maybe_dl:
return
async with dl as (bar, sink):
async with maybe_dl as (bar, sink):
await self._stream_from_url(file.url, sink, bar)
async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
page, url = await self._get_page()
page = await self.get_page()
elements: List[Tag] = self._find_file_links(page)
items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
for element in elements:
folder_label = self._find_folder_label(element)
if folder_label:
folder = self._extract_folder(folder_label, url)
folder = self._extract_folder(folder_label)
if folder not in items:
items.add(folder)
folder.explain()
else:
file = self._extract_file(element, url)
file = self._extract_file(element)
items.add(file)
log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
log.explain("Attributing it to root folder")
return items
def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
files: List[KitIpdFile] = []
name = folder_tag.getText().strip()
container: Tag = folder_tag.findNextSibling(name="table")
for link in self._find_file_links(container):
files.append(self._extract_file(link, url))
files.append(self._extract_file(link))
return KitIpdFolder(name, files)
@@ -130,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
return None
return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
url = self._abs_url_from_link(url, link)
def _extract_file(self, link: Tag) -> KitIpdFile:
url = self._abs_url_from_link(link)
name = os.path.basename(url)
return KitIpdFile(name, url)
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
return tag.findAll(name="a", attrs={"href": self._file_regex})
def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
return urljoin(url, link_tag.get("href"))
def _abs_url_from_link(self, link_tag: Tag) -> str:
return urljoin(self._url, link_tag.get("href"))
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
async with self.session.get(url, allow_redirects=False) as resp:
@@ -154,12 +159,12 @@ class KitIpdCrawler(HttpCrawler):
sink.done()
async def _get_page(self) -> Tuple[BeautifulSoup, str]:
response = self.session.get(self._url)
# The web page for Algorithmen für Routenplanung contains some
# weird comments that beautifulsoup doesn't parse correctly. This
# hack enables those pages to be crawled, and should hopefully not
# cause issues on other pages.
content = re.sub(r"<!--.*?-->", "", response.text)
return soupify(content.encode("utf-8")), str(request.url)
async def get_page(self) -> BeautifulSoup:
async with self.session.get(self._url) as request:
# The web page for Algorithmen für Routenplanung contains some
# weird comments that beautifulsoup doesn't parse correctly. This
# hack enables those pages to be crawled, and should hopefully not
# cause issues on other pages.
content = (await request.read()).decode("utf-8")
content = re.sub(r"<!--.*?-->", "", content)
return soupify(content.encode("utf-8"))


@@ -71,6 +71,8 @@ class LocalCrawler(Crawler):
if not cl:
return
tasks = []
async with cl:
await asyncio.sleep(random.uniform(
0.5 * self._crawl_delay,
@@ -79,7 +81,9 @@ class LocalCrawler(Crawler):
for child in path.iterdir():
pure_child = cl.path / child.name
await self._crawl_path(child, pure_child)
tasks.append(self._crawl_path(child, pure_child))
await self.gather(tasks)
async def _crawl_file(self, path: Path, pure: PurePath) -> None:
stat = path.stat()
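
The recurring change in the crawlers above (local, KIT IPD and ILIAS) is the same concurrency pattern: collect the per-child coroutines first and run them together instead of awaiting each one in turn. A standalone sketch of that pattern, assuming Crawler.gather is essentially a wrapper around asyncio.gather:

import asyncio

async def handle_child(i: int) -> None:
    # Stand-in for _crawl_path / _download_file on one child element
    await asyncio.sleep(0.01 * i)

async def crawl_folder() -> None:
    # Before: `for i in range(5): await handle_child(i)` ran strictly sequentially.
    # After: the coroutines are collected and executed concurrently.
    tasks = [handle_child(i) for i in range(5)]
    await asyncio.gather(*tasks)

asyncio.run(crawl_folder())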

PFERD/limiter.py (new file, 97 lines)

@@ -0,0 +1,97 @@
import asyncio
import time
from contextlib import asynccontextmanager
from dataclasses import dataclass
from typing import AsyncIterator, Optional
@dataclass
class Slot:
active: bool = False
last_left: Optional[float] = None
class Limiter:
def __init__(
self,
task_limit: int,
download_limit: int,
task_delay: float
):
if task_limit <= 0:
raise ValueError("task limit must be at least 1")
if download_limit <= 0:
raise ValueError("download limit must be at least 1")
if download_limit > task_limit:
raise ValueError("download limit can't be greater than task limit")
if task_delay < 0:
raise ValueError("Task delay must not be negative")
self._slots = [Slot() for _ in range(task_limit)]
self._downloads = download_limit
self._delay = task_delay
self._condition = asyncio.Condition()
def _acquire_slot(self) -> Optional[Slot]:
for slot in self._slots:
if not slot.active:
slot.active = True
return slot
return None
async def _wait_for_slot_delay(self, slot: Slot) -> None:
if slot.last_left is not None:
delay = slot.last_left + self._delay - time.time()
if delay > 0:
await asyncio.sleep(delay)
def _release_slot(self, slot: Slot) -> None:
slot.last_left = time.time()
slot.active = False
@asynccontextmanager
async def limit_crawl(self) -> AsyncIterator[None]:
slot: Slot
async with self._condition:
while True:
if found_slot := self._acquire_slot():
slot = found_slot
break
await self._condition.wait()
await self._wait_for_slot_delay(slot)
try:
yield
finally:
async with self._condition:
self._release_slot(slot)
self._condition.notify_all()
@asynccontextmanager
async def limit_download(self) -> AsyncIterator[None]:
slot: Slot
async with self._condition:
while True:
if self._downloads <= 0:
await self._condition.wait()
continue
if found_slot := self._acquire_slot():
slot = found_slot
self._downloads -= 1
break
await self._condition.wait()
await self._wait_for_slot_delay(slot)
try:
yield
finally:
async with self._condition:
self._release_slot(slot)
self._downloads += 1
self._condition.notify_all()
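
A minimal usage sketch of the new Limiter (not part of the diff; the limits and delay are arbitrary). Crawl and download sections are wrapped in the two async context managers, which cap concurrency and enforce the per-slot delay:

import asyncio
from PFERD.limiter import Limiter

async def demo() -> None:
    limiter = Limiter(task_limit=2, download_limit=1, task_delay=0.5)

    async def crawl(i: int) -> None:
        # At most task_limit of these hold a slot at once; reusing a slot that
        # was released less than task_delay seconds ago waits out the delay first.
        async with limiter.limit_crawl():
            await asyncio.sleep(0.1)

    async def download(i: int) -> None:
        # Downloads additionally respect the separate download_limit.
        async with limiter.limit_download():
            await asyncio.sleep(0.1)

    await asyncio.gather(*(crawl(i) for i in range(4)), download(0), download(1))

asyncio.run(demo())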

PFERD/update.py (new file, 53 lines)

@@ -0,0 +1,53 @@
from dataclasses import dataclass
import ssl
from typing import Optional
import aiohttp
import certifi
from .version import NAME, VERSION
from .logging import log
@dataclass
class PferdUpdate:
release_url: str
version: str
def _build_session() -> aiohttp.ClientSession:
return aiohttp.ClientSession(
headers={"User-Agent": f"{NAME}/{VERSION}"},
connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())),
timeout=aiohttp.ClientTimeout(
total=15 * 60,
connect=10,
sock_connect=10,
sock_read=10,
)
)
async def check_for_updates() -> None:
if new_version := await get_newer_version():
log.warn(
f"{NAME} version out of date. "
+ f"You are running version {VERSION!r} but {new_version.version!r} was found on GitHub."
)
log.warn_contd(f"You can download it on GitHub: {new_version.release_url}")
else:
log.explain("No update found")
async def get_newer_version() -> Optional[PferdUpdate]:
async with _build_session() as session:
async with session.get(
"https://api.github.com/repos/Garmelon/Pferd/releases/latest",
headers={"Accept": "application/vnd.github+json"}
) as response:
release_information = await response.json()
tag_name: str = release_information["tag_name"]
tag_name = tag_name.removeprefix("v")
if VERSION == tag_name:
return None
return PferdUpdate(release_url=release_information["html_url"], version=tag_name)
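
A short sketch of how the new module is driven at startup (this mirrors the __main__.py and CLI hunks above; the flag value is hard-coded here for illustration):

import asyncio
from PFERD.update import check_for_updates, get_newer_version

async def startup(skip_update_check: bool) -> None:
    if skip_update_check:
        return
    # Warns via the log if GitHub's latest release tag differs from the running VERSION.
    await check_for_updates()

asyncio.run(startup(skip_update_check=False))

# To inspect the result directly instead of logging it:
update = asyncio.run(get_newer_version())
if update is not None:
    print(update.version, update.release_url)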


@@ -92,32 +92,17 @@ def url_set_query_params(url: str, params: Dict[str, str]) -> str:
def str_path(path: PurePath) -> str:
"""
Turn a path into a string, in a platform-independent way.
This function always uses "/" as path separator, even on Windows.
"""
if not path.parts:
return "."
return "/".join(path.parts)
def fmt_path(path: PurePath) -> str:
"""
Turn a path into a delimited string.
This is useful if file or directory names contain weird characters like
newlines, leading/trailing whitespace or unprintable characters. This way,
they are escaped and visible to the user.
"""
return repr(str_path(path))
def fmt_real_path(path: Path) -> str:
"""
Like fmt_path, but resolves the path before converting it to a string.
"""
return fmt_path(path.absolute())
return repr(str(path.absolute()))
class ReusableAsyncContextManager(ABC, Generic[T]):


@@ -1,2 +1,2 @@
NAME = "PFERD"
VERSION = "3.4.3"
VERSION = "3.4.1"


@@ -30,10 +30,7 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
Unofficial packages are available for:
- [AUR](https://aur.archlinux.org/packages/pferd)
- [brew](https://formulae.brew.sh/formula/pferd)
- [conda-forge](https://github.com/conda-forge/pferd-feedstock)
- [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
- [PyPi](https://pypi.org/project/pferd)
See also PFERD's [repology page](https://repology.org/project/pferd/versions).


@@ -14,4 +14,4 @@ pip install --editable .
# Installing tools and type hints
pip install --upgrade mypy flake8 autopep8 isort pyinstaller
mypy PFERD --install-types --non-interactive
pip install --upgrade types-chardet types-certifi


@@ -11,7 +11,6 @@ install_requires =
rich>=11.0.0
keyring>=23.5.0
certifi>=2021.10.8
requests>=2.28.1
[options.entry_points]
console_scripts =