Clean up logging

Paths are now (hopefully) logged consistently across all crawlers
Joscha 2021-05-23 11:30:16 +02:00
parent c88f20859a
commit 803e5628a2
8 changed files with 95 additions and 56 deletions

View File

@@ -8,7 +8,7 @@ from typing import Any, List, NoReturn, Optional, Tuple
from rich.markup import escape
from .logging import log
from .utils import prompt_yes_no
from .utils import fmt_real_path, prompt_yes_no
class ConfigLoadError(Exception):
@@ -17,7 +17,7 @@ class ConfigLoadError(Exception):
"""
def __init__(self, path: Path, reason: str):
super().__init__(f"Failed to load config from {path}")
super().__init__(f"Failed to load config from {fmt_real_path(path)}")
self.path = path
self.reason = reason
@@ -36,7 +36,7 @@ class ConfigOptionError(Exception):
class ConfigDumpError(Exception):
def __init__(self, path: Path, reason: str):
super().__init__(f"Failed to dump config to {path}")
super().__init__(f"Failed to dump config to {fmt_real_path(path)}")
self.path = path
self.reason = reason
@@ -105,7 +105,7 @@ class Config:
else:
log.explain("Using default path")
path = Config._default_path()
log.explain(f"Loading {str(path)!r}")
log.explain(f"Loading {fmt_real_path(path)}")
# Using config.read_file instead of config.read because config.read
# would just ignore a missing file and carry on.
@@ -130,8 +130,8 @@ class Config:
log.explain("Using default path")
path = self._default_path()
log.explain(f"Dumping to {str(path.absolute())!r}")
log.print(f"[bold bright_cyan]Dumping[/] to {escape(repr(str(path.absolute())))}")
log.explain(f"Dumping to {fmt_real_path(path)}")
log.print(f"[bold bright_cyan]Dumping[/] to {escape(fmt_real_path(path))}")
try:
path.parent.mkdir(parents=True, exist_ok=True)
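The effect of this hunk is easiest to see side by side. Below is a minimal sketch of the old and new log output, reusing the `fmt_real_path` definition from the utils diff at the bottom of this commit; the `/home/user` prefix is hypothetical and depends on the working directory.

```python
from pathlib import Path

def fmt_real_path(path: Path) -> str:
    # Same definition as in the utils diff at the bottom of this commit
    return repr(str(path.absolute()))

path = Path("pferd.cfg")
# Old style: repr of the path as given, which may be relative
print(f"Loading {str(path)!r}")          # Loading 'pferd.cfg'
# New style: always the absolute on-disk location, still quoted via repr
print(f"Loading {fmt_real_path(path)}")  # Loading '/home/user/pferd.cfg'
```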

View File

@@ -12,7 +12,7 @@ from .logging import ProgressBar, log
from .output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
from .report import MarkConflictError, MarkDuplicateError
from .transformer import Transformer
from .utils import ReusableAsyncContextManager
from .utils import ReusableAsyncContextManager, fmt_path
class CrawlWarning(Exception):
@@ -217,7 +217,7 @@
)
async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
log.explain_topic(f"Decision: Crawl {path}")
log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
if self._transformer.transform(path) is None:
log.explain("Answer: No")
@@ -225,7 +225,7 @@
log.explain("Answer: Yes")
desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}"
desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(path))}"
return CrawlToken(self._limiter, desc)
async def download(
@@ -235,7 +235,7 @@
redownload: Optional[Redownload] = None,
on_conflict: Optional[OnConflict] = None,
) -> Optional[DownloadToken]:
log.explain_topic(f"Decision: Download {path}")
log.explain_topic(f"Decision: Download {fmt_path(path)}")
transformed_path = self._transformer.transform(path)
if transformed_path is None:
@@ -249,7 +249,7 @@
log.explain("Answer: Yes")
desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}"
desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(path))}"
return DownloadToken(self._limiter, fs_token, desc)
async def _cleanup(self) -> None:
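`fmt_path` is simply `repr(str(path))` (see the utils diff below), so crawl and download decisions now quote pure paths uniformly without touching the filesystem. The result is still wrapped in rich's `escape()` wherever it lands inside markup, because a path can itself contain bracket characters. A small sketch with a hypothetical path:

```python
from pathlib import PurePath

from rich.markup import escape

def fmt_path(path: PurePath) -> str:
    # Same definition as in the utils diff at the bottom of this commit
    return repr(str(path))

path = PurePath("course/slides [week1].pdf")  # hypothetical remote path
print(fmt_path(path))  # 'course/slides [week1].pdf'
# escape() keeps rich from parsing [week1] as a markup tag when the
# string is used as a progress bar description:
desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(path))}"
```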

View File

@@ -1,13 +1,11 @@
import asyncio
import re
from pathlib import PurePath
# TODO In Python 3.9 and above, AsyncContextManager is deprecated
from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
import aiohttp
from aiohttp import hdrs
from bs4 import BeautifulSoup, Tag
from rich.markup import escape
from PFERD.authenticators import Authenticator
from PFERD.config import Config
@@ -17,6 +15,7 @@ from PFERD.logging import ProgressBar, log
from PFERD.output_dir import FileSink, Redownload
from PFERD.utils import soupify, url_set_query_param
from ...utils import fmt_path
from .file_templates import link_template_plain, link_template_rich
from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
@@ -86,10 +85,10 @@ def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
last_exception = e
except aiohttp.ClientConnectionError as e: # e.g. timeout, disconnect, resolve failed, etc.
last_exception = e
log.explain_topic(f"Retrying operation {escape(name)}. Retries left: {attempts - 1 - round}")
log.explain_topic(f"Retrying operation {name}. Retries left: {attempts - 1 - round}")
if last_exception:
message = f"Error in I/O Operation: {escape(str(last_exception))}"
message = f"Error in I/O Operation: {last_exception}"
raise CrawlWarning(message) from last_exception
raise CrawlError("Impossible return in ilias _iorepeat")
@@ -162,7 +161,7 @@ class KitIliasWebCrawler(HttpCrawler):
log.explain_topic("Inferred crawl target: Personal desktop")
await self._crawl_desktop()
else:
log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}")
log.explain_topic(f"Inferred crawl target: URL {self._target}")
await self._crawl_url(self._target)
async def _crawl_course(self, course_id: int) -> None:
@@ -190,9 +189,7 @@ class KitIliasWebCrawler(HttpCrawler):
if expected_id is not None:
perma_link_element: Tag = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value"):
raise CrawlError(
"Invalid course id? I didn't find anything looking like a course"
)
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
# Duplicated code, but the root page is special - we want to avoid fetching it twice!
page = IliasPage(soup, url, None)
@@ -236,7 +233,7 @@
if element.type == IliasElementType.FILE:
await self._download_file(element, element_path)
elif element.type == IliasElementType.FORUM:
log.explain_topic(f"Decision: Crawl {escape(str(element_path))}")
log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
log.explain("Forums are not supported")
log.explain("Answer: No")
elif element.type == IliasElementType.LINK:
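The `escape()` calls removed in this file wrapped text that goes to `log.explain_topic`, which allows no markup and escapes internally (see the logging diff below); escaping at the call site as well would double-escape. A minimal sketch of the difference, with hypothetical text:

```python
from rich.console import Console
from rich.markup import escape

console = Console()
text = "Decision: Crawl 'forum [archived]'"  # hypothetical plain text
# Correct: escape exactly once, inside the logging method
console.print(f"[cyan]{escape(text)}")
# Escaping at the call site too escapes the text twice; the backslash
# inserted by the first escape() then shows up in the rendered output
console.print(f"[cyan]{escape(escape(text))}")
```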

View File

@@ -3,11 +3,11 @@ from pathlib import PurePath
from typing import Optional
import aiohttp
from rich.markup import escape
from .config import Config
from .crawler import Crawler, CrawlerSection
from .logging import log
from .utils import fmt_real_path
from .version import NAME, VERSION
@@ -59,14 +59,14 @@ class HttpCrawler(Crawler):
async def _save_cookies(self) -> None:
log.explain_topic("Saving cookies")
if not self._current_cookie_jar:
log.explain("No cookie jar - save aborted")
log.explain("No cookie jar, save aborted")
return
try:
self._current_cookie_jar.save(self._cookie_jar_path)
log.explain(f"Cookies saved to {escape(str(self.COOKIE_FILE))}")
log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}")
except Exception:
log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}")
log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
async def run(self) -> None:
self._current_cookie_jar = aiohttp.CookieJar()
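Besides the formatting change, this hunk fixes which path is logged: the old messages printed the class-level `COOKIE_FILE` default rather than the actual `self._cookie_jar_path`, and the raw `log.print` warning becomes `log.warn`, which adds its own prefix and escaping. A sketch of that contract, with a stand-in `warn` and a hypothetical path:

```python
from rich.console import Console
from rich.markup import escape

console = Console()

def warn(text: str) -> None:
    # Mirrors the new Log.warn (see the logging diff below): callers pass
    # plain text, the method adds the prefix and escapes any markup
    console.print(f"[bold bright_red]Warning[/] {escape(text)}")

warn("Failed to save cookies to '/home/user/.cookies'")  # hypothetical path
```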

View File

@@ -112,6 +112,10 @@ class Log:
self.print(line)
def print(self, text: str) -> None:
"""
Print a normal message. Allows markup.
"""
if self._progress_suspended:
self._lines.append(text)
else:
@@ -120,12 +124,24 @@
# TODO Print errors (and warnings?) to stderr
def warn(self, text: str) -> None:
"""
Print a warning message. Allows no markup.
"""
self.print(f"[bold bright_red]Warning[/] {escape(text)}")
def error(self, text: str) -> None:
"""
Print an error message. Allows no markup.
"""
self.print(f"[bold bright_red]Error[/] [red]{escape(text)}")
def error_contd(self, text: str) -> None:
"""
Print further lines of an error message. Allows no markup.
"""
self.print(f"[red]{escape(text)}")
def unexpected_exception(self) -> None:
@@ -153,18 +169,34 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
""".strip())
def explain_topic(self, text: str) -> None:
"""
Print a top-level explain text. Allows no markup.
"""
if self.output_explain:
self.print(f"[cyan]{escape(text)}")
def explain(self, text: str) -> None:
"""
Print an indented explain text. Allows no markup.
"""
if self.output_explain:
self.print(f" {escape(text)}")
def action(self, text: str) -> None:
"""
Print a status update while crawling. Allows markup.
"""
if self.output_action:
self.print(text)
def report(self, text: str) -> None:
"""
Print a report after crawling. Allows markup.
"""
if self.output_report:
self.print(text)
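The new docstrings pin down a simple contract: `print`, `action`, and `report` accept rich markup, so callers must escape any interpolated text, while `warn`, `error`, `error_contd`, and the `explain*` methods take plain text and escape it themselves. A usage sketch, assuming PFERD's `log` singleton is importable and using a hypothetical path:

```python
from pathlib import PurePath

from rich.markup import escape

from PFERD.logging import log
from PFERD.utils import fmt_path

path = PurePath("videos/lecture [hd].mp4")  # hypothetical path
# Markup-enabled methods (print, action, report): the caller escapes
log.action(f"[bold bright_green]Added[/] {escape(fmt_path(path))}")
# Plain-text methods (warn, error, explain*): the method escapes
log.warn(f"Failed to download {fmt_path(path)}")
```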

View File

@@ -14,7 +14,7 @@ from rich.markup import escape
from .logging import log
from .report import Report
from .utils import ReusableAsyncContextManager, prompt_yes_no
from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no
SUFFIX_CHARS = string.ascii_lowercase + string.digits
SUFFIX_LENGTH = 6
@@ -143,7 +143,7 @@ class OutputDirectory:
self._report = Report()
def prepare(self) -> None:
log.explain_topic(f"Creating base directory at {str(self._root.absolute())!r}")
log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}")
try:
self._root.mkdir(parents=True, exist_ok=True)
@@ -159,9 +159,9 @@
"""
if ".." in path.parts:
raise OutputDirError(f"Forbidden segment '..' in path {path}")
raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}")
if "." in path.parts:
raise OutputDirError(f"Forbidden segment '.' in path {path}")
raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}")
return self._root / path
def _should_download(
@@ -213,7 +213,7 @@
) -> bool:
if on_conflict == OnConflict.PROMPT:
async with log.exclusive_output():
prompt = f"Replace {path} with remote file?"
prompt = f"Replace {fmt_path(path)} with remote file?"
return await prompt_yes_no(prompt, default=False)
elif on_conflict == OnConflict.LOCAL_FIRST:
return False
@@ -232,7 +232,7 @@
) -> bool:
if on_conflict == OnConflict.PROMPT:
async with log.exclusive_output():
prompt = f"Recursively delete {path} and replace with remote file?"
prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?"
return await prompt_yes_no(prompt, default=False)
elif on_conflict == OnConflict.LOCAL_FIRST:
return False
@@ -252,7 +252,7 @@
) -> bool:
if on_conflict == OnConflict.PROMPT:
async with log.exclusive_output():
prompt = f"Delete {parent} so remote file {path} can be downloaded?"
prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?"
return await prompt_yes_no(prompt, default=False)
elif on_conflict == OnConflict.LOCAL_FIRST:
return False
@@ -271,7 +271,7 @@
) -> bool:
if on_conflict == OnConflict.PROMPT:
async with log.exclusive_output():
prompt = f"Delete {path}?"
prompt = f"Delete {fmt_path(path)}?"
return await prompt_yes_no(prompt, default=False)
elif on_conflict == OnConflict.LOCAL_FIRST:
return False
@@ -386,10 +386,10 @@
self._update_metadata(info)
if changed:
log.action(f"[bold bright_yellow]Changed[/] {escape(str(info.path))}")
log.action(f"[bold bright_yellow]Changed[/] {escape(fmt_path(info.path))}")
self._report.change_file(info.path)
else:
log.action(f"[bold bright_green]Added[/] {escape(str(info.path))}")
log.action(f"[bold bright_green]Added[/] {escape(fmt_path(info.path))}")
self._report.add_file(info.path)
async def cleanup(self) -> None:
@@ -419,7 +419,7 @@
if await self._conflict_delete_lf(self._on_conflict, pure):
try:
path.unlink()
log.action(f"[bold bright_magenta]Deleted[/] {escape(str(path))}")
log.action(f"[bold bright_magenta]Deleted[/] {escape(fmt_path(path))}")
self._report.delete_file(pure)
except OSError:
pass
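All four `OnConflict.PROMPT` branches now build their questions with `fmt_path` and hand them to `prompt_yes_no` with `default=False`, so pressing enter keeps the local state. A minimal usage sketch with a hypothetical path:

```python
import asyncio
from pathlib import PurePath

from PFERD.utils import fmt_path, prompt_yes_no

async def confirm_replace(path: PurePath) -> bool:
    # default=False mirrors the OnConflict.PROMPT branches above:
    # an empty answer keeps the local file
    prompt = f"Replace {fmt_path(path)} with remote file?"
    return await prompt_yes_no(prompt, default=False)

# asyncio.run(confirm_replace(PurePath("report/week1.pdf")))  # hypothetical
```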

View File

@@ -10,6 +10,7 @@ from pathlib import PurePath
from typing import Dict, Optional, Union
from .logging import log
from .utils import fmt_path
class Rule(ABC):
@@ -327,7 +328,7 @@
result = rule.transform(path)
if isinstance(result, PurePath):
log.explain(f"Match! Transformed to {result}")
log.explain(f"Match! Transformed to {fmt_path(result)}")
return result
elif result: # Exclamation mark
log.explain("Match! Ignored")

View File

@@ -4,6 +4,7 @@ import sys
import threading
from abc import ABC, abstractmethod
from contextlib import AsyncExitStack
from pathlib import Path, PurePath
from types import TracebackType
from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit
@@ -34,6 +35,30 @@ async def agetpass(prompt: str) -> str:
return await in_daemon_thread(lambda: getpass.getpass(prompt))
async def prompt_yes_no(query: str, default: Optional[bool]) -> bool:
"""
Asks the user a yes/no question and returns their choice.
"""
if default is True:
query += " [Y/n] "
elif default is False:
query += " [y/N] "
else:
query += " [y/n] "
while True:
response = (await ainput(query)).strip().lower()
if response == "y":
return True
elif response == "n":
return False
elif response == "" and default is not None:
return default
print("Please answer with 'y' or 'n'.")
def soupify(data: bytes) -> bs4.BeautifulSoup:
"""
Parses HTML to a beautifulsoup object.
@@ -66,28 +91,12 @@ def url_set_query_params(url: str, params: Dict[str, str]) -> str:
return result
async def prompt_yes_no(query: str, default: Optional[bool]) -> bool:
"""
Asks the user a yes/no question and returns their choice.
"""
def fmt_path(path: PurePath) -> str:
return repr(str(path))
if default is True:
query += " [Y/n] "
elif default is False:
query += " [y/N] "
else:
query += " [y/n] "
while True:
response = (await ainput(query)).strip().lower()
if response == "y":
return True
elif response == "n":
return False
elif response == "" and default is not None:
return default
print("Please answer with 'y' or 'n'.")
def fmt_real_path(path: Path) -> str:
return repr(str(path.absolute()))
class ReusableAsyncContextManager(ABC, Generic[T]):
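Taken together, the two new helpers give every crawler one quoting convention: `fmt_path` for pure, possibly relative paths and `fmt_real_path` when the absolute on-disk location matters. A quick comparison; the output of the second call depends on the working directory:

```python
from pathlib import Path, PurePath

from PFERD.utils import fmt_path, fmt_real_path

# fmt_path quotes the path exactly as given; PurePath never touches disk
print(fmt_path(PurePath("course/slides.pdf")))  # 'course/slides.pdf'
# fmt_real_path makes the path absolute first (prepends the cwd)
print(fmt_real_path(Path("pferd.cfg")))         # e.g. '/home/user/pferd.cfg'
```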