Clean up logging

Paths are now (hopefully) logged consistently across all crawlers
Joscha 2021-05-23 11:30:16 +02:00
parent c88f20859a
commit 803e5628a2
8 changed files with 95 additions and 56 deletions
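
All crawlers now format paths through two small helpers instead of ad-hoc str(path), repr() and escape() calls. Their definitions appear verbatim in the utils diff at the end of this commit; as a quick orientation, this is what they do (the absolute path in the second comment assumes the script runs from /home/user, a hypothetical directory):

from pathlib import Path, PurePath

def fmt_path(path: PurePath) -> str:
    return repr(str(path))

def fmt_real_path(path: Path) -> str:
    return repr(str(path.absolute()))

print(fmt_path(PurePath("course/lecture 01.pdf")))  # 'course/lecture 01.pdf'
print(fmt_real_path(Path("pferd.cfg")))             # '/home/user/pferd.cfg'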

View File

@@ -8,7 +8,7 @@ from typing import Any, List, NoReturn, Optional, Tuple
 from rich.markup import escape
 from .logging import log
-from .utils import prompt_yes_no
+from .utils import fmt_real_path, prompt_yes_no
 class ConfigLoadError(Exception):
@@ -17,7 +17,7 @@ class ConfigLoadError(Exception):
     """
     def __init__(self, path: Path, reason: str):
-        super().__init__(f"Failed to load config from {path}")
+        super().__init__(f"Failed to load config from {fmt_real_path(path)}")
         self.path = path
         self.reason = reason
@@ -36,7 +36,7 @@ class ConfigOptionError(Exception):
 class ConfigDumpError(Exception):
     def __init__(self, path: Path, reason: str):
-        super().__init__(f"Failed to dump config to {path}")
+        super().__init__(f"Failed to dump config to {fmt_real_path(path)}")
         self.path = path
         self.reason = reason
@@ -105,7 +105,7 @@ class Config:
         else:
            log.explain("Using default path")
            path = Config._default_path()
-        log.explain(f"Loading {str(path)!r}")
+        log.explain(f"Loading {fmt_real_path(path)}")
         # Using config.read_file instead of config.read because config.read
         # would just ignore a missing file and carry on.
@@ -130,8 +130,8 @@ class Config:
            log.explain("Using default path")
            path = self._default_path()
-        log.explain(f"Dumping to {str(path.absolute())!r}")
-        log.print(f"[bold bright_cyan]Dumping[/] to {escape(repr(str(path.absolute())))}")
+        log.explain(f"Dumping to {fmt_real_path(path)}")
+        log.print(f"[bold bright_cyan]Dumping[/] to {escape(fmt_real_path(path))}")
         try:
            path.parent.mkdir(parents=True, exist_ok=True)

View File

@@ -12,7 +12,7 @@ from .logging import ProgressBar, log
 from .output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
 from .report import MarkConflictError, MarkDuplicateError
 from .transformer import Transformer
-from .utils import ReusableAsyncContextManager
+from .utils import ReusableAsyncContextManager, fmt_path
 class CrawlWarning(Exception):
@@ -217,7 +217,7 @@ class Crawler(ABC):
     )
     async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
-        log.explain_topic(f"Decision: Crawl {path}")
+        log.explain_topic(f"Decision: Crawl {fmt_path(path)}")
         if self._transformer.transform(path) is None:
            log.explain("Answer: No")
@@ -225,7 +225,7 @@
         log.explain("Answer: Yes")
-        desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}"
+        desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(path))}"
         return CrawlToken(self._limiter, desc)
     async def download(
@@ -235,7 +235,7 @@
         redownload: Optional[Redownload] = None,
         on_conflict: Optional[OnConflict] = None,
     ) -> Optional[DownloadToken]:
-        log.explain_topic(f"Decision: Download {path}")
+        log.explain_topic(f"Decision: Download {fmt_path(path)}")
         transformed_path = self._transformer.transform(path)
         if transformed_path is None:
@@ -249,7 +249,7 @@
         log.explain("Answer: Yes")
-        desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}"
+        desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(path))}"
         return DownloadToken(self._limiter, fs_token, desc)
     async def _cleanup(self) -> None:
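
The practical effect of fmt_path on the decision logs above: paths are now quoted, so stray whitespace or markup-like characters in remote file names become visible. A small before/after sketch (the file name is hypothetical):

from pathlib import PurePath

def fmt_path(path: PurePath) -> str:  # as defined in the utils diff below
    return repr(str(path))

path = PurePath("foo/bar .pdf")  # note the space before the extension
print(f"Decision: Crawl {path}")            # old: Decision: Crawl foo/bar .pdf
print(f"Decision: Crawl {fmt_path(path)}")  # new: Decision: Crawl 'foo/bar .pdf'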

View File

@@ -1,13 +1,11 @@
 import asyncio
 import re
 from pathlib import PurePath
-# TODO In Python 3.9 and above, AsyncContextManager is deprecated
 from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
 import aiohttp
 from aiohttp import hdrs
 from bs4 import BeautifulSoup, Tag
-from rich.markup import escape
 from PFERD.authenticators import Authenticator
 from PFERD.config import Config
@@ -17,6 +15,7 @@ from PFERD.logging import ProgressBar, log
 from PFERD.output_dir import FileSink, Redownload
 from PFERD.utils import soupify, url_set_query_param
+from ...utils import fmt_path
 from .file_templates import link_template_plain, link_template_rich
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
@@ -86,10 +85,10 @@ def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
                    last_exception = e
                except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect, resolve failed, etc.
                    last_exception = e
-                log.explain_topic(f"Retrying operation {escape(name)}. Retries left: {attempts - 1 - round}")
+                log.explain_topic(f"Retrying operation {name}. Retries left: {attempts - 1 - round}")
            if last_exception:
-                message = f"Error in I/O Operation: {escape(str(last_exception))}"
+                message = f"Error in I/O Operation: {last_exception}"
                raise CrawlWarning(message) from last_exception
            raise CrawlError("Impossible return in ilias _iorepeat")
@@ -162,7 +161,7 @@ class KitIliasWebCrawler(HttpCrawler):
            log.explain_topic("Inferred crawl target: Personal desktop")
            await self._crawl_desktop()
        else:
-            log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}")
+            log.explain_topic(f"Inferred crawl target: URL {self._target}")
            await self._crawl_url(self._target)
    async def _crawl_course(self, course_id: int) -> None:
@@ -190,9 +189,7 @@ class KitIliasWebCrawler(HttpCrawler):
        if expected_id is not None:
            perma_link_element: Tag = soup.find(id="current_perma_link")
            if not perma_link_element or "crs_" not in perma_link_element.get("value"):
-                raise CrawlError(
-                    "Invalid course id? I didn't find anything looking like a course"
-                )
+                raise CrawlError("Invalid course id? Didn't find anything looking like a course")
        # Duplicated code, but the root page is special - we want to void fetching it twice!
        page = IliasPage(soup, url, None)
@@ -236,7 +233,7 @@ class KitIliasWebCrawler(HttpCrawler):
        if element.type == IliasElementType.FILE:
            await self._download_file(element, element_path)
        elif element.type == IliasElementType.FORUM:
-            log.explain_topic(f"Decision: Crawl {escape(str(element_path))}")
+            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
            log.explain("Forums are not supported")
            log.explain("Answer: No")
        elif element.type == IliasElementType.LINK:
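
Only the changed lines of the _iorepeat decorator are visible in the hunk above. For context, a rough sketch of the retry pattern it implements; the control flow outside the hunk is an assumption, and CrawlWarning/log.explain_topic are swapped for RuntimeError/print to keep the sketch self-contained:

import aiohttp

def _iorepeat_sketch(attempts: int, name: str):
    def decorator(f):
        async def wrapper(*args, **kwargs):
            last_exception = None
            for round in range(attempts):  # 'round' mirrors the hunk's variable name
                try:
                    return await f(*args, **kwargs)
                except aiohttp.ClientConnectionError as e:  # e.g. timeout, disconnect
                    last_exception = e
                print(f"Retrying operation {name}. Retries left: {attempts - 1 - round}")
            # PFERD raises CrawlWarning(message) from last_exception at this point
            raise RuntimeError(f"Error in I/O Operation: {last_exception}") from last_exception
        return wrapper
    return decorator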

View File

@@ -3,11 +3,11 @@ from pathlib import PurePath
 from typing import Optional
 import aiohttp
-from rich.markup import escape
 from .config import Config
 from .crawler import Crawler, CrawlerSection
 from .logging import log
+from .utils import fmt_real_path
 from .version import NAME, VERSION
@@ -59,14 +59,14 @@ class HttpCrawler(Crawler):
     async def _save_cookies(self) -> None:
        log.explain_topic("Saving cookies")
        if not self._current_cookie_jar:
-            log.explain("No cookie jar - save aborted")
+            log.explain("No cookie jar, save aborted")
            return
        try:
            self._current_cookie_jar.save(self._cookie_jar_path)
-            log.explain(f"Cookies saved to {escape(str(self.COOKIE_FILE))}")
+            log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}")
        except Exception:
-            log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}")
+            log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
    async def run(self) -> None:
        self._current_cookie_jar = aiohttp.CookieJar()

View File

@@ -112,6 +112,10 @@ class Log:
            self.print(line)
    def print(self, text: str) -> None:
+        """
+        Print a normal message. Allows markup.
+        """
        if self._progress_suspended:
            self._lines.append(text)
        else:
@@ -120,12 +124,24 @@
    # TODO Print errors (and warnings?) to stderr
    def warn(self, text: str) -> None:
+        """
+        Print a warning message. Allows no markup.
+        """
        self.print(f"[bold bright_red]Warning[/] {escape(text)}")
    def error(self, text: str) -> None:
+        """
+        Print an error message. Allows no markup.
+        """
        self.print(f"[bold bright_red]Error[/] [red]{escape(text)}")
    def error_contd(self, text: str) -> None:
+        """
+        Print further lines of an error message. Allows no markup.
+        """
        self.print(f"[red]{escape(text)}")
    def unexpected_exception(self) -> None:
@@ -153,18 +169,34 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new
 """.strip())
    def explain_topic(self, text: str) -> None:
+        """
+        Print a top-level explain text. Allows no markup.
+        """
        if self.output_explain:
            self.print(f"[cyan]{escape(text)}")
    def explain(self, text: str) -> None:
+        """
+        Print an indented explain text. Allows no markup.
+        """
        if self.output_explain:
            self.print(f" {escape(text)}")
    def action(self, text: str) -> None:
+        """
+        Print a status update while crawling. Allows markup.
+        """
        if self.output_action:
            self.print(text)
    def report(self, text: str) -> None:
+        """
+        Print a report after crawling. Allows markup.
+        """
        if self.output_report:
            self.print(text)
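
The new docstrings make the markup convention explicit: print(), action() and report() pass text straight to rich, while warn(), error() and the explain methods escape it first, so bracketed text in a path cannot be misread as markup. An illustration using rich directly (not PFERD code; the file name is hypothetical):

from rich.console import Console
from rich.markup import escape

console = Console()
text = "file [bold]name[/].pdf"  # path containing markup-like brackets
console.print(f"[cyan]{escape(text)}")  # "no markup" style: brackets printed literally
console.print(f"[bold bright_cyan]Crawling[/] {escape(text)}")  # markup only in the fixed prefix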

View File

@@ -14,7 +14,7 @@ from rich.markup import escape
 from .logging import log
 from .report import Report
-from .utils import ReusableAsyncContextManager, prompt_yes_no
+from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no
 SUFFIX_CHARS = string.ascii_lowercase + string.digits
 SUFFIX_LENGTH = 6
@@ -143,7 +143,7 @@ class OutputDirectory:
        self._report = Report()
    def prepare(self) -> None:
-        log.explain_topic(f"Creating base directory at {str(self._root.absolute())!r}")
+        log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}")
        try:
            self._root.mkdir(parents=True, exist_ok=True)
@@ -159,9 +159,9 @@
        """
        if ".." in path.parts:
-            raise OutputDirError(f"Forbidden segment '..' in path {path}")
+            raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}")
        if "." in path.parts:
-            raise OutputDirError(f"Forbidden segment '.' in path {path}")
+            raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}")
        return self._root / path
    def _should_download(
@@ -213,7 +213,7 @@ class OutputDirectory:
    ) -> bool:
        if on_conflict == OnConflict.PROMPT:
            async with log.exclusive_output():
-                prompt = f"Replace {path} with remote file?"
+                prompt = f"Replace {fmt_path(path)} with remote file?"
                return await prompt_yes_no(prompt, default=False)
        elif on_conflict == OnConflict.LOCAL_FIRST:
            return False
@@ -232,7 +232,7 @@
    ) -> bool:
        if on_conflict == OnConflict.PROMPT:
            async with log.exclusive_output():
-                prompt = f"Recursively delete {path} and replace with remote file?"
+                prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?"
                return await prompt_yes_no(prompt, default=False)
        elif on_conflict == OnConflict.LOCAL_FIRST:
            return False
@@ -252,7 +252,7 @@
    ) -> bool:
        if on_conflict == OnConflict.PROMPT:
            async with log.exclusive_output():
-                prompt = f"Delete {parent} so remote file {path} can be downloaded?"
+                prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?"
                return await prompt_yes_no(prompt, default=False)
        elif on_conflict == OnConflict.LOCAL_FIRST:
            return False
@@ -271,7 +271,7 @@
    ) -> bool:
        if on_conflict == OnConflict.PROMPT:
            async with log.exclusive_output():
-                prompt = f"Delete {path}?"
+                prompt = f"Delete {fmt_path(path)}?"
                return await prompt_yes_no(prompt, default=False)
        elif on_conflict == OnConflict.LOCAL_FIRST:
            return False
@@ -386,10 +386,10 @@
            self._update_metadata(info)
        if changed:
-            log.action(f"[bold bright_yellow]Changed[/] {escape(str(info.path))}")
+            log.action(f"[bold bright_yellow]Changed[/] {escape(fmt_path(info.path))}")
            self._report.change_file(info.path)
        else:
-            log.action(f"[bold bright_green]Added[/] {escape(str(info.path))}")
+            log.action(f"[bold bright_green]Added[/] {escape(fmt_path(info.path))}")
            self._report.add_file(info.path)
    async def cleanup(self) -> None:
@@ -419,7 +419,7 @@
                if await self._conflict_delete_lf(self._on_conflict, pure):
                    try:
                        path.unlink()
-                        log.action(f"[bold bright_magenta]Deleted[/] {escape(str(path))}")
+                        log.action(f"[bold bright_magenta]Deleted[/] {escape(fmt_path(path))}")
                        self._report.delete_file(pure)
                    except OSError:
                        pass
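
All four PROMPT branches above delegate to prompt_yes_no, whose definition moves within the utils module in the last file of this diff. A usage sketch (the path is hypothetical; default=False means a plain Enter keeps the local file):

import asyncio

from PFERD.utils import prompt_yes_no

async def main() -> None:
    prompt = f"Replace {'foo/bar.pdf'!r} with remote file?"
    if await prompt_yes_no(prompt, default=False):
        print("Downloading remote file")
    else:
        print("Keeping local file")

asyncio.run(main())  # asks: Replace 'foo/bar.pdf' with remote file? [y/N]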

View File

@@ -10,6 +10,7 @@ from pathlib import PurePath
 from typing import Dict, Optional, Union
 from .logging import log
+from .utils import fmt_path
 class Rule(ABC):
@@ -327,7 +328,7 @@ class Transformer:
            result = rule.transform(path)
            if isinstance(result, PurePath):
-                log.explain(f"Match! Transformed to {result}")
+                log.explain(f"Match! Transformed to {fmt_path(result)}")
                return result
            elif result:  # Exclamation mark
                log.explain("Match! Ignored")

View File

@@ -4,6 +4,7 @@ import sys
 import threading
 from abc import ABC, abstractmethod
 from contextlib import AsyncExitStack
+from pathlib import Path, PurePath
 from types import TracebackType
 from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar
 from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit
@@ -34,6 +35,30 @@ async def agetpass(prompt: str) -> str:
     return await in_daemon_thread(lambda: getpass.getpass(prompt))
+async def prompt_yes_no(query: str, default: Optional[bool]) -> bool:
+    """
+    Asks the user a yes/no question and returns their choice.
+    """
+    if default is True:
+        query += " [Y/n] "
+    elif default is False:
+        query += " [y/N] "
+    else:
+        query += " [y/n] "
+    while True:
+        response = (await ainput(query)).strip().lower()
+        if response == "y":
+            return True
+        elif response == "n":
+            return False
+        elif response == "" and default is not None:
+            return default
+        print("Please answer with 'y' or 'n'.")
 def soupify(data: bytes) -> bs4.BeautifulSoup:
     """
     Parses HTML to a beautifulsoup object.
@@ -66,28 +91,12 @@ def url_set_query_params(url: str, params: Dict[str, str]) -> str:
     return result
-async def prompt_yes_no(query: str, default: Optional[bool]) -> bool:
-    """
-    Asks the user a yes/no question and returns their choice.
-    """
-    if default is True:
-        query += " [Y/n] "
-    elif default is False:
-        query += " [y/N] "
-    else:
-        query += " [y/n] "
-    while True:
-        response = (await ainput(query)).strip().lower()
-        if response == "y":
-            return True
-        elif response == "n":
-            return False
-        elif response == "" and default is not None:
-            return default
-        print("Please answer with 'y' or 'n'.")
+def fmt_path(path: PurePath) -> str:
+    return repr(str(path))
+def fmt_real_path(path: Path) -> str:
+    return repr(str(path.absolute()))
 class ReusableAsyncContextManager(ABC, Generic[T]):