pferd/PFERD/transformer.py

# I'm sorry that this code has become a bit dense and unreadable. While
# reading, it is important to remember what True and False mean. I'd love to
# have some proper sum-types for the inputs and outputs, they'd make this code
# a lot easier to understand.

import ast
import re
from abc import ABC, abstractmethod
from pathlib import PurePath
from typing import Dict, Optional, Sequence, Union

from .logging import log
from .utils import fmt_path, str_path


class Rule(ABC):
    """
    Common interface of all transformation rules.
    """

    @abstractmethod
    def transform(self, path: PurePath) -> Union[PurePath, bool]:
        """
        Attempt to apply this rule to the given path.

        The result is one of three things:
        - a path, if the rule matched and was applied successfully
        - True, if the rule matched but resulted in an exclamation mark
        - False, if the rule didn't match at all
        """


# These rules all use a Union[T, bool] for their right side. They are passed a
# T if the arrow's right side was a normal string, True if it was an
# exclamation mark and False if it was missing entirely.

class NormalRule(Rule):
    """
    Rule that matches every path starting with the left side (compared
    segment by segment) and swaps that prefix for the right side.
    """

    def __init__(self, left: PurePath, right: Union[PurePath, bool]):
        self._left = left
        self._right = right

    def _match_prefix(self, path: PurePath) -> Optional[PurePath]:
        """
        Return whatever follows the left side in the path, or None if the
        path doesn't start with the left side.
        """

        prefix = self._left.parts
        segments = path.parts

        # If the path is shorter than the prefix, the slice is shorter as
        # well and thus can't be equal to the prefix.
        if segments[:len(prefix)] != prefix:
            return None

        return PurePath(*segments[len(prefix):])

    def transform(self, path: PurePath) -> Union[PurePath, bool]:
        rest = self._match_prefix(path)
        if rest is None:
            return False

        if isinstance(self._right, bool):
            # True (exclamation mark) is passed on as-is, False (no right
            # side) leaves the path unchanged.
            return self._right or path

        return self._right / rest


class ExactRule(Rule):
    """
    Rule that only matches paths equal to the left side and replaces them
    with the right side.
    """

    def __init__(self, left: PurePath, right: Union[PurePath, bool]):
        self._left = left
        self._right = right

    def transform(self, path: PurePath) -> Union[PurePath, bool]:
        if path != self._left:
            return False

        if isinstance(self._right, bool):
            # True (exclamation mark) is passed on as-is, False (no right
            # side) leaves the path unchanged.
            return self._right or path

        return self._right


class NameRule(Rule):
    """
    Applies a subrule to every segment of a path individually and puts the
    results back together into a single path.
    """

    def __init__(self, subrule: Rule):
        self._subrule = subrule

    def transform(self, path: PurePath) -> Union[PurePath, bool]:
        any_segment_changed = False
        rebuilt = PurePath()

        for segment in path.parts:
            outcome = self._subrule.transform(PurePath(segment))

            if isinstance(outcome, PurePath):
                any_segment_changed = True
                rebuilt = rebuilt / outcome
                continue

            if outcome:
                # As soon as the subrule ignores a single segment, the whole
                # path is ignored
                return True

            # This segment stays as it is, but other segments might still be
            # modified by the subrule
            rebuilt = rebuilt / segment

        if not any_segment_changed:
            # No segment was modified by the subrule, so the name version of
            # the subrule doesn't match either
            return False

        return rebuilt


class ReRule(Rule):
    """
    Rule whose left side is a regex that has to match the entire path (as
    converted by str_path) and whose right side is evaluated like an
    f-string.

    For every group that took part in the match, the right side can use the
    variable g<n> (the group's text) as well as i<n> and f<n> (the text
    converted to int and float, where that conversion works). Group 0 is the
    entire match.
    """

    def __init__(self, left: str, right: Union[str, bool]):
        self._left = left
        self._right = right

    def transform(self, path: PurePath) -> Union[PurePath, bool]:
        match = re.fullmatch(self._left, str_path(path))
        if not match:
            return False

        if isinstance(self._right, bool):
            # True (exclamation mark) is passed on as-is, False (no right
            # side) leaves the path unchanged.
            return self._right or path

        env: Dict[str, Union[str, int, float]] = {}

        # mypy infers List[str] for the groups, but groups that didn't
        # participate in the match are None, hence the explicit annotation.
        groups: Sequence[Optional[str]] = [match[0], *match.groups()]
        for nr, text in enumerate(groups):
            if text is None:
                continue

            env[f"g{nr}"] = text

            try:
                env[f"i{nr}"] = int(text)
            except ValueError:
                pass

            try:
                env[f"f{nr}"] = float(text)
            except ValueError:
                pass

        # NOTE: The right side is turned into an f-string and passed to eval,
        # so it can contain arbitrary expressions. Rules must only come from
        # a trusted source.
        return PurePath(eval(f"f{self._right!r}", env))


class RuleParseError(Exception):
    """
    Raised when a rule can't be parsed. Holds on to the line (and with it the
    index the error occurred at) and the reason so the error can be displayed
    nicely later on.
    """

    def __init__(self, line: "Line", reason: str):
        message = f"Error in rule on line {line.line_nr}, column {line.index}: {reason}"
        super().__init__(message)

        self.line = line
        self.reason = reason

    def pretty_print(self) -> None:
        """
        Log the error, followed by the rule and a marker pointing at the
        line's index.
        """

        line = self.line
        log.error(f"Error parsing rule on line {line.line_nr}:")
        log.error_contd(line.line)
        log.error_contd(f"{' ' * line.index}^--- {self.reason}")


class Line:
    """
    A single line of rule text combined with an index into that line, used as
    a cursor by the parsing functions.
    """

    def __init__(self, line: str, line_nr: int):
        self._line = line
        self._line_nr = line_nr
        self._index = 0

    @property
    def line(self) -> str:
        return self._line

    @property
    def line_nr(self) -> int:
        return self._line_nr

    @property
    def index(self) -> int:
        return self._index

    @index.setter
    def index(self, index: int) -> None:
        self._index = index

    def get(self) -> Optional[str]:
        """
        Return the character at the current index without advancing, or None
        if the index is not smaller than the length of the line.
        """

        return self._line[self._index] if self._index < len(self._line) else None

    def advance(self) -> None:
        """
        Move the index forward by one character.
        """

        self._index += 1

    def expect(self, string: str) -> None:
        """
        Consume the string character by character. Raises a RuleParseError at
        the first character that doesn't match; characters that matched
        before that remain consumed.
        """

        for expected in string:
            if self.get() != expected:
                raise RuleParseError(self, f"Expected {expected!r}")
            self.advance()


# Characters that mark the beginning of a quoted string literal in a rule
QUOTATION_MARKS = {"'", '"'}


def parse_string_literal(line: Line) -> str:
    """
    Parse a quoted string literal beginning at the line's current index.

    The end of the literal is found by scanning for the next unescaped
    occurrence of the opening quotation mark. The literal (including its
    quotation marks) is then evaluated via ast.literal_eval, so python's
    escape sequences apply. Afterwards, the line's index points to the
    character following the closing quotation mark.

    Raises a RuleParseError if the current character is not a quotation mark,
    if the literal is not terminated before the end of the line, or if the
    literal can't be evaluated (e. g. because of a malformed escape sequence).
    """

    # Points to first character of string literal
    start_index = line.index

    quotation_mark = line.get()
    if quotation_mark not in QUOTATION_MARKS:
        # This should never happen as long as this function is only called from
        # parse_string.
        raise RuleParseError(line, "Invalid quotation mark")
    line.advance()

    escaped = False
    while c := line.get():
        # Every character is consumed, no matter its meaning
        line.advance()

        if escaped:
            # The character after a backslash never ends the literal
            escaped = False
        elif c == "\\":
            escaped = True
        elif c == quotation_mark:
            literal = line.line[start_index:line.index]
            try:
                return ast.literal_eval(literal)
            except (SyntaxError, ValueError) as e:
                # Without this, malformed literals like "\x" would escape as
                # a raw exception instead of a proper parse error. Point the
                # error at the start of the offending literal.
                line.index = start_index
                raise RuleParseError(line, f"Invalid string literal: {e}") from e

    raise RuleParseError(line, "Expected end of string literal")


def parse_until_space_or_eol(line: Line) -> str:
    """
    Consume and return all characters up to (but not including) the next
    space or the end of the line, whichever comes first.
    """

    chars = []
    while True:
        c = line.get()
        if not c or c == " ":
            return "".join(chars)

        chars.append(c)
        line.advance()


def parse_string(line: Line) -> Union[str, bool]:
    """
    Parse one side of a rule. A side beginning with a quotation mark is
    parsed as a string literal, everything else extends to the next space or
    the end of the line.

    Returns True if the (unquoted) side is a lone exclamation mark and the
    parsed string otherwise.
    """

    if line.get() in QUOTATION_MARKS:
        return parse_string_literal(line)

    word = parse_until_space_or_eol(line)
    return True if word == "!" else word


def parse_arrow(line: Line) -> str:
    """
    Parse an arrow of the form "-NAME->" and return its (possibly empty)
    name. The name itself may contain dashes, as only a dash immediately
    followed by ">" terminates the arrow.

    Raises a RuleParseError if the line ends before the arrow does.
    """

    line.expect("-")

    name = []
    while True:
        c = line.get()
        if not c:
            raise RuleParseError(line, "Expected rest of arrow")

        if c != "-":
            name.append(c)
            line.advance()
            continue

        # A dash either starts the end of the arrow or belongs to the name,
        # depending on the character following it
        line.advance()
        following = line.get()
        if not following:
            raise RuleParseError(line, "Expected rest of arrow")

        if following == ">":
            line.advance()
            return "".join(name)

        # The following character is not consumed here, it is handled by the
        # next iteration
        name.append("-")


def parse_whitespace(line: Line) -> None:
    """
    Consume one or more spaces. The first space is mandatory and consumed via
    line.expect, all directly following spaces are skipped as well.
    """

    line.expect(" ")
    while True:
        if line.get() != " ":
            break
        line.advance()


def parse_eol(line: Line) -> None:
    """
    Ensure that the end of the line has been reached. Raises a RuleParseError
    if there are still characters left at the current index.
    """

    if line.get() is None:
        return

    raise RuleParseError(line, "Expected end of line")


def parse_rule(line: Line) -> Rule:
    """
    Parse an entire rule consisting of a left side, an arrow and an optional
    right side, and return the rule object belonging to the arrow's name.

    Raises a RuleParseError if the line is malformed, if the left side is an
    exclamation mark, if the left side of a "name" arrow consists of multiple
    segments or if the arrow's name is unknown.
    """

    # Left side
    left_start = line.index
    left = parse_string(line)
    if isinstance(left, bool):
        line.index = left_start
        raise RuleParseError(line, "Left side can't be '!'")
    left_path = PurePath(left)

    # Arrow
    parse_whitespace(line)
    arrow_start = line.index
    arrow = parse_arrow(line)

    # Right side, which stays False if the line ends directly after the arrow
    right: Union[str, bool] = False
    if line.get():
        parse_whitespace(line)
        right = parse_string(line)
    right_path: Union[PurePath, bool] = right if isinstance(right, bool) else PurePath(right)

    parse_eol(line)

    # Select the rule based on the arrow's name. The path rules receive the
    # sides as paths, the regex rules receive the raw strings.
    if arrow == "":
        return NormalRule(left_path, right_path)

    if arrow == "name":
        if len(left_path.parts) > 1:
            line.index = left_start
            raise RuleParseError(line, "SOURCE must be a single name, not multiple segments")
        return NameRule(ExactRule(left_path, right_path))

    if arrow == "exact":
        return ExactRule(left_path, right_path)

    if arrow == "re":
        return ReRule(left, right)

    if arrow == "name-re":
        return NameRule(ReRule(left, right))

    line.index = arrow_start + 1  # For nicer error message
    raise RuleParseError(line, f"Invalid arrow name {arrow!r}")


class Transformer:
    """
    An ordered list of rules that paths are run through. The first rule that
    matches a path decides what happens to it.
    """

    def __init__(self, rules: str):
        """
        Parse the rules, one per line. Lines that are empty after stripping
        surrounding whitespace are skipped.

        May raise a RuleParseError.
        """

        self._rules = []
        for line_nr, raw_line in enumerate(rules.split("\n")):
            stripped = raw_line.strip()
            if not stripped:
                continue

            self._rules.append((stripped, parse_rule(Line(stripped, line_nr))))

    def transform(self, path: PurePath) -> Optional[PurePath]:
        """
        Run the path through the rules in order. Returns the transformed path
        of the first matching rule, None if that rule ignores the path, and
        the unchanged path if no rule matches.

        A rule that raises an exception while being tested is logged as a
        warning and treated as if it didn't match.
        """

        for rule_nr, (line, rule) in enumerate(self._rules, start=1):
            log.explain(f"Testing rule {rule_nr}: {line}")

            try:
                result = rule.transform(path)
            except Exception as e:
                log.warn(f"Error while testing rule {rule_nr}: {line}")
                log.warn_contd(str(e))
                continue

            if isinstance(result, PurePath):
                log.explain(f"Match found, transformed path to {fmt_path(result)}")
                return result

            if result:  # Exclamation mark
                log.explain("Match found, path ignored")
                return None

        log.explain("No rule matched, path is unchanged")
        return path
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`# I'm sorry that this code has become a bit dense and unreadable. While`
			`# reading, it is important to remember what True and False mean. I'd love to`
			`# have some proper sum-types for the inputs and outputs, they'd make this code`
			`# a lot easier to understand.`

			`import ast`
Implement transformer 2021-04-29 09:51:25 +02:00			`import re`
			`from abc import ABC, abstractmethod`
Use PurePath instead of Path Path should only be used when we need to access the file system. For all other purposes (mainly crawling), we use PurePath instead since the paths don't correspond to paths in the local file system. 2021-04-29 16:52:00 +02:00			`from pathlib import PurePath`
Fix error when capturing group is None 2021-05-27 13:56:01 +02:00			`from typing import Dict, Optional, Sequence, Union`
Implement transformer 2021-04-29 09:51:25 +02:00
Improve transformer error handling 2021-05-22 22:38:56 +02:00			`from .logging import log`
Always use '/' as path separator for regex rules Previously, regex-matching paths on windows would, in some cases, require four backslashes ('\\\\') to escape a single path separator. That's just too much. With this commit, regex transforms now use '/' instead of '\' as path separator, meaning rules can more easily be shared between platforms (although they are not guaranteed to be 100% compatible since on Windows, '\' is still recognized as a path separator). To make rules more intuitive to write, local relative paths are now also printed with '/' as path separator on Windows. Since Windows also accepts '/' as path separator, this change doesn't really affect other rules that parse their sides as paths. 2021-06-04 18:02:45 +02:00			`from .utils import fmt_path, str_path`
Improve transformer error handling 2021-05-22 22:38:56 +02:00
Implement transformer 2021-04-29 09:51:25 +02:00
			`class Rule(ABC):`
			`@abstractmethod`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`def transform(self, path: PurePath) -> Union[PurePath, bool]:`
			`"""`
			`Try to apply this rule to the path. Returns another path if the rule`
			`was successfully applied, True if the rule matched but resulted in an`
			`exclamation mark, and False if the rule didn't match at all.`
			`"""`

Implement transformer 2021-04-29 09:51:25 +02:00			`pass`


Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`# These rules all use a Union[T, bool] for their right side. They are passed a`
			`# T if the arrow's right side was a normal string, True if it was an`
			`# exclamation mark and False if it was missing entirely.`

Implement transformer 2021-04-29 09:51:25 +02:00			`class NormalRule(Rule):`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`def __init__(self, left: PurePath, right: Union[PurePath, bool]):`

Implement transformer 2021-04-29 09:51:25 +02:00			`self._left = left`
			`self._right = right`

Use PurePath instead of Path Path should only be used when we need to access the file system. For all other purposes (mainly crawling), we use PurePath instead since the paths don't correspond to paths in the local file system. 2021-04-29 16:52:00 +02:00			`def _match_prefix(self, path: PurePath) -> Optional[PurePath]:`
Implement transformer 2021-04-29 09:51:25 +02:00			`left_parts = list(reversed(self._left.parts))`
			`path_parts = list(reversed(path.parts))`

			`if len(left_parts) > len(path_parts):`
			`return None`

			`while left_parts and path_parts:`
			`left_part = left_parts.pop()`
			`path_part = path_parts.pop()`

			`if left_part != path_part:`
			`return None`

			`if left_parts:`
			`return None`

Fix normal arrow 2021-05-22 22:44:59 +02:00			`path_parts.reverse()`
Use PurePath instead of Path Path should only be used when we need to access the file system. For all other purposes (mainly crawling), we use PurePath instead since the paths don't correspond to paths in the local file system. 2021-04-29 16:52:00 +02:00			`return PurePath(*path_parts)`
Implement transformer 2021-04-29 09:51:25 +02:00
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`def transform(self, path: PurePath) -> Union[PurePath, bool]:`
Implement transformer 2021-04-29 09:51:25 +02:00			`if rest := self._match_prefix(path):`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`if isinstance(self._right, bool):`
			`return self._right or path`
			`else:`
			`return self._right / rest`
Implement transformer 2021-04-29 09:51:25 +02:00
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`return False`
Implement transformer 2021-04-29 09:51:25 +02:00

			`class ExactRule(Rule):`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`def __init__(self, left: PurePath, right: Union[PurePath, bool]):`
Implement transformer 2021-04-29 09:51:25 +02:00			`self._left = left`
			`self._right = right`

Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`def transform(self, path: PurePath) -> Union[PurePath, bool]:`
Implement transformer 2021-04-29 09:51:25 +02:00			`if path == self._left:`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`if isinstance(self._right, bool):`
			`return self._right or path`
			`else:`
			`return self._right`
Implement transformer 2021-04-29 09:51:25 +02:00
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`return False`
Implement transformer 2021-04-29 09:51:25 +02:00

Add name variants for all arrows 2021-05-15 15:06:45 +02:00			`class NameRule(Rule):`
			`def __init__(self, subrule: Rule):`
			`self._subrule = subrule`

			`def transform(self, path: PurePath) -> Union[PurePath, bool]:`
Implement new name arrows 2021-05-27 13:42:49 +02:00			`matched = False`
			`result = PurePath()`

			`for part in path.parts:`
			`part_result = self._subrule.transform(PurePath(part))`
			`if isinstance(part_result, PurePath):`
			`matched = True`
			`result /= part_result`
			`elif part_result:`
			`# If any subrule call ignores its path segment, the entire path`
			`# should be ignored`
			`return True`
			`else:`
			`# The subrule doesn't modify this segment, but maybe other`
			`# segments`
			`result /= part`

			`if matched:`
Add name variants for all arrows 2021-05-15 15:06:45 +02:00			`return result`
Implement new name arrows 2021-05-27 13:42:49 +02:00			`else:`
			`# The subrule has modified no segments, so this name version of it`
			`# doesn't match`
			`return False`
Add name variants for all arrows 2021-05-15 15:06:45 +02:00

Implement transformer 2021-04-29 09:51:25 +02:00			`class ReRule(Rule):`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`def __init__(self, left: str, right: Union[str, bool]):`
Implement transformer 2021-04-29 09:51:25 +02:00			`self._left = left`
			`self._right = right`

Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`def transform(self, path: PurePath) -> Union[PurePath, bool]:`
Always use '/' as path separator for regex rules Previously, regex-matching paths on windows would, in some cases, require four backslashes ('\\\\') to escape a single path separator. That's just too much. With this commit, regex transforms now use '/' instead of '\' as path separator, meaning rules can more easily be shared between platforms (although they are not guaranteed to be 100% compatible since on Windows, '\' is still recognized as a path separator). To make rules more intuitive to write, local relative paths are now also printed with '/' as path separator on Windows. Since Windows also accepts '/' as path separator, this change doesn't really affect other rules that parse their sides as paths. 2021-06-04 18:02:45 +02:00			`if match := re.fullmatch(self._left, str_path(path)):`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`if isinstance(self._right, bool):`
			`return self._right or path`

			`vars: Dict[str, Union[str, int, float]] = {}`
Implement transformer 2021-04-29 09:51:25 +02:00
Fix error when capturing group is None 2021-05-27 13:56:01 +02:00			`# For some reason, mypy thinks that "groups" has type List[str].`
			`# But since elements of "match.groups()" can be None, mypy is`
			`# wrong.`
			`groups: Sequence[Optional[str]] = [match[0]] + list(match.groups())`
Implement transformer 2021-04-29 09:51:25 +02:00			`for i, group in enumerate(groups):`
Fix error when capturing group is None 2021-05-27 13:56:01 +02:00			`if group is None:`
			`continue`

Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`vars[f"g{i}"] = group`

Implement transformer 2021-04-29 09:51:25 +02:00			`try:`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`vars[f"i{i}"] = int(group)`
Implement transformer 2021-04-29 09:51:25 +02:00			`except ValueError:`
			`pass`

			`try:`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`vars[f"f{i}"] = float(group)`
Implement transformer 2021-04-29 09:51:25 +02:00			`except ValueError:`
			`pass`

Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`result = eval(f"f{self._right!r}", vars)`
			`return PurePath(result)`
Implement transformer 2021-04-29 09:51:25 +02:00
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`return False`
Implement transformer 2021-04-29 09:51:25 +02:00

Improve transformer error handling 2021-05-22 22:38:56 +02:00			`class RuleParseError(Exception):`
			`def __init__(self, line: "Line", reason: str):`
			`super().__init__(f"Error in rule on line {line.line_nr}, column {line.index}: {reason}")`

			`self.line = line`
			`self.reason = reason`
Implement transformer 2021-04-29 09:51:25 +02:00
			`def pretty_print(self) -> None:`
Improve transformer error handling 2021-05-22 22:38:56 +02:00			`log.error(f"Error parsing rule on line {self.line.line_nr}:")`
			`log.error_contd(self.line.line)`
Implement transformer 2021-04-29 09:51:25 +02:00			`spaces = " " * self.line.index`
Improve transformer error handling 2021-05-22 22:38:56 +02:00			`log.error_contd(f"{spaces}^--- {self.reason}")`
Implement transformer 2021-04-29 09:51:25 +02:00

			`class Line:`
			`def __init__(self, line: str, line_nr: int):`
			`self._line = line`
			`self._line_nr = line_nr`
			`self._index = 0`

			`def get(self) -> Optional[str]:`
			`if self._index < len(self._line):`
			`return self._line[self._index]`

			`return None`

			`@property`
			`def line(self) -> str:`
			`return self._line`

			`@property`
Fix rule error messages 2021-05-25 15:47:09 +02:00			`def line_nr(self) -> int:`
			`return self._line_nr`
Implement transformer 2021-04-29 09:51:25 +02:00
			`@property`
			`def index(self) -> int:`
			`return self._index`

			`@index.setter`
			`def index(self, index: int) -> None:`
			`self._index = index`

			`def advance(self) -> None:`
			`self._index += 1`

			`def expect(self, string: str) -> None:`
			`for char in string:`
			`if self.get() == char:`
			`self.advance()`
			`else:`
Improve transformer error handling 2021-05-22 22:38:56 +02:00			`raise RuleParseError(self, f"Expected {char!r}")`
Implement transformer 2021-04-29 09:51:25 +02:00

			`QUOTATION_MARKS = {'"', "'"}`


			`def parse_string_literal(line: Line) -> str:`
			`escaped = False`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00
			`# Points to first character of string literal`
			`start_index = line.index`
Implement transformer 2021-04-29 09:51:25 +02:00
			`quotation_mark = line.get()`
			`if quotation_mark not in QUOTATION_MARKS:`
			`# This should never happen as long as this function is only called from`
			`# parse_string.`
Improve transformer error handling 2021-05-22 22:38:56 +02:00			`raise RuleParseError(line, "Invalid quotation mark")`
Implement transformer 2021-04-29 09:51:25 +02:00			`line.advance()`

			`while c := line.get():`
			`if escaped:`
			`escaped = False`
			`line.advance()`
			`elif c == quotation_mark:`
			`line.advance()`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`stop_index = line.index`
			`literal = line.line[start_index:stop_index]`
			`return ast.literal_eval(literal)`
Implement transformer 2021-04-29 09:51:25 +02:00			`elif c == "\\":`
			`escaped = True`
			`line.advance()`
			`else:`
			`line.advance()`

Improve transformer error handling 2021-05-22 22:38:56 +02:00			`raise RuleParseError(line, "Expected end of string literal")`
Implement transformer 2021-04-29 09:51:25 +02:00

			`def parse_until_space_or_eol(line: Line) -> str:`
			`result = []`
			`while c := line.get():`
			`if c == " ":`
			`break`
			`result.append(c)`
			`line.advance()`

			`return "".join(result)`


Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`def parse_string(line: Line) -> Union[str, bool]:`
Implement transformer 2021-04-29 09:51:25 +02:00			`if line.get() in QUOTATION_MARKS:`
			`return parse_string_literal(line)`
			`else:`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`string = parse_until_space_or_eol(line)`
			`if string == "!":`
			`return True`
			`return string`
Implement transformer 2021-04-29 09:51:25 +02:00

			`def parse_arrow(line: Line) -> str:`
			`line.expect("-")`

			`name = []`
			`while True:`
Add name variants for all arrows 2021-05-15 15:06:45 +02:00			`c = line.get()`
			`if not c:`
Improve transformer error handling 2021-05-22 22:38:56 +02:00			`raise RuleParseError(line, "Expected rest of arrow")`
Add name variants for all arrows 2021-05-15 15:06:45 +02:00			`elif c == "-":`
			`line.advance()`
			`c = line.get()`
			`if not c:`
Improve transformer error handling 2021-05-22 22:38:56 +02:00			`raise RuleParseError(line, "Expected rest of arrow")`
Add name variants for all arrows 2021-05-15 15:06:45 +02:00			`elif c == ">":`
			`line.advance()`
			`break # End of arrow`
Implement transformer 2021-04-29 09:51:25 +02:00			`else:`
Add name variants for all arrows 2021-05-15 15:06:45 +02:00			`name.append("-")`
Fix arrow parsing error messages 2021-05-22 22:39:29 +02:00			`continue`
Implement transformer 2021-04-29 09:51:25 +02:00			`else:`
Add name variants for all arrows 2021-05-15 15:06:45 +02:00			`name.append(c)`

			`line.advance()`
Implement transformer 2021-04-29 09:51:25 +02:00
			`return "".join(name)`


Allow variable whitespace in arrow rules 2021-05-15 15:13:34 +02:00			`def parse_whitespace(line: Line) -> None:`
			`line.expect(" ")`
			`while line.get() == " ":`
			`line.advance()`


Fix rules not being parsed entirely 2021-05-25 15:42:46 +02:00			`def parse_eol(line: Line) -> None:`
			`if line.get() is not None:`
			`raise RuleParseError(line, "Expected end of line")`


Implement transformer 2021-04-29 09:51:25 +02:00			`def parse_rule(line: Line) -> Rule:`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`# Parse left side`
			`leftindex = line.index`
Implement transformer 2021-04-29 09:51:25 +02:00			`left = parse_string(line)`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`if isinstance(left, bool):`
			`line.index = leftindex`
Improve transformer error handling 2021-05-22 22:38:56 +02:00			`raise RuleParseError(line, "Left side can't be '!'")`
Implement new name arrows 2021-05-27 13:42:49 +02:00			`leftpath = PurePath(left)`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00
			`# Parse arrow`
Allow variable whitespace in arrow rules 2021-05-15 15:13:34 +02:00			`parse_whitespace(line)`
Implement transformer 2021-04-29 09:51:25 +02:00			`arrowindex = line.index`
			`arrowname = parse_arrow(line)`

Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`# Parse right side`
			`if line.get():`
Allow variable whitespace in arrow rules 2021-05-15 15:13:34 +02:00			`parse_whitespace(line)`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`right = parse_string(line)`
			`else:`
			`right = False`
			`rightpath: Union[PurePath, bool]`
			`if isinstance(right, bool):`
			`rightpath = right`
			`else:`
			`rightpath = PurePath(right)`

Fix rules not being parsed entirely 2021-05-25 15:42:46 +02:00			`parse_eol(line)`

Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`# Dispatch`
Implement transformer 2021-04-29 09:51:25 +02:00			`if arrowname == "":`
Implement new name arrows 2021-05-27 13:42:49 +02:00			`return NormalRule(leftpath, rightpath)`
Add name variants for all arrows 2021-05-15 15:06:45 +02:00			`elif arrowname == "name":`
Implement new name arrows 2021-05-27 13:42:49 +02:00			`if len(leftpath.parts) > 1:`
			`line.index = leftindex`
			`raise RuleParseError(line, "SOURCE must be a single name, not multiple segments")`
			`return NameRule(ExactRule(leftpath, rightpath))`
Implement transformer 2021-04-29 09:51:25 +02:00			`elif arrowname == "exact":`
Implement new name arrows 2021-05-27 13:42:49 +02:00			`return ExactRule(leftpath, rightpath)`
Implement transformer 2021-04-29 09:51:25 +02:00			`elif arrowname == "re":`
			`return ReRule(left, right)`
Add name variants for all arrows 2021-05-15 15:06:45 +02:00			`elif arrowname == "name-re":`
			`return NameRule(ReRule(left, right))`
Implement transformer 2021-04-29 09:51:25 +02:00			`else:`
			`line.index = arrowindex + 1 # For nicer error message`
Fix arrow parsing error messages 2021-05-22 22:39:29 +02:00			`raise RuleParseError(line, f"Invalid arrow name {arrowname!r}")`
Implement transformer 2021-04-29 09:51:25 +02:00

			`class Transformer:`
			`def __init__(self, rules: str):`
			`"""`
			`May throw a RuleParseException.`
			`"""`

			`self._rules = []`
			`for i, line in enumerate(rules.split("\n")):`
			`line = line.strip()`
			`if line:`
Explain crawling and partially explain downloading 2021-05-22 22:39:57 +02:00			`rule = parse_rule(Line(line, i))`
			`self._rules.append((line, rule))`
Implement transformer 2021-04-29 09:51:25 +02:00
Use PurePath instead of Path Path should only be used when we need to access the file system. For all other purposes (mainly crawling), we use PurePath instead since the paths don't correspond to paths in the local file system. 2021-04-29 16:52:00 +02:00			`def transform(self, path: PurePath) -> Optional[PurePath]:`
Explain crawling and partially explain downloading 2021-05-22 22:39:57 +02:00			`for i, (line, rule) in enumerate(self._rules):`
Number rules starting at 1 2021-05-23 10:44:18 +02:00			`log.explain(f"Testing rule {i+1}: {line}")`
Explain crawling and partially explain downloading 2021-05-22 22:39:57 +02:00
Fix error when capturing group is None 2021-05-27 13:56:01 +02:00			`try:`
			`result = rule.transform(path)`
			`except Exception as e:`
			`log.warn(f"Error while testing rule {i+1}: {line}")`
			`log.warn_contd(str(e))`
			`continue`

Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`if isinstance(result, PurePath):`
Improve transformer explain wording 2021-05-23 11:45:14 +02:00			`log.explain(f"Match found, transformed path to {fmt_path(result)}")`
Implement transformer 2021-04-29 09:51:25 +02:00			`return result`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`elif result: # Exclamation mark`
Improve transformer explain wording 2021-05-23 11:45:14 +02:00			`log.explain("Match found, path ignored")`
Elaborate on transforms and implement changes 2021-04-29 20:13:46 +02:00			`return None`
			`else:`
			`continue`
Implement transformer 2021-04-29 09:51:25 +02:00
Explain crawling and partially explain downloading 2021-05-22 22:39:57 +02:00			`log.explain("No rule matched, path is unchanged")`
Make crawlers use transformers 2021-05-15 14:03:15 +02:00			`return path`