Bump version to 3.2.0

Catch non-critical exceptions at crawler top level
Update changelog
2023-12-21 10:23:01 +01:00 · 2021-08-04 18:27:26 +00:00 · 2021-07-13 15:42:11 +02:00 · 2021-07-07 15:23:58 +02:00 · 2021-07-06 17:45:48 +02:00 · 2021-07-06 16:15:25 +02:00
9 changed files with 211 additions and 71 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,22 @@ ambiguous situations.

 ## Unreleased

+## 3.2.0 - 2021-08-04
+
+### Added
+- `--skip` command line option
+- Support for ILIAS booking objects
+
+### Changed
+- Using multiple path segments on left side of `-name->` now results in an
+  error. This was already forbidden by the documentation but silently accepted
+  by PFERD.
+- More consistent path printing in some `--explain` messages
+
+### Fixed
+- Nondeterministic name deduplication due to ILIAS reordering elements
+- More exceptions are handled properly
+
 ## 3.1.0 - 2021-06-13

 If your config file doesn't do weird things with transforms, it should continue
--- a/PFERD/main.py
+++ b/PFERD/main.py
@@ -116,7 +116,7 @@ def main() -> None:
        sys.exit()

    try:
-        pferd = Pferd(config, args.crawler)
+        pferd = Pferd(config, args.crawler, args.skip)
    except PferdLoadError as e:
        log.unlock()
        log.error(str(e))
--- a/PFERD/cli/parser.py
+++ b/PFERD/cli/parser.py
@@ -181,6 +181,14 @@ PARSER.add_argument(
    help="only execute a single crawler."
    " Can be specified multiple times to execute multiple crawlers"
 )
+PARSER.add_argument(
+    "--skip", "-S",
+    action="append",
+    type=str,
+    metavar="NAME",
+    help="don't execute this particular crawler."
+    " Can be specified multiple times to skip multiple crawlers"
+)
 PARSER.add_argument(
    "--working-dir",
    type=Path,
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@@ -56,7 +56,7 @@ def noncritical(f: Wrapped) -> Wrapped:
    return wrapper  # type: ignore


-AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])
+AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]])


 def anoncritical(f: AWrapped) -> AWrapped:
@@ -72,14 +72,14 @@ def anoncritical(f: AWrapped) -> AWrapped:
    Warning: Must only be applied to member functions of the Crawler class!
    """

-    async def wrapper(*args: Any, **kwargs: Any) -> None:
+    async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
        if not (args and isinstance(args[0], Crawler)):
            raise RuntimeError("@anoncritical must only applied to Crawler methods")

        crawler = args[0]

        try:
-            await f(*args, **kwargs)
+            return await f(*args, **kwargs)
        except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e:
            log.warn(str(e))
            crawler.error_free = False
@@ -87,6 +87,8 @@ def anoncritical(f: AWrapped) -> AWrapped:
            crawler.error_free = False
            raise

+        return None
+
    return wrapper  # type: ignore


@@ -318,6 +320,7 @@ class Crawler(ABC):
            log.explain("Warnings or errors occurred during this run")
            log.explain("Answer: No")

+    @anoncritical
    async def run(self) -> None:
        """
        Start the crawling process. Call this function if you want to use a
--- a/PFERD/crawl/ilias/kit_ilias_html.py
+++ b/PFERD/crawl/ilias/kit_ilias_html.py
@@ -22,6 +22,7 @@ class IliasElementType(Enum):
    FOLDER = "folder"
    FORUM = "forum"
    LINK = "link"
+    BOOKING = "booking"
    MEETING = "meeting"
    VIDEO = "video"
    VIDEO_PLAYER = "video_player"
@@ -37,6 +38,17 @@ class IliasPageElement:
    mtime: Optional[datetime] = None
    description: Optional[str] = None

+    def id(self) -> str:
+        regexes = [r"eid=(?P<id>[0-9a-z\-]+)", r"file_(?P<id>\d+)", r"ref_id=(?P<id>\d+)"]
+
+        for regex in regexes:
+            if match := re.search(regex, self.url):
+                return match.groupdict()["id"]
+
+        # Fall back to URL
+        log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.")
+        return self.url
+

 class IliasPage:

@@ -490,6 +502,9 @@ class IliasPage:
        if str(img_tag["src"]).endswith("icon_webr.svg"):
            return IliasElementType.LINK

+        if str(img_tag["src"]).endswith("icon_book.svg"):
+            return IliasElementType.BOOKING
+
        if str(img_tag["src"]).endswith("frm.svg"):
            return IliasElementType.FORUM

--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -12,7 +12,7 @@ from ...config import Config
 from ...logging import ProgressBar, log
 from ...output_dir import FileSink, Redownload
 from ...utils import fmt_path, soupify, url_set_query_param
-from ..crawler import CrawlError, CrawlWarning, anoncritical
+from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical
 from ..http_crawler import HttpCrawler, HttpCrawlerSection
 from .file_templates import Links
 from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement
@@ -81,17 +81,16 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([
    IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
 ])

-AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]])
+AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]])


 def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]:
    def decorator(f: AWrapped) -> AWrapped:
-        async def wrapper(*args: Any, **kwargs: Any) -> None:
+        async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
            last_exception: Optional[BaseException] = None
            for round in range(attempts):
                try:
-                    await f(*args, **kwargs)
-                    return
+                    return await f(*args, **kwargs)
                except aiohttp.ContentTypeError:  # invalid content type
                    raise CrawlWarning("ILIAS returned an invalid content type")
                except aiohttp.TooManyRedirects:
@@ -230,17 +229,34 @@ instance's greatest bottleneck.

        # Fill up our task list with the found elements
        await gather_elements()
-        tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements]
+
+        elements.sort(key=lambda e: e.id())
+
+        tasks: List[Awaitable[None]] = []
+        for element in elements:
+            if handle := await self._handle_ilias_element(PurePath("."), element):
+                tasks.append(asyncio.create_task(handle))

        # And execute them
        await self.gather(tasks)

-    async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None:
+    async def _handle_ilias_page(
+        self,
+        url: str,
+        parent: IliasPageElement,
+        path: PurePath,
+    ) -> Optional[Awaitable[None]]:
        maybe_cl = await self.crawl(path)
        if not maybe_cl:
-            return
-        cl = maybe_cl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
+            return None
+        return self._crawl_ilias_page(url, parent, maybe_cl)

+    async def _crawl_ilias_page(
+        self,
+        url: str,
+        parent: IliasPageElement,
+        cl: CrawlToken,
+    ) -> None:
        elements: List[IliasPageElement] = []

        @_iorepeat(3, "crawling folder")
@@ -252,7 +268,7 @@ instance's greatest bottleneck.

                while next_stage_url:
                    soup = await self._get_page(next_stage_url)
-                    log.explain_topic(f"Parsing HTML page for {fmt_path(path)}")
+                    log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
                    log.explain(f"URL: {next_stage_url}")
                    page = IliasPage(soup, next_stage_url, current_parent)
                    if next_element := page.get_next_stage_element():
@@ -265,7 +281,13 @@ instance's greatest bottleneck.

        # Fill up our task list with the found elements
        await gather_elements()
-        tasks = [self._handle_ilias_element(cl.path, element) for element in elements]
+
+        elements.sort(key=lambda e: e.id())
+
+        tasks: List[Awaitable[None]] = []
+        for element in elements:
+            if handle := await self._handle_ilias_element(cl.path, element):
+                tasks.append(asyncio.create_task(handle))

        # And execute them
        await self.gather(tasks)
@@ -274,7 +296,11 @@ instance's greatest bottleneck.
    # Shouldn't happen but we also really don't want to let I/O errors bubble up to anoncritical.
    # If that happens we will be terminated as anoncritical doesn't tream them as non-critical.
    @_wrap_io_in_warning("handling ilias element")
-    async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None:
+    async def _handle_ilias_element(
+        self,
+        parent_path: PurePath,
+        element: IliasPageElement,
+    ) -> Optional[Awaitable[None]]:
        element_path = PurePath(parent_path, element.name)

        if element.type in _VIDEO_ELEMENTS:
@@ -282,35 +308,43 @@ instance's greatest bottleneck.
            if not self._videos:
                log.explain("Video crawling is disabled")
                log.explain("Answer: no")
-                return
+                return None
            else:
                log.explain("Video crawling is enabled")
                log.explain("Answer: yes")

        if element.type == IliasElementType.FILE:
-            await self._download_file(element, element_path)
+            return await self._handle_file(element, element_path)
        elif element.type == IliasElementType.FORUM:
            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
            log.explain("Forums are not supported")
            log.explain("Answer: No")
+            return None
        elif element.type == IliasElementType.TEST:
            log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
            log.explain("Tests contain no relevant files")
            log.explain("Answer: No")
+            return None
        elif element.type == IliasElementType.LINK:
-            await self._download_link(element, element_path)
+            return await self._handle_link(element, element_path)
+        elif element.type == IliasElementType.BOOKING:
+            return await self._handle_booking(element, element_path)
        elif element.type == IliasElementType.VIDEO:
-            await self._download_file(element, element_path)
+            return await self._handle_file(element, element_path)
        elif element.type == IliasElementType.VIDEO_PLAYER:
-            await self._download_video(element, element_path)
+            return await self._handle_video(element, element_path)
        elif element.type in _DIRECTORY_PAGES:
-            await self._handle_ilias_page(element.url, element, element_path)
+            return await self._handle_ilias_page(element.url, element, element_path)
        else:
            # This will retry it a few times, failing everytime. It doesn't make any network
            # requests, so that's fine.
            raise CrawlWarning(f"Unknown element type: {element.type!r}")

-    async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None:
+    async def _handle_link(
+        self,
+        element: IliasPageElement,
+        element_path: PurePath,
+    ) -> Optional[Awaitable[None]]:
        log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
        log.explain(f"Links type is {self._links}")

@@ -318,32 +352,72 @@ instance's greatest bottleneck.
        link_extension = self._links.extension()
        if not link_template_maybe or not link_extension:
            log.explain("Answer: No")
-            return
+            return None
        else:
            log.explain("Answer: Yes")
-        link_template = link_template_maybe
        element_path = element_path.with_name(element_path.name + link_extension)

        maybe_dl = await self.download(element_path, mtime=element.mtime)
        if not maybe_dl:
-            return
-        dl = maybe_dl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
+            return None

-        @_iorepeat(3, "resolving link")
-        async def impl() -> None:
-            async with dl as (bar, sink):
-                export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
-                real_url = await self._resolve_link_target(export_url)
+        return self._download_link(element, link_template_maybe, maybe_dl)

-                content = link_template
-                content = content.replace("{{link}}", real_url)
-                content = content.replace("{{name}}", element.name)
-                content = content.replace("{{description}}", str(element.description))
-                content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
-                sink.file.write(content.encode("utf-8"))
-                sink.done()
+    @_iorepeat(3, "resolving link")
+    async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
+        async with dl as (bar, sink):
+            export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
+            real_url = await self._resolve_link_target(export_url)
+            self._write_link_content(link_template, real_url, element.name, element.description, sink)

-        await impl()
+    def _write_link_content(
+        self,
+        link_template: str,
+        url: str,
+        name: str,
+        description: Optional[str],
+        sink: FileSink,
+    ) -> None:
+        content = link_template
+        content = content.replace("{{link}}", url)
+        content = content.replace("{{name}}", name)
+        content = content.replace("{{description}}", str(description))
+        content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
+        sink.file.write(content.encode("utf-8"))
+        sink.done()
+
+    async def _handle_booking(
+        self,
+        element: IliasPageElement,
+        element_path: PurePath,
+    ) -> Optional[Awaitable[None]]:
+        log.explain_topic(f"Decision: Crawl Booking Link {fmt_path(element_path)}")
+        log.explain(f"Links type is {self._links}")
+
+        link_template_maybe = self._links.template()
+        link_extension = self._links.extension()
+        if not link_template_maybe or not link_extension:
+            log.explain("Answer: No")
+            return None
+        else:
+            log.explain("Answer: Yes")
+        element_path = element_path.with_name(element_path.name + link_extension)
+
+        maybe_dl = await self.download(element_path, mtime=element.mtime)
+        if not maybe_dl:
+            return None
+
+        return self._download_booking(element, link_template_maybe, maybe_dl)
+
+    @_iorepeat(3, "resolving booking")
+    async def _download_booking(
+        self,
+        element: IliasPageElement,
+        link_template: str,
+        dl: DownloadToken,
+    ) -> None:
+        async with dl as (bar, sink):
+            self._write_link_content(link_template, element.url, element.name, element.description, sink)

    async def _resolve_link_target(self, export_url: str) -> str:
        async with self.session.get(export_url, allow_redirects=False) as resp:
@@ -360,39 +434,43 @@ instance's greatest bottleneck.

        raise CrawlError("resolve_link_target failed even after authenticating")

-    async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None:
+    async def _handle_video(
+        self,
+        element: IliasPageElement,
+        element_path: PurePath,
+    ) -> Optional[Awaitable[None]]:
        # Videos will NOT be redownloaded - their content doesn't really change and they are chunky
        maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER)
        if not maybe_dl:
-            return
-        dl = maybe_dl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
+            return None

-        @_iorepeat(3, "downloading video")
-        async def impl() -> None:
-            assert dl  # The function is only reached when dl is not None
-            async with dl as (bar, sink):
-                page = IliasPage(await self._get_page(element.url), element.url, element)
-                real_element = page.get_child_elements()[0]
+        return self._download_video(element, maybe_dl)

-                log.explain(f"Streaming video from real url {real_element.url}")
+    @_iorepeat(3, "downloading video")
+    async def _download_video(self, element: IliasPageElement, dl: DownloadToken) -> None:
+        async with dl as (bar, sink):
+            page = IliasPage(await self._get_page(element.url), element.url, element)
+            real_element = page.get_child_elements()[0]

-                await self._stream_from_url(real_element.url, sink, bar, is_video=True)
+            log.explain(f"Streaming video from real url {real_element.url}")

-        await impl()
+            await self._stream_from_url(real_element.url, sink, bar, is_video=True)

-    async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None:
+    async def _handle_file(
+        self,
+        element: IliasPageElement,
+        element_path: PurePath,
+    ) -> Optional[Awaitable[None]]:
        maybe_dl = await self.download(element_path, mtime=element.mtime)
        if not maybe_dl:
-            return
-        dl = maybe_dl  # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
+            return None
+        return self._download_file(element, maybe_dl)

-        @_iorepeat(3, "downloading file")
-        async def impl() -> None:
-            assert dl  # The function is only reached when dl is not None
-            async with dl as (bar, sink):
-                await self._stream_from_url(element.url, sink, bar, is_video=False)
-
-        await impl()
+    @_iorepeat(3, "downloading file")
+    async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None:
+        assert dl  # The function is only reached when dl is not None
+        async with dl as (bar, sink):
+            await self._stream_from_url(element.url, sink, bar, is_video=False)

    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None:
        async def try_stream() -> bool:
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -15,13 +15,13 @@ class PferdLoadError(Exception):


 class Pferd:
-    def __init__(self, config: Config, cli_crawlers: Optional[List[str]]):
+    def __init__(self, config: Config, cli_crawlers: Optional[List[str]], cli_skips: Optional[List[str]]):
        """
        May throw PferdLoadError.
        """

        self._config = config
-        self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers)
+        self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers, cli_skips)

        self._authenticators: Dict[str, Authenticator] = {}
        self._crawlers: Dict[str, Crawler] = {}
@@ -65,16 +65,30 @@ class Pferd:

        return crawlers_to_run

-    def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]:
+    def _find_crawlers_to_run(
+            self,
+            config: Config,
+            cli_crawlers: Optional[List[str]],
+            cli_skips: Optional[List[str]],
+    ) -> List[str]:
        log.explain_topic("Deciding which crawlers to run")

+        crawlers: List[str]
        if cli_crawlers is None:
            log.explain("No crawlers specified on CLI")
            log.explain("Running crawlers specified in config")
-            return self._find_config_crawlers(config)
+            crawlers = self._find_config_crawlers(config)
        else:
            log.explain("Crawlers specified on CLI")
-            return self._find_cli_crawlers(config, cli_crawlers)
+            crawlers = self._find_cli_crawlers(config, cli_crawlers)
+
+        skips = {f"crawl:{name}" for name in cli_skips} if cli_skips else set()
+        for crawler in crawlers:
+            if crawler in skips:
+                log.explain(f"Skipping crawler {crawler!r}")
+        crawlers = [crawler for crawler in crawlers if crawler not in skips]
+
+        return crawlers

    def _load_authenticators(self) -> None:
        for name, section in self._config.auth_sections():
--- a/PFERD/transformer.py
+++ b/PFERD/transformer.py
@@ -41,9 +41,11 @@ TransformResult = Optional[Union[Transformed, Ignored]]
 @dataclass
 class Rule:
    left: str
+    left_index: int
    name: str
    head: ArrowHead
    right: RightSide
+    right_index: int

    def right_result(self, path: PurePath) -> Union[str, Transformed, Ignored]:
        if isinstance(self.right, str):
@@ -345,6 +347,7 @@ def parse_eol(line: Line) -> None:

 def parse_rule(line: Line) -> Rule:
    parse_zero_or_more_spaces(line)
+    left_index = line.index
    left = parse_left(line)

    parse_one_or_more_spaces(line)
@@ -354,19 +357,19 @@ def parse_rule(line: Line) -> Rule:
    line.expect("-")
    head = parse_arrow_head(line)

-    index = line.index
+    right_index = line.index
    right: RightSide
    try:
        parse_zero_or_more_spaces(line)
        parse_eol(line)
        right = Empty()
    except RuleParseError:
-        line.index = index
+        line.index = right_index
        parse_one_or_more_spaces(line)
        right = parse_right(line)
        parse_eol(line)

-    return Rule(left, name, head, right)
+    return Rule(left, left_index, name, head, right, right_index)


 def parse_transformation(line: Line) -> Transformation:
@@ -377,6 +380,9 @@ def parse_transformation(line: Line) -> Transformation:
    elif rule.name == "exact":
        return ExactTf(rule)
    elif rule.name == "name":
+        if len(PurePath(rule.left).parts) > 1:
+            line.index = rule.left_index
+            raise RuleParseError(line, "Expected name, not multiple segments")
        return RenamingPartsTf(ExactTf(rule))
    elif rule.name == "re":
        return RenamingParentsTf(ExactReTf(rule))
--- a/PFERD/version.py
+++ b/PFERD/version.py
@@ -1,2 +1,2 @@
 NAME = "PFERD"
-VERSION = "3.1.0"
+VERSION = "3.2.0"
Author	SHA1	Message	Date
Joscha	742632ed8d	Bump version to 3.2.0	2021-08-04 18:27:26 +00:00
Joscha	544d45cbc5	Catch non-critical exceptions at crawler top level	2021-07-13 15:42:11 +02:00
Joscha	86f79ff1f1	Update changelog	2021-07-07 15:23:58 +02:00
I-Al-Istannen	ee67f9f472	Sort elements by ILIAS id to ensure deterministic ordering	2021-07-06 17:45:48 +02:00
I-Al-Istannen	8ec3f41251	Crawl ilias booking objects as links	2021-07-06 16:15:25 +02:00
I-Al-Istannen	89be07d4d3	Use final crawl path in HTML parsing message	2021-07-03 17:05:48 +02:00
I-Al-Istannen	91200f3684	Fix nondeterministic name deduplication	2021-07-03 12:09:55 +02:00
Joscha	9ffd603357	Error when using multiple segments with `-name->`. Previously, PFERD just silently never matched the `-name->` arrow. Now, it errors when loading the config file.	2021-07-01 11:14:50 +02:00
Joscha	80eeb8fe97	Add --skip option	2021-07-01 11:02:21 +02:00